fix for tokenizers change

This commit is contained in:
Wing Lian
2026-02-19 21:52:44 -05:00
parent c5d20bbd79
commit 9722aaf7d8
2 changed files with 9 additions and 5 deletions

View File

@@ -84,7 +84,8 @@ class TestTokenizers:
}
)
tokenizer = load_tokenizer(cfg)
- assert tokenizer("<|im_start|>user")["input_ids"] == [1, 32000, 1404]
+ assert "LlamaTokenizer" in tokenizer.__class__.__name__
+ assert tokenizer("<|im_start|>user")["input_ids"] == [1, 32000, 1792]
assert len(tokenizer) == 32001
# ensure reloading the tokenizer again from cfg results in same vocab length