Fix tests for tokenizers change: update expected token ids and assert the tokenizer class is a LlamaTokenizer

This commit is contained in:
Wing Lian
2026-02-19 21:52:44 -05:00
parent c5d20bbd79
commit 9722aaf7d8
2 changed files with 9 additions and 5 deletions

View File

@@ -115,6 +115,9 @@ class TestAssistantChatTemplateLlama3:
def test_phi35(self, phi35_tokenizer, assistant_dataset): def test_phi35(self, phi35_tokenizer, assistant_dataset):
LOG.info("Testing phi-3.5 with assistant dataset") LOG.info("Testing phi-3.5 with assistant dataset")
assert "LlamaTokenizer" in phi35_tokenizer.__class__.__name__, (
"phi35 tokenizer should be a LlamaTokenizer"
)
strategy = ChatTemplateStrategy( strategy = ChatTemplateStrategy(
ChatTemplatePrompter( ChatTemplatePrompter(
phi35_tokenizer, phi35_tokenizer,
@@ -140,13 +143,13 @@ class TestAssistantChatTemplateLlama3:
# fmt: off # fmt: off
expected_input_ids = [ expected_input_ids = [
32010, # user 32010, # user
22172, 32007, # user eot 12199, 32007, # user eot
32001, # assistant 32001, # assistant
22172, 32007, # assistant eot 12199, 32007, # assistant eot
32010, # user 32010, # user
1781, 26966, 32007, # user eot 16773, 26966, 32007, # user eot
32001, # assistant 32001, # assistant
1781, 26966, 32007, # assistant eot 16773, 26966, 32007, # assistant eot
] ]
expected_labels = [ expected_labels = [
-100, # user -100, # user

View File

@@ -84,7 +84,8 @@ class TestTokenizers:
} }
) )
tokenizer = load_tokenizer(cfg) tokenizer = load_tokenizer(cfg)
assert tokenizer("<|im_start|>user")["input_ids"] == [1, 32000, 1404] assert "LlamaTokenizer" in tokenizer.__class__.__name__
assert tokenizer("<|im_start|>user")["input_ids"] == [1, 32000, 1792]
assert len(tokenizer) == 32001 assert len(tokenizer) == 32001
# ensure reloading the tokenizer again from cfg results in same vocab length # ensure reloading the tokenizer again from cfg results in same vocab length