fix expected token ids in tests after tokenizers change

This commit is contained in:
Wing Lian
2026-02-19 21:52:44 -05:00
parent c5d20bbd79
commit 9722aaf7d8
2 changed files with 9 additions and 5 deletions

View File

@@ -115,6 +115,9 @@ class TestAssistantChatTemplateLlama3:
def test_phi35(self, phi35_tokenizer, assistant_dataset):
LOG.info("Testing phi-3.5 with assistant dataset")
assert "LlamaTokenizer" in phi35_tokenizer.__class__.__name__, (
"phi35 tokenizer should be a LlamaTokenizer"
)
strategy = ChatTemplateStrategy(
ChatTemplatePrompter(
phi35_tokenizer,
@@ -140,13 +143,13 @@ class TestAssistantChatTemplateLlama3:
# fmt: off
expected_input_ids = [
32010, # user
22172, 32007, # user eot
12199, 32007, # user eot
32001, # assistant
22172, 32007, # assistant eot
12199, 32007, # assistant eot
32010, # user
1781, 26966, 32007, # user eot
16773, 26966, 32007, # user eot
32001, # assistant
1781, 26966, 32007, # assistant eot
16773, 26966, 32007, # assistant eot
]
expected_labels = [
-100, # user

View File

@@ -84,7 +84,8 @@ class TestTokenizers:
}
)
tokenizer = load_tokenizer(cfg)
assert tokenizer("<|im_start|>user")["input_ids"] == [1, 32000, 1404]
assert "LlamaTokenizer" in tokenizer.__class__.__name__
assert tokenizer("<|im_start|>user")["input_ids"] == [1, 32000, 1792]
assert len(tokenizer) == 32001
# ensure reloading the tokenizer again from cfg results in same vocab length