From 9722aaf7d85e2406f300f491053f6f591fb549a8 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 19 Feb 2026 21:52:44 -0500 Subject: [PATCH] fix for tokenizers change --- tests/prompt_strategies/test_chat_templates.py | 11 +++++++---- tests/test_tokenizers.py | 3 ++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/prompt_strategies/test_chat_templates.py b/tests/prompt_strategies/test_chat_templates.py index 90e0e274b..f5d9d6f54 100644 --- a/tests/prompt_strategies/test_chat_templates.py +++ b/tests/prompt_strategies/test_chat_templates.py @@ -115,6 +115,9 @@ class TestAssistantChatTemplateLlama3: def test_phi35(self, phi35_tokenizer, assistant_dataset): LOG.info("Testing phi-3.5 with assistant dataset") + assert "LlamaTokenizer" in phi35_tokenizer.__class__.__name__, ( + "phi35 tokenizer should be a LlamaTokenizer" + ) strategy = ChatTemplateStrategy( ChatTemplatePrompter( phi35_tokenizer, @@ -140,13 +143,13 @@ class TestAssistantChatTemplateLlama3: # fmt: off expected_input_ids = [ 32010, # user - 22172, 32007, # user eot + 12199, 32007, # user eot 32001, # assistant - 22172, 32007, # assistant eot + 12199, 32007, # assistant eot 32010, # user - 1781, 26966, 32007, # user eot + 16773, 26966, 32007, # user eot 32001, # assistant - 1781, 26966, 32007, # assistant eot + 16773, 26966, 32007, # assistant eot ] expected_labels = [ -100, # user diff --git a/tests/test_tokenizers.py b/tests/test_tokenizers.py index 114c2bea2..17f50f2d9 100644 --- a/tests/test_tokenizers.py +++ b/tests/test_tokenizers.py @@ -84,7 +84,8 @@ class TestTokenizers: } ) tokenizer = load_tokenizer(cfg) - assert tokenizer("<|im_start|>user")["input_ids"] == [1, 32000, 1404] + assert "LlamaTokenizer" in tokenizer.__class__.__name__ + assert tokenizer("<|im_start|>user")["input_ids"] == [1, 32000, 1792] assert len(tokenizer) == 32001 # ensure reloading the tokenizer again from cfg results in same vocab length