Feat: Warns to add to modules_to_save when adding tokens or switching special_tokens (#787)

* Feat: Auto add to modules_to_save when adding tokens
* fix: swap to error instead of warning
* feat: add check when special_tokens differ and add test
2023-12-22 21:49:07 +09:00
parent 62ba1609b6
commit 1ffa3866f2
4 changed files with 104 additions and 0 deletions
--- a/tests/test_tokenizers.py
+++ b/tests/test_tokenizers.py
@@ -3,6 +3,8 @@ Test cases for the tokenizer loading
 """
 import unittest

+import pytest
+
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_tokenizer

@@ -31,6 +33,40 @@ class TestTokenizers(unittest.TestCase):
        tokenizer = load_tokenizer(cfg)
        assert "Fast" not in tokenizer.__class__.__name__

+    def test_special_tokens_modules_to_save(self):
+        # setting special_tokens to new token
+        cfg = DictDefault(
+            {
+                "tokenizer_config": "huggyllama/llama-7b",
+                "adapter": "lora",
+                "special_tokens": {"bos_token": "[INST]"},
+            }
+        )
+        with pytest.raises(
+            ValueError,
+            match=r".*Please set lora_modules_to_save*",
+        ):
+            load_tokenizer(cfg)
+
+        # setting special_tokens but not changing from default
+        cfg = DictDefault(
+            {
+                "tokenizer_config": "huggyllama/llama-7b",
+                "adapter": "lora",
+                "special_tokens": {"bos_token": "<s>"},
+            }
+        )
+        load_tokenizer(cfg)
+
+        # non-adapter setting special_tokens
+        cfg = DictDefault(
+            {
+                "tokenizer_config": "huggyllama/llama-7b",
+                "special_tokens": {"bos_token": "[INST]"},
+            }
+        )
+        load_tokenizer(cfg)
+

 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -682,6 +682,43 @@ class ValidationTest(unittest.TestCase):

        validate_config(cfg)

+    def test_add_tokens_adapter(self):
+        cfg = DictDefault(
+            {"adapter": "qlora", "load_in_4bit": True, "tokens": ["<|imstart|>"]}
+        )
+
+        with pytest.raises(
+            ValueError,
+            match=r".*lora_modules_to_save not properly set yet adding new tokens*",
+        ):
+            validate_config(cfg)
+
+        cfg = DictDefault(
+            {
+                "adapter": "qlora",
+                "load_in_4bit": True,
+                "tokens": ["<|imstart|>"],
+                "lora_modules_to_save": ["embed_tokens"],
+            }
+        )
+
+        with pytest.raises(
+            ValueError,
+            match=r".*lora_modules_to_save not properly set yet adding new tokens*",
+        ):
+            validate_config(cfg)
+
+        cfg = DictDefault(
+            {
+                "adapter": "qlora",
+                "load_in_4bit": True,
+                "tokens": ["<|imstart|>"],
+                "lora_modules_to_save": ["embed_tokens", "lm_head"],
+            }
+        )
+
+        validate_config(cfg)
+

 class ValidationWandbTest(ValidationTest):
    """