Feat: Raise an error asking to set modules_to_save when adding tokens or switching special_tokens (#787)

* Feat: Auto add to modules_to_save when adding tokens

* fix: swap to error instead of warning

* feat: add check when special_tokens differ and add test
This commit is contained in:
NanoCode012
2023-12-22 21:49:07 +09:00
committed by GitHub
parent 62ba1609b6
commit 1ffa3866f2
4 changed files with 104 additions and 0 deletions

View File

@@ -3,6 +3,8 @@ Test cases for the tokenizer loading
"""
import unittest
import pytest
from axolotl.utils.dict import DictDefault
from axolotl.utils.models import load_tokenizer
@@ -31,6 +33,40 @@ class TestTokenizers(unittest.TestCase):
tokenizer = load_tokenizer(cfg)
assert "Fast" not in tokenizer.__class__.__name__
def test_special_tokens_modules_to_save(self):
    """
    Check how load_tokenizer reacts to special_tokens overrides.

    Three configurations are exercised:
    - an adapter is set and a special token is changed to a new value:
      a ValueError asking to set lora_modules_to_save is expected;
    - an adapter is set and the special token is given its default value:
      loading is expected to succeed;
    - no adapter is set and a special token is changed:
      loading is expected to succeed.
    """
    # pytest.raises(match=...) applies re.search, so no leading ".*" is
    # required. The previous pattern ended in "save*", where the "*" only
    # quantified the final "e" instead of acting as a wildcard.
    expected_error = r"Please set lora_modules_to_save"

    # setting special_tokens to new token
    cfg = DictDefault(
        {
            "tokenizer_config": "huggyllama/llama-7b",
            "adapter": "lora",
            "special_tokens": {"bos_token": "[INST]"},
        }
    )
    with pytest.raises(ValueError, match=expected_error):
        load_tokenizer(cfg)

    # setting special_tokens but not changing from default
    cfg = DictDefault(
        {
            "tokenizer_config": "huggyllama/llama-7b",
            "adapter": "lora",
            "special_tokens": {"bos_token": "<s>"},
        }
    )
    load_tokenizer(cfg)

    # non-adapter setting special_tokens
    cfg = DictDefault(
        {
            "tokenizer_config": "huggyllama/llama-7b",
            "special_tokens": {"bos_token": "[INST]"},
        }
    )
    load_tokenizer(cfg)
# Run the unittest runner when this file is executed directly as a script.
if __name__ == "__main__":
unittest.main()

View File

@@ -682,6 +682,43 @@ class ValidationTest(unittest.TestCase):
validate_config(cfg)
def test_add_tokens_adapter(self):
    """
    Check validate_config when new tokens are added alongside an adapter.

    With a qlora adapter and a non-empty "tokens" list, validation is
    expected to raise a ValueError unless lora_modules_to_save contains
    both "embed_tokens" and "lm_head".
    """
    # pytest.raises(match=...) applies re.search, so no leading ".*" is
    # required. The previous pattern ended in "tokens*", where the "*" only
    # quantified the final "s" instead of acting as a wildcard.
    expected_error = r"lora_modules_to_save not properly set yet adding new tokens"

    # lora_modules_to_save is missing entirely
    cfg = DictDefault(
        {"adapter": "qlora", "load_in_4bit": True, "tokens": ["<|imstart|>"]}
    )
    with pytest.raises(ValueError, match=expected_error):
        validate_config(cfg)

    # lora_modules_to_save is set but lacks "lm_head"
    cfg = DictDefault(
        {
            "adapter": "qlora",
            "load_in_4bit": True,
            "tokens": ["<|imstart|>"],
            "lora_modules_to_save": ["embed_tokens"],
        }
    )
    with pytest.raises(ValueError, match=expected_error):
        validate_config(cfg)

    # both required modules are listed: validation must pass
    cfg = DictDefault(
        {
            "adapter": "qlora",
            "load_in_4bit": True,
            "tokens": ["<|imstart|>"],
            "lora_modules_to_save": ["embed_tokens", "lm_head"],
        }
    )
    validate_config(cfg)
class ValidationWandbTest(ValidationTest):
"""