Support for additional_special_tokens (#1221) [skip ci]
* Support for additional_special_tokens * Support for additional_special_tokens. Adjust whitespace. * Support for additional_special_tokens. Use correct quotes. * Support for additional_special_tokens. Safe pop. * Support for additional_special_tokens. nt. * Support for additional_special_tokens. cfg.special_tokens may be None. * add token if not in vocabulary when adding additional_special_tokens * fix logic for copy/pasta * bugfix for popping from config and tokenizer reload * no need to add tokens manually now with previous bugfix --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>
This commit is contained in:
@@ -67,6 +67,21 @@ class TestTokenizers(unittest.TestCase):
|
||||
)
|
||||
load_tokenizer(cfg)
|
||||
|
||||
def test_add_additional_special_tokens(self):
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"tokenizer_config": "huggyllama/llama-7b",
|
||||
"special_tokens": {"additional_special_tokens": ["<|im_start|>"]},
|
||||
}
|
||||
)
|
||||
tokenizer = load_tokenizer(cfg)
|
||||
self.assertEqual(tokenizer("<|im_start|>user")["input_ids"], [1, 32000, 1404])
|
||||
self.assertEqual(len(tokenizer), 32001)
|
||||
|
||||
# ensure reloading the tokenizer again from cfg results in same vocab length
|
||||
tokenizer = load_tokenizer(cfg)
|
||||
self.assertEqual(len(tokenizer), 32001)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user