From e0b7eeabfd0f997d41964d1301900a6b277b4146 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Fri, 6 Oct 2023 03:50:49 +0900 Subject: [PATCH] Fix(tokenizer): Set rstrip,lstrip,norm to False (#678) --- src/axolotl/utils/models.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 6c8e7b8f0..7ef22cf5f 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -11,6 +11,7 @@ from optimum.bettertransformer import BetterTransformer from peft import PeftConfig, prepare_model_for_kbit_training from peft.tuners.lora import QuantLinear from transformers import ( # noqa: F401 + AddedToken, AutoConfig, AutoModelForCausalLM, AutoTokenizer, @@ -82,9 +83,16 @@ def load_tokenizer(cfg): if cfg.special_tokens: for k, val in cfg.special_tokens.items(): - tokenizer.add_special_tokens({k: val}) + tokenizer.add_special_tokens( + {k: AddedToken(val, rstrip=False, lstrip=False, normalized=False)} + ) if cfg.tokens: - tokenizer.add_tokens(list(cfg.tokens)) + tokenizer.add_tokens( + [ + AddedToken(token, rstrip=False, lstrip=False, normalized=False) + for token in cfg.tokens + ] + ) return tokenizer