From e029ab34ea805c71a59aec372fc1a0930a0d072e Mon Sep 17 00:00:00 2001
From: Aman Karmani
Date: Sun, 13 Aug 2023 01:30:54 +0000
Subject: [PATCH] quiet noise from llama tokenizer by setting pad token earlier

---
 src/axolotl/utils/models.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index cfd85f9e5..2f672433d 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -59,17 +59,17 @@ def load_tokenizer(
         **tokenizer_kwargs,
     )
 
-    LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
-    LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
-    LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
-    LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
-
     if tokenizer.__class__.__name__ in [
         "LlamaTokenizer",
         "LlamaTokenizerFast",
     ]:
         tokenizer.pad_token = LLAMA_DEFAULT_PAD_TOKEN
 
+    LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
+    LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
+    LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
+    LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
+
     if tokenizer.__class__.__name__ == "GPTNeoXTokenizerFast":
         tokenizer.add_special_tokens({"pad_token": "[PAD]"})
         os.environ["TOKENIZERS_PARALLELISM"] = "false"