From cb9797ef5a069000d064d5b678e23ea023a535e4 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 24 Aug 2023 13:20:35 -0400 Subject: [PATCH] improve llama pad token handling (#475) * improve llama pad token handling * tweak logic to not clobber --- examples/gptq-lora-7b/config.yml | 2 +- src/axolotl/prompt_tokenizers.py | 2 +- src/axolotl/utils/data.py | 7 ++++--- src/axolotl/utils/models.py | 7 ++++--- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/examples/gptq-lora-7b/config.yml b/examples/gptq-lora-7b/config.yml index d5fbe3f13..d909f7d07 100644 --- a/examples/gptq-lora-7b/config.yml +++ b/examples/gptq-lora-7b/config.yml @@ -57,7 +57,7 @@ weight_decay: 0.0001 fsdp: fsdp_config: tokens: - pad_token: "[PAD]" + pad_token: "<pad>" bos_token: "<s>" eos_token: "</s>" unk_token: "<unk>" diff --git a/src/axolotl/prompt_tokenizers.py b/src/axolotl/prompt_tokenizers.py index 9bdd5644a..ed32ab24a 100644 --- a/src/axolotl/prompt_tokenizers.py +++ b/src/axolotl/prompt_tokenizers.py @@ -13,7 +13,7 @@ from axolotl.prompters import IGNORE_TOKEN_ID LOG = logging.getLogger("axolotl") IGNORE_INDEX = -100 -LLAMA_DEFAULT_PAD_TOKEN = "[PAD]" # nosec +LLAMA_DEFAULT_PAD_TOKEN = "<pad>" # nosec LLAMA_DEFAULT_EOS_TOKEN = "</s>" # nosec LLAMA_DEFAULT_BOS_TOKEN = "<s>" # nosec LLAMA_DEFAULT_UNK_TOKEN = "<unk>" # nosec diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index f6a722a82..b801e6a57 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -54,9 +54,10 @@ DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared" def prepare_dataset(cfg, tokenizer): if not cfg.pretraining_dataset: - train_dataset, eval_dataset = load_prepare_datasets( - tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH - ) + with zero_first(is_main_process()): + train_dataset, eval_dataset = load_prepare_datasets( + tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH + ) else: train_dataset = load_pretraining_dataset( cfg.pretraining_dataset, diff --git a/src/axolotl/utils/models.py 
b/src/axolotl/utils/models.py index 522ab3cb4..4fad740c5 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -22,7 +22,7 @@ from transformers import ( # noqa: F401 PreTrainedTokenizerBase, ) -from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN +from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN from axolotl.utils.bench import log_gpu_memory_usage LOG = logging.getLogger("axolotl") @@ -58,8 +58,9 @@ def load_tokenizer(cfg): if tokenizer.__class__.__name__ in [ "LlamaTokenizer", "LlamaTokenizerFast", - ]: - tokenizer.pad_token = LLAMA_DEFAULT_PAD_TOKEN + ] and not hasattr(tokenizer, "pad_token"): + # set a pad_token, but use eos_token so we don't add a new token + tokenizer.pad_token = LLAMA_DEFAULT_EOS_TOKEN LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}") LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")