improve llama pad token handling (#475)

* improve llama pad token handling

* tweak logic to not clobber
This commit is contained in:
Wing Lian
2023-08-24 13:20:35 -04:00
committed by GitHub
parent bde3c5a478
commit cb9797ef5a
4 changed files with 10 additions and 8 deletions

View File

@@ -57,7 +57,7 @@ weight_decay: 0.0001
fsdp:
fsdp_config:
tokens:
pad_token: "[PAD]"
pad_token: "<pad>"
bos_token: "<s>"
eos_token: "</s>"
unk_token: "<unk>"

View File

@@ -13,7 +13,7 @@ from axolotl.prompters import IGNORE_TOKEN_ID
LOG = logging.getLogger("axolotl")
IGNORE_INDEX = -100
LLAMA_DEFAULT_PAD_TOKEN = "[PAD]" # nosec
LLAMA_DEFAULT_PAD_TOKEN = "<pad>" # nosec
LLAMA_DEFAULT_EOS_TOKEN = "</s>" # nosec
LLAMA_DEFAULT_BOS_TOKEN = "<s>" # nosec
LLAMA_DEFAULT_UNK_TOKEN = "<unk>" # nosec

View File

@@ -54,9 +54,10 @@ DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"
def prepare_dataset(cfg, tokenizer):
if not cfg.pretraining_dataset:
train_dataset, eval_dataset = load_prepare_datasets(
tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
)
with zero_first(is_main_process()):
train_dataset, eval_dataset = load_prepare_datasets(
tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
)
else:
train_dataset = load_pretraining_dataset(
cfg.pretraining_dataset,

View File

@@ -22,7 +22,7 @@ from transformers import ( # noqa: F401
PreTrainedTokenizerBase,
)
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
from axolotl.utils.bench import log_gpu_memory_usage
LOG = logging.getLogger("axolotl")
@@ -58,8 +58,9 @@ def load_tokenizer(cfg):
if tokenizer.__class__.__name__ in [
"LlamaTokenizer",
"LlamaTokenizerFast",
]:
tokenizer.pad_token = LLAMA_DEFAULT_PAD_TOKEN
] and not hasattr(tokenizer, "pad_token"):
# set a pad_token, but use eos_token so we don't add a new token
tokenizer.pad_token = LLAMA_DEFAULT_EOS_TOKEN
LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")