From 934fc851da4a77fb3c56187681e2802237f3ff49 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sat, 6 Apr 2024 19:55:19 -0700
Subject: [PATCH] drop empty token from beginning if tokenizer has no
 bos_token (in the case of qwen) (#1490)

---
 src/axolotl/core/trainer_builder.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/axolotl/core/trainer_builder.py b/src/axolotl/core/trainer_builder.py
index cc7275184..8d08d60b3 100644
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -23,6 +23,7 @@ from torch.optim.lr_scheduler import OneCycleLR
 from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler
 from transformers import (
     EarlyStoppingCallback,
+    PreTrainedModel,
     Trainer,
     TrainerCallback,
     TrainingArguments,
@@ -802,6 +803,15 @@ class AxolotlDPOTrainer(DPOTrainer):
 
         return super().push_to_hub(*args, **kwargs)
 
+    def tokenize_row(
+        self, feature, model: Optional[Union[PreTrainedModel, torch.nn.Module]] = None
+    ) -> Dict:
+        res = super().tokenize_row(feature, model=model)
+        if self.tokenizer.bos_token_id is None and res["prompt_input_ids"][0] is None:
+            for key in res.keys():
+                res[key] = res[key][1:]
+        return res
+
 
 class TrainerBuilderBase(abc.ABC):
     """