diff --git a/src/axolotl/prompt_strategies/pretrain.py b/src/axolotl/prompt_strategies/pretrain.py
index ecf3c2d62..893cd0233 100644
--- a/src/axolotl/prompt_strategies/pretrain.py
+++ b/src/axolotl/prompt_strategies/pretrain.py
@@ -41,6 +41,7 @@ class PretrainTokenizationStrategy(PromptTokenizingStrategy):
             seq + [self.tokenizer.eos_token_id] for seq in res["input_ids"]
         ]
         res["attention_mask"] = [seq + [1] for seq in res["attention_mask"]]
+        res["labels"] = res["input_ids"].copy()
 
         return res
 
@@ -52,7 +53,7 @@ def load(tokenizer, cfg):
     if cfg.pretraining_dataset:
         cfg_ds = cfg.pretraining_dataset
     else:
-        cfg_ds = cfg.dataset
+        cfg_ds = cfg.datasets
     strat = PretrainTokenizationStrategy(
         PretrainTokenizer(),
         tokenizer,