From 7712955b35eec8e9f5d6d2313c7ab64a57e13303 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Fri, 4 Aug 2023 13:57:17 -0400
Subject: [PATCH] fix chatml system prompt for openorca, legacy tokenizer opts

---
 src/axolotl/prompt_strategies/alpaca_w_system.py | 2 +-
 src/axolotl/utils/dataloader.py                  | 1 +
 src/axolotl/utils/models.py                      | 6 ++++++
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/axolotl/prompt_strategies/alpaca_w_system.py b/src/axolotl/prompt_strategies/alpaca_w_system.py
index 8875ec7bc..134b446fd 100644
--- a/src/axolotl/prompt_strategies/alpaca_w_system.py
+++ b/src/axolotl/prompt_strategies/alpaca_w_system.py
@@ -102,7 +102,7 @@ class OpenOrcaSystemDataPrompter(SystemDataPrompter):
         self.turn_no_input_format = (
             "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"
         )
-        self.system_format = "<|im_start|>{system}<|im_end|>\n"
+        self.system_format = "<|im_start|>system\n{system}<|im_end|>\n"
 
 
 class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy):
diff --git a/src/axolotl/utils/dataloader.py b/src/axolotl/utils/dataloader.py
index fbd22eb57..f4c18c604 100644
--- a/src/axolotl/utils/dataloader.py
+++ b/src/axolotl/utils/dataloader.py
@@ -154,6 +154,7 @@ class MultipackDistributedDataloader:
         self.packing_efficiency_estimate = packing_efficiency_estimate or 1.0
 
     def generate_batches(self, set_stats=False):
+        LOG.info("generating packed batches")
         if self.sampler:
             indices = [idx for idx in self.sampler]
         else:
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index 227960e6e..b770bb47c 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -36,20 +36,26 @@ def load_tokenizer(
     tokenizer_type,
     cfg,
 ):
+    tokenizer_kwargs = {}
     use_fast = True  # this is the default
     if cfg.tokenizer_use_fast is not None:
         use_fast = cfg.tokenizer_use_fast
+    if cfg.tokenizer_legacy is not None:
+        # True is the default w/ https://github.com/huggingface/transformers/pull/25224
+        tokenizer_kwargs["legacy"] = cfg.tokenizer_legacy
     if tokenizer_type:
         tokenizer = getattr(transformers, tokenizer_type).from_pretrained(
             tokenizer_config,
             trust_remote_code=cfg.trust_remote_code or False,
             use_fast=use_fast,
+            **tokenizer_kwargs,
         )
     else:
         tokenizer = AutoTokenizer.from_pretrained(
             tokenizer_config,
             trust_remote_code=cfg.trust_remote_code or False,
             use_fast=use_fast,
+            **tokenizer_kwargs,
         )
 
     LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
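
Usage sketch: with this change the tokenizer behavior can be steered from the
config values read off `cfg` above. The key names (`tokenizer_use_fast`,
`tokenizer_legacy`, `trust_remote_code`) come from this patch; the YAML layout
and values below are only illustrative:

    tokenizer_use_fast: true
    # forwarded to the tokenizer constructor as `legacy=`; leaving it unset keeps
    # the transformers default (True, per the PR linked in the diff)
    tokenizer_legacy: false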