Fix the ChatML system prompt for OpenOrca (add the missing `system` role header) and add a `tokenizer_legacy` option that is passed through to the tokenizer

This commit is contained in:
Wing Lian
2023-08-04 13:57:17 -04:00
parent f93f0017cd
commit 7712955b35
3 changed files with 8 additions and 1 deletion

View File

@@ -102,7 +102,7 @@ class OpenOrcaSystemDataPrompter(SystemDataPrompter):
self.turn_no_input_format = ( self.turn_no_input_format = (
"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n" "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"
) )
self.system_format = "<|im_start|>{system}<|im_end|>\n" self.system_format = "<|im_start|>system\n{system}<|im_end|>\n"
class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy): class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy):

View File

@@ -154,6 +154,7 @@ class MultipackDistributedDataloader:
self.packing_efficiency_estimate = packing_efficiency_estimate or 1.0 self.packing_efficiency_estimate = packing_efficiency_estimate or 1.0
def generate_batches(self, set_stats=False): def generate_batches(self, set_stats=False):
LOG.info("generating packed batches")
if self.sampler: if self.sampler:
indices = [idx for idx in self.sampler] indices = [idx for idx in self.sampler]
else: else:

View File

@@ -36,20 +36,26 @@ def load_tokenizer(
tokenizer_type, tokenizer_type,
cfg, cfg,
): ):
tokenizer_kwargs = {}
use_fast = True # this is the default use_fast = True # this is the default
if cfg.tokenizer_use_fast is not None: if cfg.tokenizer_use_fast is not None:
use_fast = cfg.tokenizer_use_fast use_fast = cfg.tokenizer_use_fast
if cfg.tokenizer_legacy is not None:
# True is the default w/ https://github.com/huggingface/transformers/pull/25224
tokenizer_kwargs["legacy"] = cfg.tokenizer_legacy
if tokenizer_type: if tokenizer_type:
tokenizer = getattr(transformers, tokenizer_type).from_pretrained( tokenizer = getattr(transformers, tokenizer_type).from_pretrained(
tokenizer_config, tokenizer_config,
trust_remote_code=cfg.trust_remote_code or False, trust_remote_code=cfg.trust_remote_code or False,
use_fast=use_fast, use_fast=use_fast,
**tokenizer_kwargs,
) )
else: else:
tokenizer = AutoTokenizer.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(
tokenizer_config, tokenizer_config,
trust_remote_code=cfg.trust_remote_code or False, trust_remote_code=cfg.trust_remote_code or False,
use_fast=use_fast, use_fast=use_fast,
**tokenizer_kwargs,
) )
LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}") LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")