Fix ChatML system prompt format for OpenOrca; add legacy tokenizer option
This commit is contained in:
@@ -102,7 +102,7 @@ class OpenOrcaSystemDataPrompter(SystemDataPrompter):
|
|||||||
self.turn_no_input_format = (
|
self.turn_no_input_format = (
|
||||||
"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"
|
"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"
|
||||||
)
|
)
|
||||||
self.system_format = "<|im_start|>{system}<|im_end|>\n"
|
self.system_format = "<|im_start|>system\n{system}<|im_end|>\n"
|
||||||
|
|
||||||
|
|
||||||
class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy):
|
class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy):
|
||||||
|
|||||||
@@ -154,6 +154,7 @@ class MultipackDistributedDataloader:
|
|||||||
self.packing_efficiency_estimate = packing_efficiency_estimate or 1.0
|
self.packing_efficiency_estimate = packing_efficiency_estimate or 1.0
|
||||||
|
|
||||||
def generate_batches(self, set_stats=False):
|
def generate_batches(self, set_stats=False):
|
||||||
|
LOG.info("generating packed batches")
|
||||||
if self.sampler:
|
if self.sampler:
|
||||||
indices = [idx for idx in self.sampler]
|
indices = [idx for idx in self.sampler]
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -36,20 +36,26 @@ def load_tokenizer(
|
|||||||
tokenizer_type,
|
tokenizer_type,
|
||||||
cfg,
|
cfg,
|
||||||
):
|
):
|
||||||
|
tokenizer_kwargs = {}
|
||||||
use_fast = True # this is the default
|
use_fast = True # this is the default
|
||||||
if cfg.tokenizer_use_fast is not None:
|
if cfg.tokenizer_use_fast is not None:
|
||||||
use_fast = cfg.tokenizer_use_fast
|
use_fast = cfg.tokenizer_use_fast
|
||||||
|
if cfg.tokenizer_legacy is not None:
|
||||||
|
# True is the default w/ https://github.com/huggingface/transformers/pull/25224
|
||||||
|
tokenizer_kwargs["legacy"] = cfg.tokenizer_legacy
|
||||||
if tokenizer_type:
|
if tokenizer_type:
|
||||||
tokenizer = getattr(transformers, tokenizer_type).from_pretrained(
|
tokenizer = getattr(transformers, tokenizer_type).from_pretrained(
|
||||||
tokenizer_config,
|
tokenizer_config,
|
||||||
trust_remote_code=cfg.trust_remote_code or False,
|
trust_remote_code=cfg.trust_remote_code or False,
|
||||||
use_fast=use_fast,
|
use_fast=use_fast,
|
||||||
|
**tokenizer_kwargs,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
tokenizer = AutoTokenizer.from_pretrained(
|
tokenizer = AutoTokenizer.from_pretrained(
|
||||||
tokenizer_config,
|
tokenizer_config,
|
||||||
trust_remote_code=cfg.trust_remote_code or False,
|
trust_remote_code=cfg.trust_remote_code or False,
|
||||||
use_fast=use_fast,
|
use_fast=use_fast,
|
||||||
|
**tokenizer_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
|
LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
|
||||||
|
|||||||
Reference in New Issue
Block a user