wip add new proposed message structure (#1904)
* wip add new proposed message structure * tokenization * wip * wip transform builder * wip make the chat dataset loadable * wip chatml + llama 3 new chat objects * chore: lint * chore: lint * fix tokenization * remove dacite dependency since we're using pydantic now * fix handling when already correctly split in messages * make sure to remove chat features from tokenized ds * move chat to be a input transform for messages * make sure llama3 has the bos token * remove non-working special token code * fix messages strat loader
This commit is contained in:
@@ -23,6 +23,7 @@ from axolotl.prompt_tokenizers import (
|
||||
AlpacaMultipleChoicePromptTokenizingStrategy,
|
||||
AlpacaPromptTokenizingStrategy,
|
||||
AlpacaReflectionPTStrategy,
|
||||
DatasetWrappingStrategy,
|
||||
GPTeacherPromptTokenizingStrategy,
|
||||
JeopardyPromptTokenizingStrategy,
|
||||
OpenAssistantPromptTokenizingStrategy,
|
||||
@@ -573,7 +574,7 @@ def get_dataset_wrapper(
|
||||
d_base_type,
|
||||
dataset,
|
||||
d_prompt_style=None,
|
||||
processor=None,
|
||||
processor=None, # pylint: disable=unused-argument
|
||||
):
|
||||
dataset_wrapper = None
|
||||
dataset_prompter = None
|
||||
@@ -608,15 +609,16 @@ def get_dataset_wrapper(
|
||||
)
|
||||
elif cfg.skip_prepare_dataset:
|
||||
dataset_wrapper = dataset
|
||||
elif ds_strategy := load(
|
||||
config_dataset.type, tokenizer, cfg, config_dataset, processor=processor
|
||||
):
|
||||
dataset_prompter = UnsupportedPrompter()
|
||||
dataset_wrapper = TokenizedPromptDataset(
|
||||
ds_strategy,
|
||||
dataset,
|
||||
**ds_kwargs,
|
||||
)
|
||||
elif ds_strategy := load(config_dataset.type, tokenizer, cfg, config_dataset):
|
||||
if isinstance(ds_strategy, DatasetWrappingStrategy):
|
||||
dataset_wrapper = ds_strategy.wrap_dataset(dataset, **ds_kwargs)
|
||||
else:
|
||||
dataset_prompter = UnsupportedPrompter()
|
||||
dataset_wrapper = TokenizedPromptDataset(
|
||||
ds_strategy,
|
||||
dataset,
|
||||
**ds_kwargs,
|
||||
)
|
||||
elif d_base_type == "alpaca":
|
||||
dataset_prompter = AlpacaPrompter(d_prompt_style)
|
||||
ds_strategy = AlpacaPromptTokenizingStrategy(
|
||||
|
||||
Reference in New Issue
Block a user