wip add new proposed message structure (#1904)

* wip add new proposed message structure

* tokenization

* wip

* wip transform builder

* wip make the chat dataset loadable

* wip chatml + llama 3 new chat objects

* chore: lint

* chore: lint

* fix tokenization

* remove dacite dependency since we're using pydantic now

* fix handling when already correctly split in messages

* make sure to remove chat features from tokenized ds

* move chat to be a input transform for messages

* make sure llama3 has the bos token

* remove non-working special token code

* fix messages strat loader
This commit is contained in:
Wing Lian
2024-10-13 12:15:18 -04:00
committed by GitHub
parent 1834cdc364
commit cd2d89f467
23 changed files with 1285 additions and 15 deletions

View File

@@ -23,6 +23,7 @@ from axolotl.prompt_tokenizers import (
AlpacaMultipleChoicePromptTokenizingStrategy,
AlpacaPromptTokenizingStrategy,
AlpacaReflectionPTStrategy,
DatasetWrappingStrategy,
GPTeacherPromptTokenizingStrategy,
JeopardyPromptTokenizingStrategy,
OpenAssistantPromptTokenizingStrategy,
@@ -573,7 +574,7 @@ def get_dataset_wrapper(
d_base_type,
dataset,
d_prompt_style=None,
processor=None,
processor=None, # pylint: disable=unused-argument
):
dataset_wrapper = None
dataset_prompter = None
@@ -608,15 +609,16 @@ def get_dataset_wrapper(
)
elif cfg.skip_prepare_dataset:
dataset_wrapper = dataset
elif ds_strategy := load(
config_dataset.type, tokenizer, cfg, config_dataset, processor=processor
):
dataset_prompter = UnsupportedPrompter()
dataset_wrapper = TokenizedPromptDataset(
ds_strategy,
dataset,
**ds_kwargs,
)
elif ds_strategy := load(config_dataset.type, tokenizer, cfg, config_dataset):
if isinstance(ds_strategy, DatasetWrappingStrategy):
dataset_wrapper = ds_strategy.wrap_dataset(dataset, **ds_kwargs)
else:
dataset_prompter = UnsupportedPrompter()
dataset_wrapper = TokenizedPromptDataset(
ds_strategy,
dataset,
**ds_kwargs,
)
elif d_base_type == "alpaca":
dataset_prompter = AlpacaPrompter(d_prompt_style)
ds_strategy = AlpacaPromptTokenizingStrategy(