Add test_datasets compatibility with pretraining_dataset (streaming) (#2206) [skip ci]

2024-12-20 21:43:33 -05:00
parent 42bd32a233
commit 70541145f1
1 changed file with 12 additions and 0 deletions
--- a/src/axolotl/utils/data/sft.py
+++ b/src/axolotl/utils/data/sft.py
@@ -85,6 +85,7 @@ def prepare_dataset(cfg, tokenizer, processor=None):
                    processor=processor,
                )
    else:
        # Load streaming dataset if pretraining_dataset is given
        path = cfg.pretraining_dataset
        split = "train"
        name = None
@@ -116,7 +117,18 @@ def prepare_dataset(cfg, tokenizer, processor=None):
        )
        # https://discuss.huggingface.co/t/how-to-use-huggingface-trainer-streaming-datasets-without-wrapping-it-with-torchdatas-iterablewrapper/25230
        train_dataset = train_dataset.with_format("torch")
        # Load eval dataset (non-streaming) if specified
        eval_dataset = None
        if cfg.test_datasets:
            _, eval_dataset, _ = load_prepare_datasets(
                tokenizer,
                cfg,
                DEFAULT_DATASET_PREPARED_PATH,
                split="test",
                processor=processor,
            )
        if cfg.dataset_exact_deduplication:
            LOG.info("Deduplication not available for pretrained datasets")