From 70541145f169401540185dca5ddac94f94640fc6 Mon Sep 17 00:00:00 2001 From: Dan Saunders Date: Fri, 20 Dec 2024 21:43:33 -0500 Subject: [PATCH] adding test_datasets compat with pretraining_dataset (streaming) (#2206) [skip ci] --- src/axolotl/utils/data/sft.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/axolotl/utils/data/sft.py b/src/axolotl/utils/data/sft.py index 286e5f2d7..e2cb8f9f6 100644 --- a/src/axolotl/utils/data/sft.py +++ b/src/axolotl/utils/data/sft.py @@ -85,6 +85,7 @@ def prepare_dataset(cfg, tokenizer, processor=None): processor=processor, ) else: + # Load streaming dataset if pretraining_dataset is given path = cfg.pretraining_dataset split = "train" name = None @@ -116,7 +117,18 @@ def prepare_dataset(cfg, tokenizer, processor=None): ) # https://discuss.huggingface.co/t/how-to-use-huggingface-trainer-streaming-datasets-without-wrapping-it-with-torchdatas-iterablewrapper/25230 train_dataset = train_dataset.with_format("torch") + + # Load eval dataset (non-streaming) if specified eval_dataset = None + if cfg.test_datasets: + _, eval_dataset, _ = load_prepare_datasets( + tokenizer, + cfg, + DEFAULT_DATASET_PREPARED_PATH, + split="test", + processor=processor, + ) + if cfg.dataset_exact_deduplication: LOG.info("Deduplication not available for pretrained datasets")