From 6383630155ee5b65ec14fd1b185385a1d90e884c Mon Sep 17 00:00:00 2001
From: NanoCode012
Date: Wed, 2 Jul 2025 19:06:00 +0700
Subject: [PATCH] Fix: tokenize stall due to not shuffling dataset (#2845)

* fix: shuffle dataset even if only one to fix tokenize stall

* fix: warn if shuffling merged with curriculum sampling

* chore: refactor
---
 src/axolotl/utils/data/shared.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/axolotl/utils/data/shared.py b/src/axolotl/utils/data/shared.py
index 3c58b4c85..a537c5b65 100644
--- a/src/axolotl/utils/data/shared.py
+++ b/src/axolotl/utils/data/shared.py
@@ -524,13 +524,24 @@ def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset:
         Merged dataset.
     """
     if len(datasets) == 1:
-        return datasets[0]
+        ds = datasets[0]
+
+        # Do not shuffle if curriculum sampling is enabled
+        if cfg.curriculum_sampling:
+            return ds
+
+        return ds.shuffle(seed=cfg.seed)
 
     LOG.info("Merging datasets...")
     merged_dataset = concatenate_datasets(datasets)
 
     if cfg.shuffle_merged_datasets:
         LOG.debug("Shuffling merged datasets...")
+        if cfg.curriculum_sampling:
+            LOG.warning(
+                "Shuffling merged datasets with curriculum sampling is not recommended. "
+                "This will randomize the order of samples."
+            )
         merged_dataset = merged_dataset.shuffle(seed=cfg.seed)
     else:
         LOG.debug("Not shuffling merged datasets.")