Fix: tokenize stall due to not shuffling dataset (#2845)

* fix: shuffle the dataset even when there is only one, to fix the tokenize stall
* fix: warn if shuffling merged datasets is combined with curriculum sampling
* chore: refactor
2025-07-02 19:06:00 +07:00
parent f2b352f2e5
commit 6383630155
1 changed files with 12 additions and 1 deletions
--- a/src/axolotl/utils/data/shared.py
+++ b/src/axolotl/utils/data/shared.py
@@ -524,13 +524,24 @@ def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset:
        Merged dataset.
    """
    if len(datasets) == 1:
-        return datasets[0]
+        ds = datasets[0]
+
+        # Do not shuffle if curriculum sampling is enabled
+        if cfg.curriculum_sampling:
+            return ds
+
+        return ds.shuffle(seed=cfg.seed)

    LOG.info("Merging datasets...")
    merged_dataset = concatenate_datasets(datasets)

    if cfg.shuffle_merged_datasets:
        LOG.debug("Shuffling merged datasets...")
+        if cfg.curriculum_sampling:
+            LOG.warning(
+                "Shuffling merged datasets with curriculum sampling is not recommended. "
+                "This will randomize the order of samples."
+            )
        merged_dataset = merged_dataset.shuffle(seed=cfg.seed)
    else:
        LOG.debug("Not shuffling merged datasets.")