add shuffle_before_merging_datasets option to allow independent shuffling of datasets before merging (#2981) [skip ci]

2025-07-27 23:04:56 +02:00
parent 28804b82e4
commit 430be216d8
3 changed files with 21 additions and 8 deletions
--- a/src/axolotl/utils/data/shared.py
+++ b/src/axolotl/utils/data/shared.py
@@ -543,6 +543,12 @@ def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset:

        return ds.shuffle(seed=cfg.seed)

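+    # If enabled, shuffle each dataset on its own (every dataset uses the same
+    # `cfg.seed`) before merging. Samples are reordered only within their own
+    # dataset, and the concatenation below keeps the datasets in list order, so
+    # curriculum learning strategies can be applied at the dataset level.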
+    if cfg.shuffle_before_merging_datasets:
+        LOG.info("Shuffling each dataset individually before merging...")
+        datasets = [ds.shuffle(seed=cfg.seed) for ds in datasets]
+
    LOG.info("Merging datasets...")
    merged_dataset = concatenate_datasets(datasets)

--- a/src/axolotl/utils/schemas/config.py
+++ b/src/axolotl/utils/schemas/config.py
@@ -179,6 +179,12 @@ class AxolotlInputConfig(
            "description": "If false, the datasets will not be shuffled and will keep their original order in `datasets`. The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true."
        },
    )
+    shuffle_before_merging_datasets: bool | None = Field(
+        default=False,
+        json_schema_extra={
+            "description": "If true, each dataset in `datasets` will be shuffled before merging. This allows curriculum learning strategies to be applied at the dataset level. Default is false."
+        },
+    )
    dataset_prepared_path: str | None = Field(
        default=None,
        json_schema_extra={