From 430be216d811d4cb0065cfe494b4623d1ce1180e Mon Sep 17 00:00:00 2001
From: NICOLAS BZRD <79255399+Nicolas-BZRD@users.noreply.github.com>
Date: Sun, 27 Jul 2025 23:04:56 +0200
Subject: [PATCH] add shuffle_before_merging_datasets option to allow
 independent shuffling of datasets before merging (#2981) [skip ci]

---
 .runpod/README.md                   | 17 +++++++++--------
 src/axolotl/utils/data/shared.py    |  6 ++++++
 src/axolotl/utils/schemas/config.py |  6 ++++++
 3 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/.runpod/README.md b/.runpod/README.md
index 60c661eef..2d24f1e5c 100644
--- a/.runpod/README.md
+++ b/.runpod/README.md
@@ -119,14 +119,15 @@ datasets:
 
 ## Dataset Processing
 
-| Option                        | Default                    | Description                       |
-| ----------------------------- | -------------------------- | --------------------------------- |
-| `dataset_prepared_path`       | `"data/last_run_prepared"` | Path for prepared dataset         |
-| `push_dataset_to_hub`         | `""`                       | Push dataset to HF hub            |
-| `dataset_processes`           | `4`                        | Number of preprocessing processes |
-| `dataset_keep_in_memory`      | `false`                    | Keep dataset in memory            |
-| `shuffle_merged_datasets`     | `true`                     | Shuffle merged datasets           |
-| `dataset_exact_deduplication` | `true`                     | Deduplicate datasets              |
+| Option                            | Default                    | Description                         |
+| --------------------------------- | -------------------------- | ----------------------------------- |
+| `dataset_prepared_path`           | `"data/last_run_prepared"` | Path for prepared dataset           |
+| `push_dataset_to_hub`             | `""`                       | Push dataset to HF hub              |
+| `dataset_processes`               | `4`                        | Number of preprocessing processes   |
+| `dataset_keep_in_memory`          | `false`                    | Keep dataset in memory              |
+| `shuffle_merged_datasets`         | `true`                     | Shuffle merged datasets             |
+| `shuffle_before_merging_datasets` | `false`                    | Shuffle each dataset before merging |
+| `dataset_exact_deduplication`     | `true`                     | Deduplicate datasets                |
 
 ## LoRA Configuration
 
diff --git a/src/axolotl/utils/data/shared.py b/src/axolotl/utils/data/shared.py
index bf7a30f48..7877e5abf 100644
--- a/src/axolotl/utils/data/shared.py
+++ b/src/axolotl/utils/data/shared.py
@@ -543,6 +543,12 @@ def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset:
 
         return ds.shuffle(seed=cfg.seed)
 
+    # If enabled, shuffle each dataset separately (every one with the same `cfg.seed`) before merging.
+    # This allows curriculum learning strategies to be applied at the dataset level.
+    if cfg.shuffle_before_merging_datasets:
+        LOG.info("Shuffling each dataset individually before merging...")
+        datasets = [ds.shuffle(seed=cfg.seed) for ds in datasets]
+
     LOG.info("Merging datasets...")
     merged_dataset = concatenate_datasets(datasets)
 
diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py
index a0e0b9604..0afeaa2a8 100644
--- a/src/axolotl/utils/schemas/config.py
+++ b/src/axolotl/utils/schemas/config.py
@@ -179,6 +179,12 @@ class AxolotlInputConfig(
             "description": "If false, the datasets will not be shuffled and will keep their original order in `datasets`. The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true."
         },
     )
+    shuffle_before_merging_datasets: bool | None = Field(
+        default=False,
+        json_schema_extra={
+            "description": "If true, each dataset in `datasets` will be shuffled before merging. This allows curriculum learning strategies to be applied at the dataset level. Default is false."
+        },
+    )
     dataset_prepared_path: str | None = Field(
         default=None,
         json_schema_extra={