From 430be216d811d4cb0065cfe494b4623d1ce1180e Mon Sep 17 00:00:00 2001 From: NICOLAS BZRD <79255399+Nicolas-BZRD@users.noreply.github.com> Date: Sun, 27 Jul 2025 23:04:56 +0200 Subject: [PATCH] add shuffle_before_merging_datasets option to allow independent shuffling of datasets before merging (#2981) [skip ci] --- .runpod/README.md | 17 +++++++++-------- src/axolotl/utils/data/shared.py | 6 ++++++ src/axolotl/utils/schemas/config.py | 6 ++++++ 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/.runpod/README.md b/.runpod/README.md index 60c661eef..2d24f1e5c 100644 --- a/.runpod/README.md +++ b/.runpod/README.md @@ -119,14 +119,15 @@ datasets: ## Dataset Processing -| Option | Default | Description | -| ----------------------------- | -------------------------- | --------------------------------- | -| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset | -| `push_dataset_to_hub` | `""` | Push dataset to HF hub | -| `dataset_processes` | `4` | Number of preprocessing processes | -| `dataset_keep_in_memory` | `false` | Keep dataset in memory | -| `shuffle_merged_datasets` | `true` | Shuffle merged datasets | -| `dataset_exact_deduplication` | `true` | Deduplicate datasets | +| Option | Default | Description | +| --------------------------------- | -------------------------- | ----------------------------------- | +| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset | +| `push_dataset_to_hub` | `""` | Push dataset to HF hub | +| `dataset_processes` | `4` | Number of preprocessing processes | +| `dataset_keep_in_memory` | `false` | Keep dataset in memory | +| `shuffle_merged_datasets` | `true` | Shuffle merged datasets | +| `shuffle_before_merging_datasets` | `false` | Shuffle each dataset before merging | +| `dataset_exact_deduplication` | `true` | Deduplicate datasets | ## LoRA Configuration diff --git a/src/axolotl/utils/data/shared.py b/src/axolotl/utils/data/shared.py index bf7a30f48..7877e5abf 100644 --- a/src/axolotl/utils/data/shared.py +++ b/src/axolotl/utils/data/shared.py @@ -543,6 +543,12 @@ def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset: return ds.shuffle(seed=cfg.seed) + # If enabled, shuffle each dataset independently before merging. + # This allows curriculum learning strategies to be applied at the dataset level. + if cfg.shuffle_before_merging_datasets: + LOG.info("Shuffling each dataset individually before merging...") + datasets = [ds.shuffle(seed=cfg.seed) for ds in datasets] + LOG.info("Merging datasets...") merged_dataset = concatenate_datasets(datasets) diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py index a0e0b9604..0afeaa2a8 100644 --- a/src/axolotl/utils/schemas/config.py +++ b/src/axolotl/utils/schemas/config.py @@ -179,6 +179,12 @@ class AxolotlInputConfig( "description": "If false, the datasets will not be shuffled and will keep their original order in `datasets`. The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true." }, ) + shuffle_before_merging_datasets: bool | None = Field( + default=False, + json_schema_extra={ + "description": "If true, each dataset in `datasets` will be shuffled before merging. This allows curriculum learning strategies to be applied at the dataset level. Default is false." + }, + ) dataset_prepared_path: str | None = Field( default=None, json_schema_extra={