add shuffle_before_merging_datasets option to allow independent shuffling of datasets before merging (#2981) [skip ci]
This commit is contained in:
@@ -119,14 +119,15 @@ datasets:
|
||||
|
||||
## Dataset Processing
|
||||
|
||||
| Option | Default | Description |
|
||||
| ----------------------------- | -------------------------- | --------------------------------- |
|
||||
| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset |
|
||||
| `push_dataset_to_hub` | `""` | Push dataset to HF hub |
|
||||
| `dataset_processes` | `4` | Number of preprocessing processes |
|
||||
| `dataset_keep_in_memory` | `false` | Keep dataset in memory |
|
||||
| `shuffle_merged_datasets` | `true` | Shuffle merged datasets |
|
||||
| `dataset_exact_deduplication` | `true` | Deduplicate datasets |
|
||||
| Option | Default | Description |
|
||||
| --------------------------------- | -------------------------- | ----------------------------------- |
|
||||
| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset |
|
||||
| `push_dataset_to_hub` | `""` | Push dataset to HF hub |
|
||||
| `dataset_processes` | `4` | Number of preprocessing processes |
|
||||
| `dataset_keep_in_memory` | `false` | Keep dataset in memory |
|
||||
| `shuffle_merged_datasets` | `true` | Shuffle merged datasets |
|
||||
| `shuffle_before_merging_datasets` | `false` | Shuffle each dataset before merging |
|
||||
| `dataset_exact_deduplication` | `true` | Deduplicate datasets |
|
||||
|
||||
## LoRA Configuration
|
||||
|
||||
|
||||
@@ -543,6 +543,12 @@ def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset:
|
||||
|
||||
return ds.shuffle(seed=cfg.seed)
|
||||
|
||||
# If enabled, shuffle each dataset independently before merging.
|
||||
# This allows curriculum learning strategies to be applied at the dataset level.
|
||||
if cfg.shuffle_before_merging_datasets:
|
||||
LOG.info("Shuffling each dataset individually before merging...")
|
||||
datasets = [ds.shuffle(seed=cfg.seed) for ds in datasets]
|
||||
|
||||
LOG.info("Merging datasets...")
|
||||
merged_dataset = concatenate_datasets(datasets)
|
||||
|
||||
|
||||
@@ -179,6 +179,12 @@ class AxolotlInputConfig(
|
||||
"description": "If false, the datasets will not be shuffled and will keep their original order in `datasets`. The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true."
|
||||
},
|
||||
)
|
||||
shuffle_before_merging_datasets: bool | None = Field(
|
||||
default=False,
|
||||
json_schema_extra={
|
||||
"description": "If true, each dataset in `datasets` will be shuffled before merging. This allows curriculum learning strategies to be applied at the dataset level. Default is false."
|
||||
},
|
||||
)
|
||||
dataset_prepared_path: str | None = Field(
|
||||
default=None,
|
||||
json_schema_extra={
|
||||
|
||||
Reference in New Issue
Block a user