add shuffle_before_merging_datasets option to allow independent shuffling of datasets before merging (#2981) [skip ci]
This commit is contained in:
@@ -543,6 +543,12 @@ def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset:
|
||||
|
||||
return ds.shuffle(seed=cfg.seed)
|
||||
|
||||
# If enabled, shuffle each dataset independently before merging.
|
||||
# This allows curriculum learning strategies to be applied at the dataset level.
|
||||
if cfg.shuffle_before_merging_datasets:
|
||||
LOG.info("Shuffling each dataset individually before merging...")
|
||||
datasets = [ds.shuffle(seed=cfg.seed) for ds in datasets]
|
||||
|
||||
LOG.info("Merging datasets...")
|
||||
merged_dataset = concatenate_datasets(datasets)
|
||||
|
||||
|
||||
@@ -179,6 +179,12 @@ class AxolotlInputConfig(
|
||||
"description": "If false, the datasets will not be shuffled and will keep their original order in `datasets`. The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true."
|
||||
},
|
||||
)
|
||||
shuffle_before_merging_datasets: bool | None = Field(
|
||||
default=False,
|
||||
json_schema_extra={
|
||||
"description": "If true, each dataset in `datasets` will be shuffled before merging. This allows curriculum learning strategies to be applied at the dataset level. Default is false."
|
||||
},
|
||||
)
|
||||
dataset_prepared_path: str | None = Field(
|
||||
default=None,
|
||||
json_schema_extra={
|
||||
|
||||
Reference in New Issue
Block a user