Fix: tokenize stall due to not shuffling dataset (#2845)

* fix: shuffle dataset even if only one to fix tokenize stall

* fix: warn if shuffling merged with curriculum sampling

* chore: refactor
This commit is contained in:
NanoCode012
2025-07-02 19:06:00 +07:00
committed by GitHub
parent f2b352f2e5
commit 6383630155

View File

@@ -524,13 +524,24 @@ def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset:
Merged dataset.
"""
if len(datasets) == 1:
return datasets[0]
ds = datasets[0]
# Do not shuffle if curriculum sampling is enabled
if cfg.curriculum_sampling:
return ds
return ds.shuffle(seed=cfg.seed)
LOG.info("Merging datasets...")
merged_dataset = concatenate_datasets(datasets)
if cfg.shuffle_merged_datasets:
LOG.debug("Shuffling merged datasets...")
if cfg.curriculum_sampling:
LOG.warning(
"Shuffling merged datasets with curriculum sampling is not recommended. "
"This will randomize the order of samples."
)
merged_dataset = merged_dataset.shuffle(seed=cfg.seed)
else:
LOG.debug("Not shuffling merged datasets.")