fix: crash when using pretraining_dataset with dispatch_batches set to false (#2558)

This commit is contained in:
Chiwan Park
2025-04-26 06:15:03 +09:00
committed by GitHub
parent 9eba0ad118
commit e3c9d541a7

View File

@@ -134,10 +134,9 @@ def prepare_dataset(cfg, tokenizer, processor=None, preprocess_iterable=None):
                     "csv", data_files=f.name, split="train", streaming=True
                 )
         else:
-            if is_local_main_process():
-                iter_ds = load_dataset(
-                    path, streaming=True, split=split, name=name, data_files=data_files
-                )
+            iter_ds = load_dataset(
+                path, streaming=True, split=split, name=name, data_files=data_files
+            )
         if skip:
             LOG.info(f"Skipping {skip} samples from the dataset")