optimize the iteration when tokenizing large datasets (#332)

This commit is contained in:
Wing Lian
2023-08-04 12:12:05 -04:00
committed by GitHub
parent 0d2e34f056
commit fe285430bc

View File

@@ -1,5 +1,6 @@
"""Module containing data utilities""" """Module containing data utilities"""
import functools import functools
import itertools
import logging import logging
from hashlib import md5 from hashlib import md5
from pathlib import Path from pathlib import Path
@@ -264,8 +265,16 @@ def load_tokenized_prepared_datasets(
LOG.info("tokenizing, merging, and shuffling master dataset") LOG.info("tokenizing, merging, and shuffling master dataset")
samples: List[int] = [] samples: List[int] = []
chunk_size = 1000
for d in datasets: for d in datasets:
samples = samples + list(d) d_iter = iter(d)
while True:
chunk = list(itertools.islice(d_iter, chunk_size))
if not chunk:
break
samples.extend(chunk)
LOG.info("shuffle")
dataset = Dataset.from_list(samples).shuffle(seed=seed) dataset = Dataset.from_list(samples).shuffle(seed=seed)
if cfg.local_rank == 0: if cfg.local_rank == 0:
LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}") LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")