optimize the iteration when tokenizing large datasets (#332)
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
"""Module containing data utilities"""
|
"""Module containing data utilities"""
|
||||||
import functools
|
import functools
|
||||||
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
from hashlib import md5
|
from hashlib import md5
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -264,8 +265,16 @@ def load_tokenized_prepared_datasets(
|
|||||||
LOG.info("tokenizing, merging, and shuffling master dataset")
|
LOG.info("tokenizing, merging, and shuffling master dataset")
|
||||||
|
|
||||||
samples: List[int] = []
|
samples: List[int] = []
|
||||||
|
chunk_size = 1000
|
||||||
for d in datasets:
|
for d in datasets:
|
||||||
samples = samples + list(d)
|
d_iter = iter(d)
|
||||||
|
while True:
|
||||||
|
chunk = list(itertools.islice(d_iter, chunk_size))
|
||||||
|
if not chunk:
|
||||||
|
break
|
||||||
|
samples.extend(chunk)
|
||||||
|
|
||||||
|
LOG.info("shuffle")
|
||||||
dataset = Dataset.from_list(samples).shuffle(seed=seed)
|
dataset = Dataset.from_list(samples).shuffle(seed=seed)
|
||||||
if cfg.local_rank == 0:
|
if cfg.local_rank == 0:
|
||||||
LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
|
LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
|
||||||
|
|||||||
Reference in New Issue
Block a user