From fe285430bc26f34cccfdf7aec3b437a7eea0b85c Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 4 Aug 2023 12:12:05 -0400 Subject: [PATCH] optimize the iteration when tokenizing large datasets (#332) --- src/axolotl/utils/data.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index ef732a8ad..f653e8009 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -1,5 +1,6 @@ """Module containing data utilities""" import functools +import itertools import logging from hashlib import md5 from pathlib import Path @@ -264,8 +265,16 @@ def load_tokenized_prepared_datasets( LOG.info("tokenizing, merging, and shuffling master dataset") samples: List[int] = [] + chunk_size = 1000 for d in datasets: - samples = samples + list(d) + d_iter = iter(d) + while True: + chunk = list(itertools.islice(d_iter, chunk_size)) + if not chunk: + break + samples.extend(chunk) + + LOG.info("shuffle") dataset = Dataset.from_list(samples).shuffle(seed=seed) if cfg.local_rank == 0: LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")