Add a config not to shuffle merged dataset (#1394) [skip ci]
* Add a config not to shuffle merged dataset * Update README.md * Update src/axolotl/utils/config/models/input/v0_4_1/__init__.py Co-authored-by: Wing Lian <wing.lian@gmail.com> * invert the condition name * update README * info -> debug --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>
This commit is contained in:
@@ -416,6 +416,7 @@ class AxolotlInputConfig(
|
||||
|
||||
datasets: Optional[conlist(Union[SFTDataset, DPODataset], min_length=1)] = None # type: ignore
|
||||
test_datasets: Optional[conlist(Union[SFTDataset, DPODataset], min_length=1)] = None # type: ignore
|
||||
shuffle_merged_datasets: Optional[bool] = True
|
||||
dataset_prepared_path: Optional[str] = None
|
||||
dataset_shard_num: Optional[int] = None
|
||||
dataset_shard_idx: Optional[int] = None
|
||||
|
||||
@@ -415,8 +415,11 @@ def load_tokenized_prepared_datasets(
|
||||
dataset = concatenate_datasets(datasets)
|
||||
|
||||
if len(datasets) > 1:
|
||||
LOG.info("shuffle merged datasets")
|
||||
dataset = dataset.shuffle(seed=seed)
|
||||
if cfg.shuffle_merged_datasets:
|
||||
LOG.debug("shuffle merged datasets")
|
||||
dataset = dataset.shuffle(seed=seed)
|
||||
else:
|
||||
LOG.debug("NOT shuffling merged datasets")
|
||||
|
||||
dataset, _ = process_datasets_for_packing(cfg, dataset, None)
|
||||
|
||||
@@ -819,7 +822,11 @@ def wrap_pretraining_dataset(
|
||||
else:
|
||||
encode = functools.partial(encode_pretraining, tokenizer, max_tokens)
|
||||
|
||||
dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)
|
||||
if cfg.shuffle_merged_datasets:
|
||||
dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)
|
||||
else:
|
||||
LOG.debug("NOT shuffling merged pretraining datasets")
|
||||
|
||||
dataset = dataset.map(
|
||||
encode,
|
||||
batched=True,
|
||||
|
||||
Reference in New Issue
Block a user