Add Exact Deduplication Feature to Preprocessing Pipeline (#2072)
* Add example YAML file for training Mistral using DPO
* added deduplication code
* Add exact deduplication feature and update examples
* Improve deduplication for train/eval overlap

  Changed the deduplication function to use a more memory-efficient hashing method. Applied Git suggestions to improve clarity and maintainability.

  The deduplication now handles cases where train and eval datasets have overlapping elements.
* Improve deduplication for train/eval overlap

  Changed the deduplication function to use a more memory-efficient hashing method. Applied Git suggestions to improve clarity and maintainability.

  The deduplication now handles cases where train and eval datasets have overlapping elements.
* Apply suggestions from code review

  To handle the original case where we do not do deduplication

  Co-authored-by: Wing Lian <wing.lian@gmail.com>
* Improve false collision detection to ensure dataset integrity
  - Added test cases to simulate and verify handling of forced hash collisions between datasets.
  - Ensured that datasets with identical hashes but different content are correctly identified, preventing incorrect deduplication.
  - Updated unit tests to include scenarios where collisions occur across both training and evaluation datasets, as well as within a single dataset.
* Moved the constants file to the tests folder
  - Relocated `constants.py` to the `tests` folder to improve modularity and maintain a clear separation between source and test files.
  - Renamed `cicd/tests.py` to `cicd/cicd_tests.py` to resolve a conflict with `tests/__init__.py`, which caused Mypy to fail due to duplicate module names.
  - Updated all references to `cicd.tests` in the codebase to `cicd.cicd_tests` to reflect the renaming and ensure compatibility.
  - These changes ensure Mypy passes the pre-commit hook and maintain alignment with the project's structure.
* revert some changes from previous commit and fix relative import

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
Co-authored-by: Wing Lian <wing@axolotl.ai>
This commit is contained in:
committed by
GitHub
parent
5f1d98e8fc
commit
b620ed94d0
@@ -44,7 +44,7 @@ from axolotl.prompters import (
|
||||
UnsupportedPrompter,
|
||||
)
|
||||
from axolotl.utils.data.pretraining import wrap_pretraining_dataset
|
||||
from axolotl.utils.data.utils import md5
|
||||
from axolotl.utils.data.utils import deduplicate_and_log_datasets, md5
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.distributed import is_local_main_process, zero_first
|
||||
from axolotl.utils.trainer import (
|
||||
@@ -136,8 +136,9 @@ def prepare_dataset(cfg, tokenizer, processor=None):
|
||||
# https://discuss.huggingface.co/t/how-to-use-huggingface-trainer-streaming-datasets-without-wrapping-it-with-torchdatas-iterablewrapper/25230
|
||||
train_dataset = train_dataset.with_format("torch")
|
||||
eval_dataset = None
|
||||
if cfg.dataset_exact_deduplication:
|
||||
LOG.info("Deduplication not available for pretrained datasets")
|
||||
return train_dataset, eval_dataset, cfg.max_steps, prompters
|
||||
|
||||
if eval_dataset and cfg.sample_packing and cfg.eval_sample_packing is not False:
|
||||
total_eval_steps = calculate_total_num_steps(cfg, eval_dataset, update=False)
|
||||
if total_eval_steps == 0:
|
||||
@@ -178,7 +179,7 @@ def load_tokenized_prepared_datasets(
|
||||
+ "|".join(
|
||||
sorted(
|
||||
[
|
||||
f"{d.path}:{d.type}:{d.shards}:{d.conversation}{d.split}"
|
||||
f"{d.path}: {d.type}: {d.shards}: {d.conversation}{d.split}"
|
||||
for d in cfg_datasets
|
||||
]
|
||||
)
|
||||
@@ -584,7 +585,8 @@ def load_prepare_datasets(
|
||||
)
|
||||
train_fingerprint = md5(to_hash_train)
|
||||
test_fingerprint = md5(to_hash_test)
|
||||
|
||||
if cfg.dataset_exact_deduplication:
|
||||
_, _, dataset = deduplicate_and_log_datasets(dataset=dataset)
|
||||
dataset = dataset.train_test_split(
|
||||
test_size=val_set_size,
|
||||
shuffle=False,
|
||||
@@ -596,12 +598,17 @@ def load_prepare_datasets(
|
||||
train_dataset = dataset["train"]
|
||||
eval_dataset = dataset["test"]
|
||||
elif split == "test":
|
||||
if cfg.dataset_exact_deduplication:
|
||||
_, eval_dataset, _ = deduplicate_and_log_datasets(eval_dataset=dataset)
|
||||
else:
|
||||
eval_dataset = dataset
|
||||
train_dataset = None
|
||||
eval_dataset = dataset
|
||||
else:
|
||||
train_dataset = dataset
|
||||
if cfg.dataset_exact_deduplication:
|
||||
train_dataset, _, _ = deduplicate_and_log_datasets(train_dataset=dataset)
|
||||
else:
|
||||
train_dataset = dataset
|
||||
eval_dataset = None
|
||||
|
||||
return train_dataset, eval_dataset, prompters
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user