From 4f2584f2dc39bd50aa1a0bff191457829042ac89 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 20 Apr 2023 14:39:47 -0400 Subject: [PATCH] shuffle and split dataset after save/load --- FAQS.md | 2 +- ds_config.json | 10 +++++----- src/axolotl/utils/data.py | 9 ++++----- src/axolotl/utils/models.py | 2 +- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/FAQS.md b/FAQS.md index 7eb741482..bdf056be7 100644 --- a/FAQS.md +++ b/FAQS.md @@ -1,4 +1,4 @@ # FAQs - Can you train StableLM with this? Yes, but only with a single GPU atm. Multi GPU support is coming soon! Just waiting on this [PR](https://github.com/huggingface/transformers/pull/22874) -- +- Will this work with Deepspeed? That's still a WIP, but setting `export ACCELERATE_USE_DEEPSPEED=true` should work in some cases diff --git a/ds_config.json b/ds_config.json index 05fc98177..ffd6f2075 100644 --- a/ds_config.json +++ b/ds_config.json @@ -11,11 +11,10 @@ "min_loss_scale": 1 }, "scheduler": { - "type": "WarmupLR", + "type": "OneCycle", "params": { - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto" + "cycle_min_lr": 1e-7, + "cycle_max_lr": 1e-4 } }, "zero_optimization": { @@ -25,7 +24,8 @@ "allgather_bucket_size": 5e8, "contiguous_gradients": true, "reduce_bucket_size": "auto", - "reduce_scatter": true + "reduce_scatter": true, + "stage3_gather_16bit_weights_on_model_save": true }, "gradient_accumulation_steps": "auto", "gradient_clipping": "auto", diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index 081f1d851..cb8ba93bf 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -119,16 +119,15 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path): seq_length=max_packed_sequence_len, ) logging.info("merging, packing, shuffling, and splitting master dataset") - # TODO don't split dataset here, shuffle and save first, then split, that way we can - # re-split when loading again - dataset = Dataset.from_list([_ for _ in constant_len_dataset]).train_test_split( - test_size=cfg.val_set_size, shuffle=True, seed=42 - ) + dataset = Dataset.from_list([_ for _ in constant_len_dataset]).shuffle(seed=42) if cfg.local_rank == 0: logging.info(f"Saving prepared dataset to disk... {prepared_ds_path}") dataset.save_to_disk(prepared_ds_path) + dataset = dataset.train_test_split( + test_size=cfg.val_set_size, shuffle=False + ) train_dataset = dataset["train"] eval_dataset = dataset["test"] diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 750f394b5..eb54eba80 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -75,7 +75,7 @@ def load_model( snapshot_download_kwargs = {} if cfg.base_model_ignore_patterns: snapshot_download_kwargs["ignore_patterns"] = cfg.base_model_ignore_patterns - cache_model_path = Path(snapshot_download(base_model, ** snapshot_download_kwargs)) + cache_model_path = Path(snapshot_download(base_model, **snapshot_download_kwargs)) files = ( list(cache_model_path.glob("*.pt")) + list(cache_model_path.glob("*.safetensors"))