shuffle and split dataset after save/load

This commit is contained in:
Wing Lian
2023-04-20 14:39:47 -04:00
parent 8d437853c8
commit 4f2584f2dc
4 changed files with 11 additions and 12 deletions

View File

@@ -1,4 +1,4 @@
# FAQs # FAQs
- Can you train StableLM with this? Yes, but only with a single GPU at the moment. Multi-GPU support is coming soon! Just waiting on this [PR](https://github.com/huggingface/transformers/pull/22874). - Can you train StableLM with this? Yes, but only with a single GPU at the moment. Multi-GPU support is coming soon! Just waiting on this [PR](https://github.com/huggingface/transformers/pull/22874).
- - Will this work with DeepSpeed? That's still a WIP, but setting `export ACCELERATE_USE_DEEPSPEED=true` should work in some cases.

View File

@@ -11,11 +11,10 @@
"min_loss_scale": 1 "min_loss_scale": 1
}, },
"scheduler": { "scheduler": {
"type": "WarmupLR", "type": "OneCycle",
"params": { "params": {
"warmup_min_lr": "auto", "cycle_min_lr": 1e-7,
"warmup_max_lr": "auto", "cycle_max_lr": 1e-4
"warmup_num_steps": "auto"
} }
}, },
"zero_optimization": { "zero_optimization": {
@@ -25,7 +24,8 @@
"allgather_bucket_size": 5e8, "allgather_bucket_size": 5e8,
"contiguous_gradients": true, "contiguous_gradients": true,
"reduce_bucket_size": "auto", "reduce_bucket_size": "auto",
"reduce_scatter": true "reduce_scatter": true,
"stage3_gather_16bit_weights_on_model_save": true
}, },
"gradient_accumulation_steps": "auto", "gradient_accumulation_steps": "auto",
"gradient_clipping": "auto", "gradient_clipping": "auto",

View File

@@ -119,16 +119,15 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
seq_length=max_packed_sequence_len, seq_length=max_packed_sequence_len,
) )
logging.info("merging, packing, shuffling, and splitting master dataset") logging.info("merging, packing, shuffling, and splitting master dataset")
# TODO don't split dataset here, shuffle and save first, then split, that way we can dataset = Dataset.from_list([_ for _ in constant_len_dataset]).shuffle(seed=42)
# re-split when loading again
dataset = Dataset.from_list([_ for _ in constant_len_dataset]).train_test_split(
test_size=cfg.val_set_size, shuffle=True, seed=42
)
if cfg.local_rank == 0: if cfg.local_rank == 0:
logging.info(f"Saving prepared dataset to disk... {prepared_ds_path}") logging.info(f"Saving prepared dataset to disk... {prepared_ds_path}")
dataset.save_to_disk(prepared_ds_path) dataset.save_to_disk(prepared_ds_path)
dataset = dataset.train_test_split(
test_size=cfg.val_set_size, shuffle=False
)
train_dataset = dataset["train"] train_dataset = dataset["train"]
eval_dataset = dataset["test"] eval_dataset = dataset["test"]

View File

@@ -75,7 +75,7 @@ def load_model(
snapshot_download_kwargs = {} snapshot_download_kwargs = {}
if cfg.base_model_ignore_patterns: if cfg.base_model_ignore_patterns:
snapshot_download_kwargs["ignore_patterns"] = cfg.base_model_ignore_patterns snapshot_download_kwargs["ignore_patterns"] = cfg.base_model_ignore_patterns
cache_model_path = Path(snapshot_download(base_model, ** snapshot_download_kwargs)) cache_model_path = Path(snapshot_download(base_model, **snapshot_download_kwargs))
files = ( files = (
list(cache_model_path.glob("*.pt")) list(cache_model_path.glob("*.pt"))
+ list(cache_model_path.glob("*.safetensors")) + list(cache_model_path.glob("*.safetensors"))