diff --git a/README.md b/README.md
index 0960d3cdf..7f092d308 100644
--- a/README.md
+++ b/README.md
@@ -691,9 +691,11 @@ warmup_ratio: 0.05 # cannot use with warmup_steps
 learning_rate: 0.00003
 lr_quadratic_warmup:
 logging_steps:
+eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
+evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
 save_strategy: # Set to `no` to skip checkpoint saves
 save_steps: # Leave empty to save at each epoch
-eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
+saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
 save_total_limit: # Checkpoints saved at a time
 # Maximum number of iterations to train for. It precedes num_epochs which means that
 # if both are set, num_epochs will not be guaranteed.
diff --git a/examples/cerebras/btlm-ft.yml b/examples/cerebras/btlm-ft.yml
index 4c85a4c55..d0975214b 100644
--- a/examples/cerebras/btlm-ft.yml
+++ b/examples/cerebras/btlm-ft.yml
@@ -72,8 +72,8 @@ gptq_groupsize:
 gptq_model_v1:

 warmup_steps: 32
-eval_steps:
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 save_total_limit:

 debug:
diff --git a/examples/cerebras/qlora.yml b/examples/cerebras/qlora.yml
index 7b640fc27..03155c6c2 100644
--- a/examples/cerebras/qlora.yml
+++ b/examples/cerebras/qlora.yml
@@ -49,8 +49,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
diff --git a/examples/code-llama/13b/lora.yml b/examples/code-llama/13b/lora.yml
index 45f66e02d..fc43ad14e 100644
--- a/examples/code-llama/13b/lora.yml
+++ b/examples/code-llama/13b/lora.yml
@@ -54,8 +54,8 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/code-llama/13b/qlora.yml b/examples/code-llama/13b/qlora.yml
index 79c684f88..06b9ac72f 100644
--- a/examples/code-llama/13b/qlora.yml
+++ b/examples/code-llama/13b/qlora.yml
@@ -56,8 +56,8 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/code-llama/34b/lora.yml b/examples/code-llama/34b/lora.yml
index 809e00710..c2f1d5ce1 100644
--- a/examples/code-llama/34b/lora.yml
+++ b/examples/code-llama/34b/lora.yml
@@ -54,8 +54,8 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/code-llama/34b/qlora.yml b/examples/code-llama/34b/qlora.yml
index ed927e51e..ad1e21675 100644
--- a/examples/code-llama/34b/qlora.yml
+++ b/examples/code-llama/34b/qlora.yml
@@ -56,8 +56,8 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/code-llama/7b/lora.yml b/examples/code-llama/7b/lora.yml
index 37d6ae3b7..630c8da6f 100644
--- a/examples/code-llama/7b/lora.yml
+++ b/examples/code-llama/7b/lora.yml
@@ -54,8 +54,8 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/code-llama/7b/qlora.yml b/examples/code-llama/7b/qlora.yml
index 491e07c98..12462dcb7 100644
--- a/examples/code-llama/7b/qlora.yml
+++ b/examples/code-llama/7b/qlora.yml
@@ -56,8 +56,8 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/falcon/config-7b-lora.yml b/examples/falcon/config-7b-lora.yml
index ef5eec1b7..13bad9425 100644
--- a/examples/falcon/config-7b-lora.yml
+++ b/examples/falcon/config-7b-lora.yml
@@ -51,8 +51,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 40
-eval_steps: 5
-save_steps: 43
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/falcon/config-7b-qlora.yml b/examples/falcon/config-7b-qlora.yml
index 03e2e3388..a89124bb8 100644
--- a/examples/falcon/config-7b-qlora.yml
+++ b/examples/falcon/config-7b-qlora.yml
@@ -80,8 +80,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
-eval_steps: 5
-save_steps: 10
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.000001
diff --git a/examples/falcon/config-7b.yml b/examples/falcon/config-7b.yml
index bf66d63e0..ff37dcf85 100644
--- a/examples/falcon/config-7b.yml
+++ b/examples/falcon/config-7b.yml
@@ -51,8 +51,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 40
-eval_steps: 5
-save_steps: 43
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/gptj/qlora.yml b/examples/gptj/qlora.yml
index 0e79bcd1d..700d10e67 100644
--- a/examples/gptj/qlora.yml
+++ b/examples/gptj/qlora.yml
@@ -46,8 +46,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
diff --git a/examples/jeopardy-bot/config.yml b/examples/jeopardy-bot/config.yml
index a0144ec51..ac8814b0b 100644
--- a/examples/jeopardy-bot/config.yml
+++ b/examples/jeopardy-bot/config.yml
@@ -42,8 +42,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-eval_steps: 110
-save_steps: 660
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
diff --git a/examples/llama-2/fft_optimized.yml b/examples/llama-2/fft_optimized.yml
index e1c17c796..5530283bf 100644
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -58,9 +58,9 @@ flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true

 warmup_steps: 100
-eval_steps: 0.05
+evals_per_epoch: 4
 eval_table_size:
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed: #deepspeed/zero2.json # multi-gpu only
 weight_decay: 0.1
diff --git a/examples/llama-2/gptq-lora.yml b/examples/llama-2/gptq-lora.yml
index c22f4f08f..a3235c1fb 100644
--- a/examples/llama-2/gptq-lora.yml
+++ b/examples/llama-2/gptq-lora.yml
@@ -62,8 +62,8 @@ flash_attention:
 sdp_attention:
 flash_optimum:
 warmup_steps: 100
-eval_steps:
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
diff --git a/examples/llama-2/lora.yml b/examples/llama-2/lora.yml
index 4dfeb0079..afb7dcd06 100644
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -54,10 +54,10 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-eval_steps: 0.05
+evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/llama-2/qlora.yml b/examples/llama-2/qlora.yml
index 7e453e7a1..d68882d6a 100644
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -56,9 +56,9 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-eval_steps: 0.05
+evals_per_epoch: 4
 eval_table_size:
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/llama-2/relora.yml b/examples/llama-2/relora.yml
index 9c9f6d6f4..ff76ddbea 100644
--- a/examples/llama-2/relora.yml
+++ b/examples/llama-2/relora.yml
@@ -60,8 +60,8 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-eval_steps: 0.05
-save_steps: 50
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/llama-2/tiny-llama.yml b/examples/llama-2/tiny-llama.yml
index c3af7e827..c72db4e5b 100644
--- a/examples/llama-2/tiny-llama.yml
+++ b/examples/llama-2/tiny-llama.yml
@@ -54,9 +54,9 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-eval_steps: 0.05
+evals_per_epoch: 4
 eval_table_size:
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/mamba/config.yml b/examples/mamba/config.yml
index c2e5a851f..946bbe731 100644
--- a/examples/mamba/config.yml
+++ b/examples/mamba/config.yml
@@ -47,10 +47,10 @@ xformers_attention:
 flash_attention:

 warmup_steps: 10
-eval_steps:
+evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps: 0.25
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/mistral/config.yml b/examples/mistral/config.yml
index 4d116c9f8..1c37b05c1 100644
--- a/examples/mistral/config.yml
+++ b/examples/mistral/config.yml
@@ -46,10 +46,10 @@ xformers_attention:
 flash_attention: true

 warmup_steps: 10
-eval_steps: 0.05
+evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/mistral/mixtral.yml b/examples/mistral/mixtral.yml
index f650efdd4..6e080e226 100644
--- a/examples/mistral/mixtral.yml
+++ b/examples/mistral/mixtral.yml
@@ -67,10 +67,10 @@ loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3

 warmup_steps: 10
-eval_steps:
+evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed: deepspeed/zero2.json
 weight_decay: 0.0
diff --git a/examples/mistral/qlora.yml b/examples/mistral/qlora.yml
index 8e870da46..64b26f4fa 100644
--- a/examples/mistral/qlora.yml
+++ b/examples/mistral/qlora.yml
@@ -66,10 +66,10 @@ loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3

 warmup_steps: 10
-eval_steps: 0.05
+evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/mpt-7b/config.yml b/examples/mpt-7b/config.yml
index 72f4e043e..bc36b1c60 100644
--- a/examples/mpt-7b/config.yml
+++ b/examples/mpt-7b/config.yml
@@ -44,8 +44,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-eval_steps: 110
-save_steps: 660
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0001
diff --git a/examples/openllama-3b/config.yml b/examples/openllama-3b/config.yml
index 7809ec3d8..0a404c79d 100644
--- a/examples/openllama-3b/config.yml
+++ b/examples/openllama-3b/config.yml
@@ -49,8 +49,8 @@ flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
diff --git a/examples/openllama-3b/lora.yml b/examples/openllama-3b/lora.yml
index bddb777f8..4fbb634f9 100644
--- a/examples/openllama-3b/lora.yml
+++ b/examples/openllama-3b/lora.yml
@@ -54,8 +54,8 @@ flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
diff --git a/examples/openllama-3b/qlora.yml b/examples/openllama-3b/qlora.yml
index 891dd48df..3d6218b30 100644
--- a/examples/openllama-3b/qlora.yml
+++ b/examples/openllama-3b/qlora.yml
@@ -48,8 +48,8 @@ flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
diff --git a/examples/phi/phi-ft.yml b/examples/phi/phi-ft.yml
index cfacc49cc..eaebd21ef 100644
--- a/examples/phi/phi-ft.yml
+++ b/examples/phi/phi-ft.yml
@@ -59,8 +59,8 @@ xformers_attention:
 flash_attention:

 warmup_steps: 100
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
diff --git a/examples/phi/phi-qlora.yml b/examples/phi/phi-qlora.yml
index 780a2a116..691a83509 100644
--- a/examples/phi/phi-qlora.yml
+++ b/examples/phi/phi-qlora.yml
@@ -59,8 +59,8 @@ xformers_attention:
 flash_attention:

 warmup_steps: 100
-eval_steps: 0.05
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.1
diff --git a/examples/pythia/lora.yml b/examples/pythia/lora.yml
index 6681f627f..10c76c973 100644
--- a/examples/pythia/lora.yml
+++ b/examples/pythia/lora.yml
@@ -33,5 +33,5 @@ early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 weight_decay: 0.1
-eval_steps: 0.05
+evals_per_epoch: 4
 logging_steps: 1
diff --git a/examples/qwen/lora.yml b/examples/qwen/lora.yml
index 87db872a5..0ad9fc0f1 100644
--- a/examples/qwen/lora.yml
+++ b/examples/qwen/lora.yml
@@ -56,10 +56,10 @@ xformers_attention:
 flash_attention:

 warmup_steps: 10
-eval_steps: 0.05
+evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/qwen/qlora.yml b/examples/qwen/qlora.yml
index d3b45c940..1ce0cbdc0 100644
--- a/examples/qwen/qlora.yml
+++ b/examples/qwen/qlora.yml
@@ -56,10 +56,10 @@ xformers_attention:
 flash_attention:

 warmup_steps: 10
-eval_steps: 0.05
+evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
-save_steps:
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/examples/redpajama/config-3b.yml b/examples/redpajama/config-3b.yml
index 8895074ba..a369b6cef 100644
--- a/examples/redpajama/config-3b.yml
+++ b/examples/redpajama/config-3b.yml
@@ -45,8 +45,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-eval_steps: 110
-save_steps: 660
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0001
diff --git a/examples/replit-3b/config-lora.yml b/examples/replit-3b/config-lora.yml
index 82715eae5..01314acc1 100644
--- a/examples/replit-3b/config-lora.yml
+++ b/examples/replit-3b/config-lora.yml
@@ -45,8 +45,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
-eval_steps: 50
-save_steps:
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0
diff --git a/examples/xgen-7b/xgen-7b-8k-qlora.yml b/examples/xgen-7b/xgen-7b-8k-qlora.yml
index 26230c408..48924e5f7 100644
--- a/examples/xgen-7b/xgen-7b-8k-qlora.yml
+++ b/examples/xgen-7b/xgen-7b-8k-qlora.yml
@@ -78,8 +78,8 @@ flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
-eval_steps: 50
-save_steps: 50
+evals_per_epoch: 4
+saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
diff --git a/src/axolotl/utils/config.py b/src/axolotl/utils/config.py
index 74da66928..b04c207dd 100644
--- a/src/axolotl/utils/config.py
+++ b/src/axolotl/utils/config.py
@@ -77,6 +77,15 @@ def normalize_config(cfg):
     else:
         cfg.torch_dtype = torch.float32

+    if cfg.saves_per_epoch:
+        save_steps = 1.0 / (cfg.saves_per_epoch * cfg.num_epochs)
+        if save_steps < 1.0:  # prevent saves on every step
+            cfg.save_steps = save_steps
+    if cfg.evals_per_epoch:
+        eval_steps = 1.0 / (cfg.evals_per_epoch * cfg.num_epochs)
+        if eval_steps < 1.0:  # prevent evals on every step
+            cfg.eval_steps = eval_steps
+
     cfg.dataset_processes = cfg.dataset_processes or os.cpu_count()

     if not cfg.base_model_config:
@@ -352,6 +361,27 @@ def validate_config(cfg):
                 cfg.datasets[idx].type = cfg.datasets[idx].type.replace(
                     "sharegpt_simple", "sharegpt"
                 )
+
+    if cfg.saves_per_epoch and cfg.save_steps:
+        raise ValueError(
+            "save_steps and saves_per_epoch are mutually exclusive and cannot be used together."
+        )
+    if cfg.saves_per_epoch and cfg.save_strategy and cfg.save_strategy != "steps":
+        raise ValueError(
+            "save_strategy must be empty or set to `steps` when used with saves_per_epoch."
+        )
+    if cfg.evals_per_epoch and cfg.eval_steps:
+        raise ValueError(
+            "eval_steps and evals_per_epoch are mutually exclusive and cannot be used together."
+        )
+    if (
+        cfg.evals_per_epoch
+        and cfg.evaluation_strategy
+        and cfg.evaluation_strategy != "steps"
+    ):
+        raise ValueError(
+            "evaluation_strategy must be empty or set to `steps` when used with evals_per_epoch."
+        )
     if cfg.save_strategy and cfg.save_steps and cfg.save_strategy != "steps":
         raise ValueError(
             "save_strategy and save_steps mismatch. Please set save_strategy to 'steps' or remove save_steps."
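
Usage sketch (illustrative; not part of the patch above): the new per-epoch options replace hand-tuned step counts, and normalize_config converts them into the fractional step values the HF Trainer already accepts. Assuming a hypothetical config with num_epochs: 2:

    num_epochs: 2
    evals_per_epoch: 4   # normalize_config derives eval_steps = 1.0 / (4 * 2) = 0.125, i.e. an eval every 12.5% of total steps
    saves_per_epoch: 1   # normalize_config derives save_steps = 1.0 / (1 * 2) = 0.5
    eval_steps:          # leave unset; validate_config raises ValueError if combined with evals_per_epoch
    save_steps:          # leave unset; validate_config raises ValueError if combined with saves_per_epoch

Note the guard in normalize_config: the derived fraction is applied only while it stays below 1.0, since the Trainer interprets step values >= 1 literally (a value of exactly 1.0, e.g. saves_per_epoch: 1 with num_epochs: 1, would otherwise mean a save on every step).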