diff --git a/examples/archived/cerebras/btlm-ft.yml b/examples/archived/cerebras/btlm-ft.yml
index c9878779d..c3495d287 100644
--- a/examples/archived/cerebras/btlm-ft.yml
+++ b/examples/archived/cerebras/btlm-ft.yml
@@ -66,7 +66,7 @@
 flash_optimum:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 32
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 save_total_limit:
diff --git a/examples/archived/cerebras/qlora.yml b/examples/archived/cerebras/qlora.yml
index 55cc597f1..4598a8338 100644
--- a/examples/archived/cerebras/qlora.yml
+++ b/examples/archived/cerebras/qlora.yml
@@ -43,7 +43,7 @@
 xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
diff --git a/examples/archived/code-llama/13b/lora.yml b/examples/archived/code-llama/13b/lora.yml
index 98ef516ab..ace94b619 100644
--- a/examples/archived/code-llama/13b/lora.yml
+++ b/examples/archived/code-llama/13b/lora.yml
@@ -47,7 +47,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/archived/code-llama/13b/qlora.yml b/examples/archived/code-llama/13b/qlora.yml
index 2385368ac..f4ed17af5 100644
--- a/examples/archived/code-llama/13b/qlora.yml
+++ b/examples/archived/code-llama/13b/qlora.yml
@@ -48,7 +48,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/archived/code-llama/34b/lora.yml b/examples/archived/code-llama/34b/lora.yml
index fb44997ff..0a1d71467 100644
--- a/examples/archived/code-llama/34b/lora.yml
+++ b/examples/archived/code-llama/34b/lora.yml
@@ -47,7 +47,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/archived/code-llama/34b/qlora.yml b/examples/archived/code-llama/34b/qlora.yml
index 22f4cae3c..ec17bf200 100644
--- a/examples/archived/code-llama/34b/qlora.yml
+++ b/examples/archived/code-llama/34b/qlora.yml
@@ -48,7 +48,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/archived/code-llama/7b/lora.yml b/examples/archived/code-llama/7b/lora.yml
index 0632bdfb7..174c17d2c 100644
--- a/examples/archived/code-llama/7b/lora.yml
+++ b/examples/archived/code-llama/7b/lora.yml
@@ -47,7 +47,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/archived/code-llama/7b/qlora.yml b/examples/archived/code-llama/7b/qlora.yml
index 0bd462aab..08e67d8c2 100644
--- a/examples/archived/code-llama/7b/qlora.yml
+++ b/examples/archived/code-llama/7b/qlora.yml
@@ -48,7 +48,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/archived/dbrx/16bit-lora.yaml b/examples/archived/dbrx/16bit-lora.yaml
index 852654d49..05946dfe9 100644
--- a/examples/archived/dbrx/16bit-lora.yaml
+++ b/examples/archived/dbrx/16bit-lora.yaml
@@ -54,7 +54,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch:
 saves_per_epoch: 1
diff --git a/examples/archived/dbrx/8bit-lora.yaml b/examples/archived/dbrx/8bit-lora.yaml
index 0b9402194..f159bf7fa 100644
--- a/examples/archived/dbrx/8bit-lora.yaml
+++ b/examples/archived/dbrx/8bit-lora.yaml
@@ -57,7 +57,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch:
 saves_per_epoch: 1
diff --git a/examples/archived/dbrx/fft-ds-zero3.yaml b/examples/archived/dbrx/fft-ds-zero3.yaml
index e42c16673..13cd0d997 100644
--- a/examples/archived/dbrx/fft-ds-zero3.yaml
+++ b/examples/archived/dbrx/fft-ds-zero3.yaml
@@ -41,7 +41,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch:
 saves_per_epoch: 1
diff --git a/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml
index a9511e9e3..2202091d5 100644
--- a/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml
+++ b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml
@@ -51,7 +51,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/archived/falcon/config-7b-lora.yml b/examples/archived/falcon/config-7b-lora.yml
index 391d4dd94..f4fedbede 100644
--- a/examples/archived/falcon/config-7b-lora.yml
+++ b/examples/archived/falcon/config-7b-lora.yml
@@ -47,7 +47,7 @@
 xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 40
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/archived/falcon/config-7b-qlora.yml b/examples/archived/falcon/config-7b-qlora.yml
index a9af8574c..a44cc40a6 100644
--- a/examples/archived/falcon/config-7b-qlora.yml
+++ b/examples/archived/falcon/config-7b-qlora.yml
@@ -77,7 +77,7 @@
 xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.000001
diff --git a/examples/archived/falcon/config-7b.yml b/examples/archived/falcon/config-7b.yml
index 3cc553daa..5481fb236 100644
--- a/examples/archived/falcon/config-7b.yml
+++ b/examples/archived/falcon/config-7b.yml
@@ -44,7 +44,7 @@
 xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 40
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/archived/gptj/qlora.yml b/examples/archived/gptj/qlora.yml
index c3cf9f973..6348566c2 100644
--- a/examples/archived/gptj/qlora.yml
+++ b/examples/archived/gptj/qlora.yml
@@ -40,7 +40,7 @@
 xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
diff --git a/examples/archived/jeopardy-bot/config.yml b/examples/archived/jeopardy-bot/config.yml
index 3609bd97e..ab1d19784 100644
--- a/examples/archived/jeopardy-bot/config.yml
+++ b/examples/archived/jeopardy-bot/config.yml
@@ -41,7 +41,7 @@
 xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
diff --git a/examples/archived/mpt-7b/config.yml b/examples/archived/mpt-7b/config.yml
index e7485fad7..1fff51b6e 100644
--- a/examples/archived/mpt-7b/config.yml
+++ b/examples/archived/mpt-7b/config.yml
@@ -42,7 +42,7 @@
 logging_steps: 5
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0001
diff --git a/examples/archived/openllama-3b/config.yml b/examples/archived/openllama-3b/config.yml
index 17eeb73ae..63056ed6d 100644
--- a/examples/archived/openllama-3b/config.yml
+++ b/examples/archived/openllama-3b/config.yml
@@ -42,7 +42,7 @@
 logging_steps: 1
 flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
diff --git a/examples/archived/openllama-3b/lora.yml b/examples/archived/openllama-3b/lora.yml
index 073117f11..b70821ce2 100644
--- a/examples/archived/openllama-3b/lora.yml
+++ b/examples/archived/openllama-3b/lora.yml
@@ -50,7 +50,7 @@
 logging_steps: 1
 flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
diff --git a/examples/archived/openllama-3b/qlora.yml b/examples/archived/openllama-3b/qlora.yml
index b4fca2c07..a34f2964b 100644
--- a/examples/archived/openllama-3b/qlora.yml
+++ b/examples/archived/openllama-3b/qlora.yml
@@ -43,7 +43,7 @@
 logging_steps: 1
 flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
diff --git a/examples/archived/qwen/lora.yml b/examples/archived/qwen/lora.yml
index 9a2843236..29de25611 100644
--- a/examples/archived/qwen/lora.yml
+++ b/examples/archived/qwen/lora.yml
@@ -49,7 +49,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention:
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/archived/qwen/qlora.yml b/examples/archived/qwen/qlora.yml
index 5f85b44dd..d46669444 100644
--- a/examples/archived/qwen/qlora.yml
+++ b/examples/archived/qwen/qlora.yml
@@ -49,7 +49,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention:
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/archived/qwen/qwen2-moe-lora.yaml b/examples/archived/qwen/qwen2-moe-lora.yaml
index afce443a0..1d5e1b524 100644
--- a/examples/archived/qwen/qwen2-moe-lora.yaml
+++ b/examples/archived/qwen/qwen2-moe-lora.yaml
@@ -45,7 +45,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/archived/qwen/qwen2-moe-qlora.yaml b/examples/archived/qwen/qwen2-moe-qlora.yaml
index 92a6842cf..08731441b 100644
--- a/examples/archived/qwen/qwen2-moe-qlora.yaml
+++ b/examples/archived/qwen/qwen2-moe-qlora.yaml
@@ -48,7 +48,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/archived/redpajama/config-3b.yml b/examples/archived/redpajama/config-3b.yml
index 3e2999df9..c5b229c3d 100644
--- a/examples/archived/redpajama/config-3b.yml
+++ b/examples/archived/redpajama/config-3b.yml
@@ -43,7 +43,7 @@
 logging_steps: 5
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0001
diff --git a/examples/archived/replit-3b/config-lora.yml b/examples/archived/replit-3b/config-lora.yml
index 5a02ba10c..d8561762c 100644
--- a/examples/archived/replit-3b/config-lora.yml
+++ b/examples/archived/replit-3b/config-lora.yml
@@ -41,7 +41,7 @@
 logging_steps: 1
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0
diff --git a/examples/archived/stablelm-2/1.6b/fft.yml b/examples/archived/stablelm-2/1.6b/fft.yml
index 3ae08c9de..d608bc66f 100644
--- a/examples/archived/stablelm-2/1.6b/fft.yml
+++ b/examples/archived/stablelm-2/1.6b/fft.yml
@@ -50,7 +50,7 @@
 flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true
-warmup_steps: 100
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
diff --git a/examples/archived/stablelm-2/1.6b/lora.yml b/examples/archived/stablelm-2/1.6b/lora.yml
index e5aa81423..6d358bdd8 100644
--- a/examples/archived/stablelm-2/1.6b/lora.yml
+++ b/examples/archived/stablelm-2/1.6b/lora.yml
@@ -51,7 +51,7 @@
 flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/archived/starcoder2/qlora.yml b/examples/archived/starcoder2/qlora.yml
index 889d837e8..fecf98d23 100644
--- a/examples/archived/starcoder2/qlora.yml
+++ b/examples/archived/starcoder2/qlora.yml
@@ -48,7 +48,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 4
 eval_steps:
 saves_per_epoch: 4
diff --git a/examples/archived/tiny-llama/lora-mps.yml b/examples/archived/tiny-llama/lora-mps.yml
index aa3b7d851..125090a78 100644
--- a/examples/archived/tiny-llama/lora-mps.yml
+++ b/examples/archived/tiny-llama/lora-mps.yml
@@ -49,7 +49,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: false
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 0
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/archived/tiny-llama/lora.yml b/examples/archived/tiny-llama/lora.yml
index a92f4bd67..817481e18 100644
--- a/examples/archived/tiny-llama/lora.yml
+++ b/examples/archived/tiny-llama/lora.yml
@@ -47,7 +47,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/archived/tiny-llama/pretrain.yml b/examples/archived/tiny-llama/pretrain.yml
index 5b3706bcb..f15c6ce19 100644
--- a/examples/archived/tiny-llama/pretrain.yml
+++ b/examples/archived/tiny-llama/pretrain.yml
@@ -38,7 +38,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/archived/tiny-llama/qlora.yml b/examples/archived/tiny-llama/qlora.yml
index 4d422a5ee..d3ff59cb8 100644
--- a/examples/archived/tiny-llama/qlora.yml
+++ b/examples/archived/tiny-llama/qlora.yml
@@ -49,7 +49,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml b/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml
index 48066b130..fc09a1e7b 100644
--- a/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml
+++ b/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml
@@ -75,7 +75,7 @@
 xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/archived/yi-34B-chat/qlora.yml b/examples/archived/yi-34B-chat/qlora.yml
index a0a95d86f..ba8d12fc8 100644
--- a/examples/archived/yi-34B-chat/qlora.yml
+++ b/examples/archived/yi-34B-chat/qlora.yml
@@ -20,7 +20,7 @@
 special_tokens:
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-warmup_steps: 10
+warmup_ratio: 0.1
 # Iterations
 num_epochs: 1
diff --git a/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml b/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
index 6f0b505bd..fc9a75e3f 100644
--- a/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
+++ b/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
@@ -51,7 +51,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml b/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
index fefcfadea..b527edc6f 100644
--- a/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
+++ b/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
@@ -51,7 +51,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/deepseek-v2/fft-fsdp-16b.yaml b/examples/deepseek-v2/fft-fsdp-16b.yaml
index d23c789aa..6e936da16 100644
--- a/examples/deepseek-v2/fft-fsdp-16b.yaml
+++ b/examples/deepseek-v2/fft-fsdp-16b.yaml
@@ -37,7 +37,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 100
+warmup_ratio: 0.1
 evals_per_epoch: 2
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/deepseek-v2/qlora-fsdp-2_5.yaml b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
index 0536d1c10..aab5034a0 100644
--- a/examples/deepseek-v2/qlora-fsdp-2_5.yaml
+++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
@@ -61,7 +61,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 100
+warmup_ratio: 0.1
 evals_per_epoch: 2
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/glm4/qlora-32b.yaml b/examples/glm4/qlora-32b.yaml
index b3656e3ae..832abde05 100644
--- a/examples/glm4/qlora-32b.yaml
+++ b/examples/glm4/qlora-32b.yaml
@@ -55,7 +55,7 @@
 flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/jamba/qlora.yaml b/examples/jamba/qlora.yaml
index 494154886..538ed3a10 100644
--- a/examples/jamba/qlora.yaml
+++ b/examples/jamba/qlora.yaml
@@ -49,7 +49,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/jamba/qlora_deepspeed.yaml b/examples/jamba/qlora_deepspeed.yaml
index 64db8f2ff..b288635e7 100644
--- a/examples/jamba/qlora_deepspeed.yaml
+++ b/examples/jamba/qlora_deepspeed.yaml
@@ -48,7 +48,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch:
 saves_per_epoch: 1
diff --git a/examples/jamba/qlora_fsdp_large.yaml b/examples/jamba/qlora_fsdp_large.yaml
index 344f73e63..150e5e2ec 100644
--- a/examples/jamba/qlora_fsdp_large.yaml
+++ b/examples/jamba/qlora_fsdp_large.yaml
@@ -47,7 +47,7 @@
 gradient_checkpointing_kwargs:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-2/fft_optimized.yml b/examples/llama-2/fft_optimized.yml
index a23778b96..678806473 100644
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -48,7 +48,7 @@
 flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true
-warmup_steps: 100
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
diff --git a/examples/llama-2/gptq-lora.yml b/examples/llama-2/gptq-lora.yml
index 580fabdf8..de1caaa05 100644
--- a/examples/llama-2/gptq-lora.yml
+++ b/examples/llama-2/gptq-lora.yml
@@ -56,7 +56,7 @@
 logging_steps: 1
 flash_attention:
 sdp_attention:
 flash_optimum:
-warmup_steps: 100
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
diff --git a/examples/llama-2/lisa.yml b/examples/llama-2/lisa.yml
index 25adcad5d..7b92b72e1 100644
--- a/examples/llama-2/lisa.yml
+++ b/examples/llama-2/lisa.yml
@@ -52,7 +52,7 @@
 flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true
-warmup_steps: 100
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
diff --git a/examples/llama-2/loftq.yml b/examples/llama-2/loftq.yml
index 606bbc735..619e5bcce 100644
--- a/examples/llama-2/loftq.yml
+++ b/examples/llama-2/loftq.yml
@@ -47,7 +47,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-2/lora.yml b/examples/llama-2/lora.yml
index 0781e0d1b..0a677f11a 100644
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -47,7 +47,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-2/qlora-fsdp.yml b/examples/llama-2/qlora-fsdp.yml
index ceb3ce5d1..54f4b86b4 100644
--- a/examples/llama-2/qlora-fsdp.yml
+++ b/examples/llama-2/qlora-fsdp.yml
@@ -50,7 +50,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-2/qlora.yml b/examples/llama-2/qlora.yml
index 1515872e6..327d88c15 100644
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -48,7 +48,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-2/relora.yml b/examples/llama-2/relora.yml
index 6c9e83223..b0e905340 100644
--- a/examples/llama-2/relora.yml
+++ b/examples/llama-2/relora.yml
@@ -26,7 +26,7 @@
 lora_dropout: 0.05
 lora_target_linear: true
 relora_steps: 150
-relora_warmup_steps: 10
+relora_warmup_ratio: 0.1
 relora_cpu_offload: false
 wandb_project:
@@ -50,7 +50,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-3/3b-qat-fsdp2.yaml b/examples/llama-3/3b-qat-fsdp2.yaml
index d9b96fb96..35e3461e2 100644
--- a/examples/llama-3/3b-qat-fsdp2.yaml
+++ b/examples/llama-3/3b-qat-fsdp2.yaml
@@ -58,7 +58,7 @@
 logging_steps: 1
 evals_per_epoch: 1
 saves_per_epoch: 1
-warmup_steps: 10
+warmup_ratio: 0.1
 weight_decay: 0.0
 fsdp:
   - full_shard
diff --git a/examples/llama-3/fft-8b-liger-fsdp.yaml b/examples/llama-3/fft-8b-liger-fsdp.yaml
index b3d990a8b..a655b97a9 100644
--- a/examples/llama-3/fft-8b-liger-fsdp.yaml
+++ b/examples/llama-3/fft-8b-liger-fsdp.yaml
@@ -51,7 +51,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 100
+warmup_ratio: 0.1
 evals_per_epoch: 2
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-3/fft-8b.yaml b/examples/llama-3/fft-8b.yaml
index e067212b7..c72ec6662 100644
--- a/examples/llama-3/fft-8b.yaml
+++ b/examples/llama-3/fft-8b.yaml
@@ -36,7 +36,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 100
+warmup_ratio: 0.1
 evals_per_epoch: 2
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-3/instruct-dpo-lora-8b.yml b/examples/llama-3/instruct-dpo-lora-8b.yml
index 99de56ad3..cf823353b 100644
--- a/examples/llama-3/instruct-dpo-lora-8b.yml
+++ b/examples/llama-3/instruct-dpo-lora-8b.yml
@@ -67,7 +67,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-3/instruct-lora-8b.yml b/examples/llama-3/instruct-lora-8b.yml
index b8baa5b0a..69e17b9cf 100644
--- a/examples/llama-3/instruct-lora-8b.yml
+++ b/examples/llama-3/instruct-lora-8b.yml
@@ -58,7 +58,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-3/lora-1b-deduplicate-dpo.yml b/examples/llama-3/lora-1b-deduplicate-dpo.yml
index 288e8fd19..2897636f4 100644
--- a/examples/llama-3/lora-1b-deduplicate-dpo.yml
+++ b/examples/llama-3/lora-1b-deduplicate-dpo.yml
@@ -79,7 +79,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-3/lora-1b-deduplicate-sft.yml b/examples/llama-3/lora-1b-deduplicate-sft.yml
index 6ce504a0d..c5190d892 100644
--- a/examples/llama-3/lora-1b-deduplicate-sft.yml
+++ b/examples/llama-3/lora-1b-deduplicate-sft.yml
@@ -55,7 +55,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-3/lora-1b-kernels.yml b/examples/llama-3/lora-1b-kernels.yml
index 71e569ae0..0bcf46b17 100644
--- a/examples/llama-3/lora-1b-kernels.yml
+++ b/examples/llama-3/lora-1b-kernels.yml
@@ -59,7 +59,7 @@
 flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-3/lora-1b-ray.yml b/examples/llama-3/lora-1b-ray.yml
index 7b9d15741..46c83348e 100644
--- a/examples/llama-3/lora-1b-ray.yml
+++ b/examples/llama-3/lora-1b-ray.yml
@@ -53,7 +53,7 @@
 flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
diff --git a/examples/llama-3/lora-1b-sample-packing-sequentially.yml b/examples/llama-3/lora-1b-sample-packing-sequentially.yml
index 9f764e131..dba78597b 100644
--- a/examples/llama-3/lora-1b-sample-packing-sequentially.yml
+++ b/examples/llama-3/lora-1b-sample-packing-sequentially.yml
@@ -57,7 +57,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-3/lora-1b.yml b/examples/llama-3/lora-1b.yml
index 34d540eb7..2ae2f0056 100644
--- a/examples/llama-3/lora-1b.yml
+++ b/examples/llama-3/lora-1b.yml
@@ -54,7 +54,7 @@
 flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-3/lora-8b.yml b/examples/llama-3/lora-8b.yml
index ca6cd9e97..d72b6527d 100644
--- a/examples/llama-3/lora-8b.yml
+++ b/examples/llama-3/lora-8b.yml
@@ -51,7 +51,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-3/qlora-1b-kto.yaml b/examples/llama-3/qlora-1b-kto.yaml
index f156e23d3..a6a84e7b1 100644
--- a/examples/llama-3/qlora-1b-kto.yaml
+++ b/examples/llama-3/qlora-1b-kto.yaml
@@ -55,7 +55,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-3/qlora-1b.yml b/examples/llama-3/qlora-1b.yml
index 288b7dc6c..1e4f97438 100644
--- a/examples/llama-3/qlora-1b.yml
+++ b/examples/llama-3/qlora-1b.yml
@@ -56,7 +56,7 @@
 flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-3/qlora-fsdp-405b.yaml b/examples/llama-3/qlora-fsdp-405b.yaml
index 0f31b5bdc..8ddb84d65 100644
--- a/examples/llama-3/qlora-fsdp-405b.yaml
+++ b/examples/llama-3/qlora-fsdp-405b.yaml
@@ -41,7 +41,7 @@
 gradient_checkpointing_kwargs:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-3/qlora-fsdp-70b.yaml b/examples/llama-3/qlora-fsdp-70b.yaml
index 28387ba1b..c052bc19d 100644
--- a/examples/llama-3/qlora-fsdp-70b.yaml
+++ b/examples/llama-3/qlora-fsdp-70b.yaml
@@ -50,7 +50,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-3/qlora.yml b/examples/llama-3/qlora.yml
index ffb00dace..a8f47a0e2 100644
--- a/examples/llama-3/qlora.yml
+++ b/examples/llama-3/qlora.yml
@@ -48,7 +48,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-3/sparse-finetuning.yaml b/examples/llama-3/sparse-finetuning.yaml
index ecf5df955..348756b70 100644
--- a/examples/llama-3/sparse-finetuning.yaml
+++ b/examples/llama-3/sparse-finetuning.yaml
@@ -47,7 +47,7 @@
 logging_steps: 1
 xformers_attention:
 flash_attention: true
-warmup_steps: 100
+warmup_ratio: 0.1
 evals_per_epoch: 2
 eval_table_size:
 saves_per_epoch: 1
diff --git a/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml b/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml
index 5770ce947..b20f79758 100644
--- a/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml
+++ b/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml
@@ -66,7 +66,7 @@
 gradient_checkpointing: offload
 gradient_checkpointing_kwargs:
   use_reentrant: false
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml b/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml
index 7cd8032d2..40449009c 100644
--- a/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml
+++ b/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml
@@ -69,7 +69,7 @@
 tf32: true
 logging_steps: 1
 flash_attention: true
-warmup_steps: 100
+warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml b/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml
index 03acdc234..abdc51378 100644
--- a/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml
+++ b/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml
@@ -76,7 +76,7 @@
 gradient_checkpointing: offload
 gradient_checkpointing_kwargs:
   use_reentrant: false
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml b/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml
index d9255ea16..9975949bb 100644
--- a/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml
+++ b/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml
@@ -65,7 +65,7 @@
 tf32: true
 logging_steps: 1
 flash_attention: true
-warmup_steps: 100
+warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml b/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml
index 4cda4949e..02c04c691 100644
--- a/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml
+++ b/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml
@@ -64,7 +64,7 @@
 flex_attn_compile_kwargs:
   dynamic: false
   mode: max-autotune-no-cudagraphs
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/llama-4/scout-qlora-single-h100-flex.yaml b/examples/llama-4/scout-qlora-single-h100-flex.yaml
index 518cfa57c..33a691189 100644
--- a/examples/llama-4/scout-qlora-single-h100-flex.yaml
+++ b/examples/llama-4/scout-qlora-single-h100-flex.yaml
@@ -74,7 +74,7 @@
 gradient_checkpointing_kwargs:
   use_reentrant: false
 logging_steps: 1
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
diff --git a/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml b/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml
index 28da15084..ac7e05659 100644
--- a/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml
+++ b/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml
@@ -67,7 +67,7 @@
 flex_attn_compile_kwargs:
   dynamic: false
   mode: max-autotune-no-cudagraphs
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/mamba/config.yml b/examples/mamba/config.yml
index 2261bd215..e6b335804 100644
--- a/examples/mamba/config.yml
+++ b/examples/mamba/config.yml
@@ -41,7 +41,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention:
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/mistral/config.yml b/examples/mistral/config.yml
index 455c3c224..e74162537 100644
--- a/examples/mistral/config.yml
+++ b/examples/mistral/config.yml
@@ -38,7 +38,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/mistral/lora-mps.yml b/examples/mistral/lora-mps.yml
index c18d10aee..07ce191dc 100644
--- a/examples/mistral/lora-mps.yml
+++ b/examples/mistral/lora-mps.yml
@@ -59,7 +59,7 @@
 sdp_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/mistral/lora.yml b/examples/mistral/lora.yml
index 77a87a1da..757287f19 100644
--- a/examples/mistral/lora.yml
+++ b/examples/mistral/lora.yml
@@ -59,7 +59,7 @@
 flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/mistral/mistral-dpo-qlora.yml b/examples/mistral/mistral-dpo-qlora.yml
index 49f5e4ede..8fea14a0f 100644
--- a/examples/mistral/mistral-dpo-qlora.yml
+++ b/examples/mistral/mistral-dpo-qlora.yml
@@ -73,7 +73,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: false
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/mistral/mistral-qlora-fsdp.yml b/examples/mistral/mistral-qlora-fsdp.yml
index cec958c54..8e1f03d24 100644
--- a/examples/mistral/mistral-qlora-fsdp.yml
+++ b/examples/mistral/mistral-qlora-fsdp.yml
@@ -56,7 +56,7 @@
 flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
diff --git a/examples/mistral/mistral-qlora-orpo.yml b/examples/mistral/mistral-qlora-orpo.yml
index ea3e112b9..850d286f3 100644
--- a/examples/mistral/mistral-qlora-orpo.yml
+++ b/examples/mistral/mistral-qlora-orpo.yml
@@ -64,7 +64,7 @@
 flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/mistral/mixtral-8x22b-qlora-fsdp.yml b/examples/mistral/mixtral-8x22b-qlora-fsdp.yml
index 64ef9930c..dc7bd9c37 100644
--- a/examples/mistral/mixtral-8x22b-qlora-fsdp.yml
+++ b/examples/mistral/mixtral-8x22b-qlora-fsdp.yml
@@ -54,7 +54,7 @@
 flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
diff --git a/examples/mistral/mixtral-qlora-fsdp.yml b/examples/mistral/mixtral-qlora-fsdp.yml
index c8d0a2711..5151e1292 100644
--- a/examples/mistral/mixtral-qlora-fsdp.yml
+++ b/examples/mistral/mixtral-qlora-fsdp.yml
@@ -56,7 +56,7 @@
 flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
diff --git a/examples/mistral/mixtral.yml b/examples/mistral/mixtral.yml
index 933275484..d1981a699 100644
--- a/examples/mistral/mixtral.yml
+++ b/examples/mistral/mixtral.yml
@@ -74,7 +74,7 @@
 flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
diff --git a/examples/mistral/qlora.yml b/examples/mistral/qlora.yml
index a5e8b65fb..2a7495e95 100644
--- a/examples/mistral/qlora.yml
+++ b/examples/mistral/qlora.yml
@@ -59,7 +59,7 @@
 flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/orpheus/finetune.yml b/examples/orpheus/finetune.yml
index 9dcb8a43e..f4bc8054e 100644
--- a/examples/orpheus/finetune.yml
+++ b/examples/orpheus/finetune.yml
@@ -43,7 +43,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 5
 saves_per_epoch: 5
 weight_decay: 0.05
diff --git a/examples/phi/lora-3.5.yaml b/examples/phi/lora-3.5.yaml
index b7f902d63..a6fa15d98 100644
--- a/examples/phi/lora-3.5.yaml
+++ b/examples/phi/lora-3.5.yaml
@@ -59,7 +59,7 @@
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 4
 weight_decay: 0.0
diff --git a/examples/phi/phi-ft.yml b/examples/phi/phi-ft.yml
index 4adb62d3a..717a45929 100644
--- a/examples/phi/phi-ft.yml
+++ b/examples/phi/phi-ft.yml
@@ -50,7 +50,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 100
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
diff --git a/examples/phi/phi-qlora.yml b/examples/phi/phi-qlora.yml
index 11c08bfe6..0fe1abea5 100644
--- a/examples/phi/phi-qlora.yml
+++ b/examples/phi/phi-qlora.yml
@@ -53,7 +53,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 100
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
diff --git a/examples/phi/phi2-ft.yml b/examples/phi/phi2-ft.yml
index 102c7ba03..e470c0d24 100644
--- a/examples/phi/phi2-ft.yml
+++ b/examples/phi/phi2-ft.yml
@@ -50,7 +50,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 100
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
diff --git a/examples/phi/phi3-ft-fsdp.yml b/examples/phi/phi3-ft-fsdp.yml
index e8290ea1f..1793737b5 100644
--- a/examples/phi/phi3-ft-fsdp.yml
+++ b/examples/phi/phi3-ft-fsdp.yml
@@ -51,7 +51,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 100
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
diff --git a/examples/qwen2/dpo.yaml b/examples/qwen2/dpo.yaml
index 3b1f817e5..3e87766d6 100644
--- a/examples/qwen2/dpo.yaml
+++ b/examples/qwen2/dpo.yaml
@@ -50,7 +50,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/qwen2/qlora-fsdp.yaml b/examples/qwen2/qlora-fsdp.yaml
index ca435b2bb..337619b61 100644
--- a/examples/qwen2/qlora-fsdp.yaml
+++ b/examples/qwen2/qlora-fsdp.yaml
@@ -49,7 +49,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/qwen3/32b-qlora.yaml b/examples/qwen3/32b-qlora.yaml
index 87609c42f..f4a4f2816 100644
--- a/examples/qwen3/32b-qlora.yaml
+++ b/examples/qwen3/32b-qlora.yaml
@@ -62,7 +62,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
diff --git a/examples/qwen3/8b-qat-fsdp2.yml b/examples/qwen3/8b-qat-fsdp2.yml
index 395812a56..cfbe5a4b7 100644
--- a/examples/qwen3/8b-qat-fsdp2.yml
+++ b/examples/qwen3/8b-qat-fsdp2.yml
@@ -58,7 +58,7 @@
 logging_steps: 1
 evals_per_epoch: 1
 saves_per_epoch: 1
-warmup_steps: 10
+warmup_ratio: 0.1
 weight_decay: 0.0
 fsdp:
   - full_shard
diff --git a/examples/qwen3/qlora-fsdp.yaml b/examples/qwen3/qlora-fsdp.yaml
index 6af3cfbc6..e4d584dc7 100644
--- a/examples/qwen3/qlora-fsdp.yaml
+++ b/examples/qwen3/qlora-fsdp.yaml
@@ -48,7 +48,7 @@
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0