use warmup_ratio as a better default than warmup_steps, since a fixed step count is data dependent (#2897) [skip ci]

* use warmup_ratio as a better default than warmup_steps, since a fixed step count is data dependent

* replace remainder of warmup_steps
This commit is contained in:
Wing Lian
2025-07-30 06:44:06 -04:00
committed by GitHub
parent 2eb7ff95af
commit 22810c97b7
99 changed files with 100 additions and 100 deletions

View File

@@ -66,7 +66,7 @@ flash_optimum:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 32 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
save_total_limit: save_total_limit:

View File

@@ -43,7 +43,7 @@ xformers_attention: true
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -54,7 +54,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: evals_per_epoch:
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -57,7 +57,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: evals_per_epoch:
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -41,7 +41,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: evals_per_epoch:
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -51,7 +51,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ xformers_attention: true
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 40 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -77,7 +77,7 @@ xformers_attention: true
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.000001 weight_decay: 0.000001

View File

@@ -44,7 +44,7 @@ xformers_attention: true
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 40 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -40,7 +40,7 @@ xformers_attention: true
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -41,7 +41,7 @@ xformers_attention: true
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -42,7 +42,7 @@ logging_steps: 5
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0001 weight_decay: 0.0001

View File

@@ -42,7 +42,7 @@ logging_steps: 1
flash_attention: true flash_attention: true
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -50,7 +50,7 @@ logging_steps: 1
flash_attention: true flash_attention: true
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -43,7 +43,7 @@ logging_steps: 1
flash_attention: true flash_attention: true
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: flash_attention:
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: flash_attention:
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -45,7 +45,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -43,7 +43,7 @@ logging_steps: 5
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0001 weight_decay: 0.0001

View File

@@ -41,7 +41,7 @@ logging_steps: 1
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0 weight_decay: 0

View File

@@ -50,7 +50,7 @@ flash_attn_rms_norm: true
flash_attn_fuse_qkv: false flash_attn_fuse_qkv: false
flash_attn_fuse_mlp: true flash_attn_fuse_mlp: true
warmup_steps: 100 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -51,7 +51,7 @@ flash_attention: true
flash_attn_cross_entropy: false flash_attn_cross_entropy: false
flash_attn_rms_norm: true flash_attn_rms_norm: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 20 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
eval_steps: eval_steps:
saves_per_epoch: 4 saves_per_epoch: 4

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: false flash_attention: false
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 0 evals_per_epoch: 0
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -38,7 +38,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: evals_per_epoch:
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -75,7 +75,7 @@ xformers_attention: true
flash_attention: flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -20,7 +20,7 @@ special_tokens:
datasets: datasets:
- path: mhenrichsen/alpaca_2k_test - path: mhenrichsen/alpaca_2k_test
type: alpaca type: alpaca
warmup_steps: 10 warmup_ratio: 0.1
# Iterations # Iterations
num_epochs: 1 num_epochs: 1

View File

@@ -51,7 +51,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -51,7 +51,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -37,7 +37,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 100 warmup_ratio: 0.1
evals_per_epoch: 2 evals_per_epoch: 2
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -61,7 +61,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 100 warmup_ratio: 0.1
evals_per_epoch: 2 evals_per_epoch: 2
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -55,7 +55,7 @@ flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: evals_per_epoch:
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: evals_per_epoch:
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -47,7 +47,7 @@ gradient_checkpointing_kwargs:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ flash_attn_rms_norm: true
flash_attn_fuse_qkv: false flash_attn_fuse_qkv: false
flash_attn_fuse_mlp: true flash_attn_fuse_mlp: true
warmup_steps: 100 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -56,7 +56,7 @@ logging_steps: 1
flash_attention: flash_attention:
sdp_attention: sdp_attention:
flash_optimum: flash_optimum:
warmup_steps: 100 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -52,7 +52,7 @@ flash_attn_rms_norm: true
flash_attn_fuse_qkv: false flash_attn_fuse_qkv: false
flash_attn_fuse_mlp: true flash_attn_fuse_mlp: true
warmup_steps: 100 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -50,7 +50,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -26,7 +26,7 @@ lora_dropout: 0.05
lora_target_linear: true lora_target_linear: true
relora_steps: 150 relora_steps: 150
relora_warmup_steps: 10 relora_warmup_steps: 10
relora_cpu_offload: false relora_cpu_offload: false
wandb_project: wandb_project:
@@ -50,7 +50,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -58,7 +58,7 @@ logging_steps: 1
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
warmup_steps: 10 warmup_ratio: 0.1
weight_decay: 0.0 weight_decay: 0.0
fsdp: fsdp:
- full_shard - full_shard

View File

@@ -51,7 +51,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 100 warmup_ratio: 0.1
evals_per_epoch: 2 evals_per_epoch: 2
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -36,7 +36,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 100 warmup_ratio: 0.1
evals_per_epoch: 2 evals_per_epoch: 2
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -67,7 +67,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -58,7 +58,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -79,7 +79,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -55,7 +55,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -59,7 +59,7 @@ flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -53,7 +53,7 @@ flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -57,7 +57,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -54,7 +54,7 @@ flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -51,7 +51,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -55,7 +55,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 20 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -56,7 +56,7 @@ flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -41,7 +41,7 @@ gradient_checkpointing_kwargs:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -50,7 +50,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ logging_steps: 1
xformers_attention: xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 100 warmup_ratio: 0.1
evals_per_epoch: 2 evals_per_epoch: 2
eval_table_size: eval_table_size:
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -66,7 +66,7 @@ gradient_checkpointing: offload
gradient_checkpointing_kwargs: gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
warmup_steps: 20 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -69,7 +69,7 @@ tf32: true
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 100 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -76,7 +76,7 @@ gradient_checkpointing: offload
gradient_checkpointing_kwargs: gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
warmup_steps: 20 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -65,7 +65,7 @@ tf32: true
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 100 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -64,7 +64,7 @@ flex_attn_compile_kwargs:
dynamic: false dynamic: false
mode: max-autotune-no-cudagraphs mode: max-autotune-no-cudagraphs
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -74,7 +74,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false use_reentrant: false
logging_steps: 1 logging_steps: 1
warmup_steps: 20 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -67,7 +67,7 @@ flex_attn_compile_kwargs:
dynamic: false dynamic: false
mode: max-autotune-no-cudagraphs mode: max-autotune-no-cudagraphs
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -41,7 +41,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: flash_attention:
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -38,7 +38,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -59,7 +59,7 @@ sdp_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -59,7 +59,7 @@ flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -73,7 +73,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: false flash_attention: false
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -56,7 +56,7 @@ flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -64,7 +64,7 @@ flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -54,7 +54,7 @@ flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -56,7 +56,7 @@ flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -74,7 +74,7 @@ flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1

View File

@@ -59,7 +59,7 @@ flash_attention: true
loss_watchdog_threshold: 5.0 loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -43,7 +43,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 20 warmup_ratio: 0.1
evals_per_epoch: 5 evals_per_epoch: 5
saves_per_epoch: 5 saves_per_epoch: 5
weight_decay: 0.05 weight_decay: 0.05

View File

@@ -59,7 +59,7 @@ gradient_checkpointing: true
resume_from_checkpoint: resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 4 saves_per_epoch: 4
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -50,7 +50,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 100 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -53,7 +53,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 100 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -50,7 +50,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 100 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -51,7 +51,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 100 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -50,7 +50,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -62,7 +62,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -58,7 +58,7 @@ logging_steps: 1
evals_per_epoch: 1 evals_per_epoch: 1
saves_per_epoch: 1 saves_per_epoch: 1
warmup_steps: 10 warmup_ratio: 0.1
weight_decay: 0.0 weight_decay: 0.0
fsdp: fsdp:
- full_shard - full_shard

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
logging_steps: 1 logging_steps: 1
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_ratio: 0.1
evals_per_epoch: 4 evals_per_epoch: 4
saves_per_epoch: 1 saves_per_epoch: 1
weight_decay: 0.0 weight_decay: 0.0