use warmup_ratio as a better default than warmup steps since it's data dependent (#2897) [skip ci]

* use warmup_ratio as a better default than warmup steps since it's data dependent

* replace remainder of warmup_steps
This commit is contained in:
Wing Lian
2025-07-30 06:44:06 -04:00
committed by GitHub
parent 2eb7ff95af
commit 22810c97b7
99 changed files with 100 additions and 100 deletions

View File

@@ -66,7 +66,7 @@ gradient_checkpointing: offload
gradient_checkpointing_kwargs:
use_reentrant: false
warmup_steps: 20
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -69,7 +69,7 @@ tf32: true
logging_steps: 1
flash_attention: true
warmup_steps: 100
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -76,7 +76,7 @@ gradient_checkpointing: offload
gradient_checkpointing_kwargs:
use_reentrant: false
warmup_steps: 20
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -65,7 +65,7 @@ tf32: true
logging_steps: 1
flash_attention: true
warmup_steps: 100
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -64,7 +64,7 @@ flex_attn_compile_kwargs:
dynamic: false
mode: max-autotune-no-cudagraphs
warmup_steps: 10
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

View File

@@ -74,7 +74,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
logging_steps: 1
warmup_steps: 20
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

View File

@@ -67,7 +67,7 @@ flex_attn_compile_kwargs:
dynamic: false
mode: max-autotune-no-cudagraphs
warmup_steps: 10
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0