use warmup_ratio as a better default than warmup steps since it's data dependent (#2897) [skip ci]
* use warmup_ratio as a better default than warmup steps since it's data dependent * replace remainder of warmup_steps
This commit is contained in:
@@ -66,7 +66,7 @@ flash_optimum:
|
|||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
|
|
||||||
warmup_steps: 32
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
save_total_limit:
|
save_total_limit:
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ xformers_attention: true
|
|||||||
flash_attention:
|
flash_attention:
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ xformers_attention: true
|
|||||||
flash_attention:
|
flash_attention:
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_steps: 40
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ xformers_attention: true
|
|||||||
flash_attention:
|
flash_attention:
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.000001
|
weight_decay: 0.000001
|
||||||
|
|||||||
@@ -44,7 +44,7 @@ xformers_attention: true
|
|||||||
flash_attention:
|
flash_attention:
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_steps: 40
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ xformers_attention: true
|
|||||||
flash_attention:
|
flash_attention:
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ xformers_attention: true
|
|||||||
flash_attention:
|
flash_attention:
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_steps: 20
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ logging_steps: 5
|
|||||||
flash_attention:
|
flash_attention:
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_steps: 20
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0001
|
weight_decay: 0.0001
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ logging_steps: 1
|
|||||||
flash_attention: true
|
flash_attention: true
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_steps: 20
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ logging_steps: 1
|
|||||||
flash_attention: true
|
flash_attention: true
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_steps: 20
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ logging_steps: 1
|
|||||||
flash_attention: true
|
flash_attention: true
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_steps: 20
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention:
|
flash_attention:
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention:
|
flash_attention:
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ logging_steps: 5
|
|||||||
flash_attention:
|
flash_attention:
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_steps: 20
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0001
|
weight_decay: 0.0001
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ logging_steps: 1
|
|||||||
flash_attention:
|
flash_attention:
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_steps: 20
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0
|
weight_decay: 0
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ flash_attn_rms_norm: true
|
|||||||
flash_attn_fuse_qkv: false
|
flash_attn_fuse_qkv: false
|
||||||
flash_attn_fuse_mlp: true
|
flash_attn_fuse_mlp: true
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ flash_attention: true
|
|||||||
flash_attn_cross_entropy: false
|
flash_attn_cross_entropy: false
|
||||||
flash_attn_rms_norm: true
|
flash_attn_rms_norm: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 20
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
eval_steps:
|
eval_steps:
|
||||||
saves_per_epoch: 4
|
saves_per_epoch: 4
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: false
|
flash_attention: false
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 0
|
evals_per_epoch: 0
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -75,7 +75,7 @@ xformers_attention: true
|
|||||||
flash_attention:
|
flash_attention:
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ special_tokens:
|
|||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
type: alpaca
|
type: alpaca
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
|
|
||||||
# Iterations
|
# Iterations
|
||||||
num_epochs: 1
|
num_epochs: 1
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 2
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 2
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ flash_attention: true
|
|||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ gradient_checkpointing_kwargs:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ flash_attn_rms_norm: true
|
|||||||
flash_attn_fuse_qkv: false
|
flash_attn_fuse_qkv: false
|
||||||
flash_attn_fuse_mlp: true
|
flash_attn_fuse_mlp: true
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ logging_steps: 1
|
|||||||
flash_attention:
|
flash_attention:
|
||||||
sdp_attention:
|
sdp_attention:
|
||||||
flash_optimum:
|
flash_optimum:
|
||||||
warmup_steps: 100
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ flash_attn_rms_norm: true
|
|||||||
flash_attn_fuse_qkv: false
|
flash_attn_fuse_qkv: false
|
||||||
flash_attn_fuse_mlp: true
|
flash_attn_fuse_mlp: true
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ lora_dropout: 0.05
|
|||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
|
|
||||||
relora_steps: 150
|
relora_steps: 150
|
||||||
relora_warmup_steps: 10
|
relora_warmup_ratio: 0.1
|
||||||
relora_cpu_offload: false
|
relora_cpu_offload: false
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
@@ -50,7 +50,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ logging_steps: 1
|
|||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
fsdp:
|
||||||
- full_shard
|
- full_shard
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 2
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 2
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ flash_attention: true
|
|||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ flash_attention: true
|
|||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ flash_attention: true
|
|||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 20
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ flash_attention: true
|
|||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ gradient_checkpointing_kwargs:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ logging_steps: 1
|
|||||||
xformers_attention:
|
xformers_attention:
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 2
|
||||||
eval_table_size:
|
eval_table_size:
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ gradient_checkpointing: offload
|
|||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
|
|
||||||
warmup_steps: 20
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -69,7 +69,7 @@ tf32: true
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ gradient_checkpointing: offload
|
|||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
|
|
||||||
warmup_steps: 20
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ tf32: true
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ flex_attn_compile_kwargs:
|
|||||||
dynamic: false
|
dynamic: false
|
||||||
mode: max-autotune-no-cudagraphs
|
mode: max-autotune-no-cudagraphs
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -74,7 +74,7 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
warmup_steps: 20
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ flex_attn_compile_kwargs:
|
|||||||
dynamic: false
|
dynamic: false
|
||||||
mode: max-autotune-no-cudagraphs
|
mode: max-autotune-no-cudagraphs
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention:
|
flash_attention:
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ sdp_attention: true
|
|||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ flash_attention: true
|
|||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -73,7 +73,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: false
|
flash_attention: false
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ flash_attention: true
|
|||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ flash_attention: true
|
|||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ flash_attention: true
|
|||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ flash_attention: true
|
|||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|
||||||
|
|||||||
@@ -74,7 +74,7 @@ flash_attention: true
|
|||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ flash_attention: true
|
|||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 20
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 5
|
evals_per_epoch: 5
|
||||||
saves_per_epoch: 5
|
saves_per_epoch: 5
|
||||||
weight_decay: 0.05
|
weight_decay: 0.05
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ gradient_checkpointing: true
|
|||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 4
|
saves_per_epoch: 4
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ logging_steps: 1
|
|||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
fsdp:
|
||||||
- full_shard
|
- full_shard
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ resume_from_checkpoint:
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flash_attention: true
|
||||||
|
|
||||||
warmup_steps: 10
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
Reference in New Issue
Block a user