migrate example configs to canonical attn_implementation
This commit is contained in:
@@ -39,7 +39,7 @@ tf32: true
|
|||||||
gradient_checkpointing: false
|
gradient_checkpointing: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 2
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 2
|
||||||
|
|||||||
@@ -50,8 +50,7 @@ tf32: true
|
|||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
eager_attention:
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ activation_offloading: legacy
|
|||||||
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_steps: 100
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ activation_offloading: legacy
|
|||||||
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_steps: 100
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -59,8 +59,7 @@ gradient_checkpointing: false
|
|||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
|
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
sdp_attention:
|
|
||||||
flash_optimum:
|
flash_optimum:
|
||||||
|
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
|
|||||||
@@ -39,8 +39,7 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention: true
|
attn_implementation: xformers
|
||||||
flash_attention:
|
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -43,8 +43,7 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention: true
|
attn_implementation: xformers
|
||||||
flash_attention:
|
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
|
|||||||
@@ -73,8 +73,7 @@ early_stopping_patience: 3
|
|||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
auto_resume_from_checkpoints: true
|
auto_resume_from_checkpoints: true
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention: true
|
attn_implementation: xformers
|
||||||
flash_attention:
|
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
|
|||||||
@@ -40,8 +40,7 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention: true
|
attn_implementation: xformers
|
||||||
flash_attention:
|
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -36,8 +36,7 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention: true
|
attn_implementation: xformers
|
||||||
flash_attention:
|
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
|
|||||||
@@ -37,8 +37,7 @@ bf16: auto
|
|||||||
tf32: true
|
tf32: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 5
|
logging_steps: 5
|
||||||
xformers_attention: true
|
attn_implementation: xformers
|
||||||
flash_attention:
|
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
|
|||||||
@@ -39,7 +39,6 @@ bf16: auto
|
|||||||
tf32: true
|
tf32: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 5
|
logging_steps: 5
|
||||||
flash_attention:
|
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
|
|||||||
@@ -47,7 +47,6 @@ tf32: false
|
|||||||
gradient_checkpointing: false
|
gradient_checkpointing: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention:
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -47,7 +47,6 @@ tf32: false
|
|||||||
gradient_checkpointing: false
|
gradient_checkpointing: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention:
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -40,7 +40,6 @@ bf16: auto
|
|||||||
tf32: true
|
tf32: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 5
|
logging_steps: 5
|
||||||
flash_attention:
|
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
|
|||||||
@@ -38,7 +38,6 @@ tf32: true
|
|||||||
gradient_checkpointing:
|
gradient_checkpointing:
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention:
|
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
|
|||||||
@@ -44,7 +44,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
flash_attn_cross_entropy: false
|
flash_attn_cross_entropy: false
|
||||||
flash_attn_rms_norm: true
|
flash_attn_rms_norm: true
|
||||||
flash_attn_fuse_mlp: true
|
flash_attn_fuse_mlp: true
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
flash_attn_cross_entropy: false
|
flash_attn_cross_entropy: false
|
||||||
flash_attn_rms_norm: true
|
flash_attn_rms_norm: true
|
||||||
|
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -47,7 +47,6 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: false
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 0
|
evals_per_epoch: 0
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -71,8 +71,7 @@ early_stopping_patience: 3
|
|||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
auto_resume_from_checkpoints: true
|
auto_resume_from_checkpoints: true
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention: true
|
attn_implementation: xformers
|
||||||
flash_attention:
|
|
||||||
gptq_groupsize:
|
gptq_groupsize:
|
||||||
gptq_model_v1:
|
gptq_model_v1:
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ load_in_4bit: true
|
|||||||
sequence_len: 1024
|
sequence_len: 1024
|
||||||
bf16: auto
|
bf16: auto
|
||||||
tf32: false
|
tf32: false
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
special_tokens:
|
special_tokens:
|
||||||
bos_token: "<|startoftext|>"
|
bos_token: "<|startoftext|>"
|
||||||
eos_token: "<|endoftext|>"
|
eos_token: "<|endoftext|>"
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 2
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 2
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
scaling_softmax: true
|
scaling_softmax: true
|
||||||
|
|
||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ output_dir: ./outputs/ndp-out/
|
|||||||
|
|
||||||
sequence_len: 2048
|
sequence_len: 2048
|
||||||
sample_packing: true
|
sample_packing: true
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
gradient_accumulation_steps: 1
|
gradient_accumulation_steps: 1
|
||||||
micro_batch_size: 1
|
micro_batch_size: 1
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ output_dir: ./outputs/ndp-out/
|
|||||||
|
|
||||||
sequence_len: 8192
|
sequence_len: 8192
|
||||||
sample_packing: true
|
sample_packing: true
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
gradient_accumulation_steps: 1
|
gradient_accumulation_steps: 1
|
||||||
micro_batch_size: 1 # must be 1 when using context parallel
|
micro_batch_size: 1 # must be 1 when using context parallel
|
||||||
|
|||||||
@@ -65,8 +65,7 @@ early_stopping_patience:
|
|||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
local_rank:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
xformers_attention:
|
attn_implementation: flash_attention_2
|
||||||
flash_attention: true
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ lora_dropout: 0.05
|
|||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ lora_target_linear: true
|
|||||||
|
|
||||||
# --- Hardware ---
|
# --- Hardware ---
|
||||||
bf16: auto
|
bf16: auto
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|||||||
@@ -47,8 +47,7 @@ lora_dropout: 0.05
|
|||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
flash_attention: false # strided EBFT overrides to flex_attention (or eager fallback) at runtime
|
attn_implementation: flex_attention
|
||||||
flex_attention: true # fused flex_attention kernel compiles itself; don't set torch_compile: true
|
|
||||||
# (full-model compile conflicts with gradient checkpointing + flex_attention)
|
# (full-model compile conflicts with gradient checkpointing + flex_attention)
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
|
|||||||
@@ -46,7 +46,6 @@ lora_dropout: 0.05
|
|||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
flash_attention: false # strided EBFT overrides to flex_attention (or eager fallback) at runtime
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|||||||
@@ -48,7 +48,6 @@ lora_target_linear: true
|
|||||||
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
torch_dtype: bfloat16
|
torch_dtype: bfloat16
|
||||||
flash_attention: false
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
torch_compile: true
|
torch_compile: true
|
||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
|
|||||||
@@ -41,7 +41,6 @@ warmup_steps: 10
|
|||||||
weight_decay: 0.01
|
weight_decay: 0.01
|
||||||
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
flash_attention: false # strided EBFT uses flex_attention at runtime
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
|
|||||||
@@ -72,7 +72,7 @@ lora_dropout: 0.0
|
|||||||
lora_target_modules: ".*\\.layers\\.(3|7|11|15|19|23|27|31)\\.self_attn\\.(q|k|v|o)_proj|.*\\.mlp\\.(gate|up|down)_proj"
|
lora_target_modules: ".*\\.layers\\.(3|7|11|15|19|23|27|31)\\.self_attn\\.(q|k|v|o)_proj|.*\\.mlp\\.(gate|up|down)_proj"
|
||||||
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|||||||
@@ -63,7 +63,7 @@ lora_dropout: 0.0
|
|||||||
lora_target_modules: ".*\\.layers\\.(3|7|11|15|19|23|27|31)\\.self_attn\\.(q|k|v|o)_proj|.*\\.mlp\\.(gate|up|down)_proj"
|
lora_target_modules: ".*\\.layers\\.(3|7|11|15|19|23|27|31)\\.self_attn\\.(q|k|v|o)_proj|.*\\.mlp\\.(gate|up|down)_proj"
|
||||||
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|||||||
@@ -68,7 +68,7 @@ lora_dropout: 0.0
|
|||||||
lora_target_modules: ".*\\.layers\\.(3|7|11|15|19|23|27|31)\\.self_attn\\.(q|k|v|o)_proj|.*\\.mlp\\.(gate|up|down)_proj"
|
lora_target_modules: ".*\\.layers\\.(3|7|11|15|19|23|27|31)\\.self_attn\\.(q|k|v|o)_proj|.*\\.mlp\\.(gate|up|down)_proj"
|
||||||
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -58,8 +58,7 @@ gradient_checkpointing: true
|
|||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
eager_attention:
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -55,8 +55,7 @@ gradient_checkpointing: true
|
|||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
eager_attention:
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -84,7 +84,7 @@ activation_offloading: true
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
|
|
||||||
# FA2 not supported
|
# FA2 not supported
|
||||||
sdp_attention: true
|
attn_implementation: sdpa
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ activation_offloading: true
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
|
|
||||||
# FA not supported
|
# FA not supported
|
||||||
flex_attention: true
|
attn_implementation: flex_attention
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -60,7 +60,7 @@ activation_offloading: true
|
|||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
|
|
||||||
# FA not supported
|
# FA not supported
|
||||||
sdp_attention: true
|
attn_implementation: sdpa
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ gradient_checkpointing: true
|
|||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
sdp_attention: true
|
attn_implementation: sdpa
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ gradient_checkpointing: true
|
|||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
sdp_attention: true
|
attn_implementation: sdpa
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 0
|
evals_per_epoch: 0
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ tf32: false
|
|||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
sdp_attention: true
|
attn_implementation: sdpa
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 0
|
evals_per_epoch: 0
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ tf32: false
|
|||||||
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ tf32: false
|
|||||||
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ learning_rate: 2e-5
|
|||||||
bf16: true
|
bf16: true
|
||||||
tf32: true
|
tf32: true
|
||||||
|
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3
|
attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ learning_rate: 2e-5
|
|||||||
bf16: true
|
bf16: true
|
||||||
tf32: true
|
tf32: true
|
||||||
|
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3
|
attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
|
|||||||
@@ -44,7 +44,7 @@ learning_rate: 2e-5
|
|||||||
bf16: true
|
bf16: true
|
||||||
tf32: true
|
tf32: true
|
||||||
|
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3
|
attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ learning_rate: 2e-5
|
|||||||
bf16: true
|
bf16: true
|
||||||
tf32: true
|
tf32: true
|
||||||
|
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3
|
attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ learning_rate: 2e-4
|
|||||||
bf16: true
|
bf16: true
|
||||||
tf32: true
|
tf32: true
|
||||||
|
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3
|
attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ learning_rate: 2e-4
|
|||||||
bf16: true
|
bf16: true
|
||||||
tf32: true
|
tf32: true
|
||||||
|
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3
|
attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3
|
||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -50,8 +50,7 @@ tf32: true
|
|||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
eager_attention:
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ gradient_checkpointing_kwargs:
|
|||||||
use_reentrant: false
|
use_reentrant: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch:
|
evals_per_epoch:
|
||||||
|
|||||||
@@ -44,7 +44,7 @@ gradient_checkpointing: true
|
|||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: true
|
use_reentrant: true
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ early_stopping_patience:
|
|||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
local_rank:
|
local_rank:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
loss_watchdog_threshold: 5.0
|
loss_watchdog_threshold: 5.0
|
||||||
loss_watchdog_patience: 3
|
loss_watchdog_patience: 3
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ tf32: false
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
flash_attn_cross_entropy: false
|
flash_attn_cross_entropy: false
|
||||||
flash_attn_rms_norm: true
|
flash_attn_rms_norm: true
|
||||||
flash_attn_fuse_mlp: true
|
flash_attn_fuse_mlp: true
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user