migrate example configs to canonical attn_implementation

This commit is contained in:
Wing Lian
2026-04-23 22:15:07 +00:00
parent 2d64d009d8
commit 39226623d2
222 changed files with 209 additions and 243 deletions

View File

@@ -25,7 +25,7 @@ sample_packing: true
pad_to_sequence_len: true
sequence_len: 512
-flex_attention: true
+attn_implementation: flex_attention
flex_attn_compile_kwargs:
dynamic: false
mode: max-autotune-no-cudagraphs

View File

@@ -26,7 +26,7 @@ dataset_prepared_path: ./outputs/qat_out/dataset_prepared
sample_packing: false
sequence_len: 8192
-flash_attention: true
+attn_implementation: flash_attention_2
qat:
activation_dtype: int8

View File

@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out/
dataset_prepared_path: ./outputs/dataset_prepared
sequence_len: 2048
-flash_attention: true
+attn_implementation: flash_attention_2
qat:
activation_dtype: mxfp4

View File

@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out/
dataset_prepared_path: ./outputs/dataset_prepared
sequence_len: 8192
-flash_attention: true
+attn_implementation: flash_attention_2
qat:
activation_dtype: nvfp4

View File

@@ -35,7 +35,7 @@ warmup_ratio: 0.1
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 3e-4
-sdp_attention: true
+attn_implementation: sdpa
bf16: auto
tf32: true

View File

@@ -41,7 +41,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
-sdp_attention: true
+attn_implementation: sdpa
logging_steps: 1
save_strategy: best

View File

@@ -49,7 +49,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 2

View File

@@ -34,7 +34,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 2

View File

@@ -65,7 +65,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -47,7 +47,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -77,7 +77,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -53,7 +53,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -54,7 +54,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -48,7 +48,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -55,7 +55,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -49,7 +49,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -49,7 +49,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -39,7 +39,6 @@ tf32: false
gradient_checkpointing: true
logging_steps: 1
-flash_attention: false
warmup_ratio: 0.1
evals_per_epoch: 2

View File

@@ -56,7 +56,7 @@ gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
-flash_attention: true
+attn_implementation: flash_attention_2
logging_steps: 1
save_steps: 50
save_safetensors: true

View File

@@ -53,7 +53,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -51,7 +51,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -38,7 +38,7 @@ gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: true
logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -48,7 +48,7 @@ gradient_checkpointing_kwargs:
use_reentrant: true
resume_from_checkpoint:
logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -46,7 +46,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -44,8 +44,7 @@ gradient_checkpointing_kwargs:
early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
-xformers_attention:
-flash_attention: true
+attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 2