migrate example configs to canonical attn_implementation

2026-04-23 22:15:07 +00:00
parent 2d64d009d8
commit 39226623d2
222 changed files with 209 additions and 243 deletions
--- a/examples/llama-3/3b-fp8-fsdp2.yaml
+++ b/examples/llama-3/3b-fp8-fsdp2.yaml
@@ -25,7 +25,7 @@ sample_packing: true
 pad_to_sequence_len: true
 sequence_len: 512

-flex_attention: true
+attn_implementation: flex_attention
 flex_attn_compile_kwargs:
  dynamic: false
  mode: max-autotune-no-cudagraphs
--- a/examples/llama-3/3b-qat-fsdp2.yaml
+++ b/examples/llama-3/3b-qat-fsdp2.yaml
@@ -26,7 +26,7 @@ dataset_prepared_path: ./outputs/qat_out/dataset_prepared

 sample_packing: false
 sequence_len: 8192
-flash_attention: true
+attn_implementation: flash_attention_2

 qat:
  activation_dtype: int8
--- a/examples/llama-3/3b-qat-mxfp4.yaml
+++ b/examples/llama-3/3b-qat-mxfp4.yaml
@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out/
 dataset_prepared_path: ./outputs/dataset_prepared

 sequence_len: 2048
-flash_attention: true
+attn_implementation: flash_attention_2

 qat:
  activation_dtype: mxfp4
--- a/examples/llama-3/3b-qat-nvfp4.yaml
+++ b/examples/llama-3/3b-qat-nvfp4.yaml
@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out/
 dataset_prepared_path: ./outputs/dataset_prepared

 sequence_len: 8192
-flash_attention: true
+attn_implementation: flash_attention_2

 qat:
  activation_dtype: nvfp4
--- a/examples/llama-3/diffusion/pretrain-1b.yaml
+++ b/examples/llama-3/diffusion/pretrain-1b.yaml
@@ -35,7 +35,7 @@ warmup_ratio: 0.1
 optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 3e-4
-sdp_attention: true
+attn_implementation: sdpa

 bf16: auto
 tf32: true
--- a/examples/llama-3/diffusion/sft-1b.yaml
+++ b/examples/llama-3/diffusion/sft-1b.yaml
@@ -41,7 +41,7 @@ tf32: true

 gradient_checkpointing: true
 resume_from_checkpoint:
-sdp_attention: true
+attn_implementation: sdpa

 logging_steps: 1
 save_strategy: best
--- a/examples/llama-3/fft-8b-liger-fsdp.yaml
+++ b/examples/llama-3/fft-8b-liger-fsdp.yaml
@@ -49,7 +49,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 2
--- a/examples/llama-3/fft-8b.yaml
+++ b/examples/llama-3/fft-8b.yaml
@@ -34,7 +34,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 2
--- a/examples/llama-3/instruct-dpo-lora-8b.yml
+++ b/examples/llama-3/instruct-dpo-lora-8b.yml
@@ -65,7 +65,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/llama-3/instruct-lora-8b.yml
+++ b/examples/llama-3/instruct-lora-8b.yml
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b-deduplicate-dpo.yml
+++ b/examples/llama-3/lora-1b-deduplicate-dpo.yml
@@ -77,7 +77,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b-deduplicate-sft.yml
+++ b/examples/llama-3/lora-1b-deduplicate-sft.yml
@@ -53,7 +53,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b-kernels.yml
+++ b/examples/llama-3/lora-1b-kernels.yml
@@ -54,7 +54,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/lora-1b-ray.yml
+++ b/examples/llama-3/lora-1b-ray.yml
@@ -48,7 +48,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/lora-1b-sample-packing-sequentially.yml
+++ b/examples/llama-3/lora-1b-sample-packing-sequentially.yml
@@ -55,7 +55,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b.yml
+++ b/examples/llama-3/lora-1b.yml
@@ -49,7 +49,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/lora-8b.yml
+++ b/examples/llama-3/lora-8b.yml
@@ -49,7 +49,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/llama-3/opentelemetry-qlora.yml
+++ b/examples/llama-3/opentelemetry-qlora.yml
@@ -39,7 +39,6 @@ tf32: false

 gradient_checkpointing: true
 logging_steps: 1
-flash_attention: false

 warmup_ratio: 0.1
 evals_per_epoch: 2
--- a/examples/llama-3/qlora-1b-gdpo.yaml
+++ b/examples/llama-3/qlora-1b-gdpo.yaml
@@ -56,7 +56,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false

-flash_attention: true
+attn_implementation: flash_attention_2
 logging_steps: 1
 save_steps: 50
 save_safetensors: true
--- a/examples/llama-3/qlora-1b-kto.yaml
+++ b/examples/llama-3/qlora-1b-kto.yaml
@@ -53,7 +53,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/llama-3/qlora-1b.yml
+++ b/examples/llama-3/qlora-1b.yml
@@ -51,7 +51,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/qlora-fsdp-405b.yaml
+++ b/examples/llama-3/qlora-fsdp-405b.yaml
@@ -38,7 +38,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: true
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/llama-3/qlora-fsdp-70b.yaml
+++ b/examples/llama-3/qlora-fsdp-70b.yaml
@@ -48,7 +48,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/llama-3/qlora.yml
+++ b/examples/llama-3/qlora.yml
@@ -46,7 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/llama-3/sparse-finetuning.yaml
+++ b/examples/llama-3/sparse-finetuning.yaml
@@ -44,8 +44,7 @@ gradient_checkpointing_kwargs:
 early_stopping_patience:
 resume_from_checkpoint:
 logging_steps: 1
-xformers_attention:
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 2