From 39226623d262327ca62fa384f860922fd8797f84 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 23 Apr 2026 22:15:07 +0000 Subject: [PATCH] migrate example configs to canonical attn_implementation --- examples/LiquidAI/lfm2-350m-fft.yaml | 2 +- examples/LiquidAI/lfm2-8b-a1b-lora.yaml | 2 +- examples/LiquidAI/lfm2-vl-lora.yaml | 3 +-- examples/alst/llama3-8b-deepspeed-alst.yaml | 2 +- examples/alst/llama3-8b-fsdp2-alst.yaml | 2 +- examples/apertus/apertus-8b-qlora.yaml | 2 +- examples/arcee/afm-4.5b-qlora.yaml | 2 +- examples/archived/cerebras/btlm-ft.yml | 3 +-- examples/archived/cerebras/qlora.yml | 3 +-- examples/archived/code-llama/13b/lora.yml | 2 +- examples/archived/code-llama/13b/qlora.yml | 2 +- examples/archived/code-llama/34b/lora.yml | 2 +- examples/archived/code-llama/34b/qlora.yml | 2 +- examples/archived/code-llama/7b/lora.yml | 2 +- examples/archived/code-llama/7b/qlora.yml | 2 +- examples/archived/dbrx/16bit-lora.yaml | 2 +- examples/archived/dbrx/8bit-lora.yaml | 2 +- examples/archived/dbrx/fft-ds-zero3.yaml | 2 +- examples/archived/deepcoder/deepcoder-14B-preview-lora.yml | 2 +- examples/archived/falcon/config-7b-lora.yml | 3 +-- examples/archived/falcon/config-7b-qlora.yml | 3 +-- examples/archived/falcon/config-7b.yml | 3 +-- examples/archived/gemma/qlora.yml | 2 +- examples/archived/gptj/qlora.yml | 3 +-- examples/archived/jeopardy-bot/config.yml | 3 +-- examples/archived/mpt-7b/config.yml | 1 - examples/archived/openllama-3b/config.yml | 2 +- examples/archived/openllama-3b/lora.yml | 2 +- examples/archived/openllama-3b/qlora.yml | 2 +- examples/archived/qwen/lora.yml | 1 - examples/archived/qwen/qlora.yml | 1 - examples/archived/qwen/qwen2-moe-lora.yaml | 2 +- examples/archived/qwen/qwen2-moe-qlora.yaml | 2 +- examples/archived/redpajama/config-3b.yml | 1 - examples/archived/replit-3b/config-lora.yml | 1 - examples/archived/stablelm-2/1.6b/fft.yml | 2 +- examples/archived/stablelm-2/1.6b/lora.yml | 2 +- examples/archived/starcoder2/qlora.yml | 2 +- examples/archived/tiny-llama/lora-mps.yml | 1 - examples/archived/tiny-llama/lora.yml | 2 +- examples/archived/tiny-llama/pretrain.yml | 2 +- examples/archived/tiny-llama/qlora.yml | 2 +- examples/archived/xgen-7b/xgen-7b-8k-qlora.yml | 3 +-- examples/archived/yi-34B-chat/qlora.yml | 2 +- examples/cohere/command-r-7b-qlora.yml | 2 +- examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml | 2 +- examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml | 2 +- examples/deepseek-v2/fft-fsdp-16b.yaml | 2 +- examples/deepseek-v2/qlora-fsdp-2_5.yaml | 2 +- examples/devstral/devstral-small-qlora.yml | 2 +- examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml | 2 +- examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml | 2 +- examples/eaft/eaft-example.yml | 3 +-- examples/ebft/llama-1b-ebft-opencode-novllm.yaml | 2 +- examples/ebft/llama-1b-ebft-opencode.yaml | 2 +- examples/ebft/llama-1b-ebft-strided-structured.yaml | 3 +-- examples/ebft/llama-1b-ebft-strided.yaml | 1 - examples/ebft/llama-3b-ebft-strided-fft.yaml | 1 - examples/ebft/llama-8b-ebft-strided-fft.yaml | 1 - examples/ebft/qwen35-4b-ebft-structured-async.yaml | 2 +- examples/ebft/qwen35-4b-ebft-structured.yaml | 2 +- examples/ebft/qwen35-9b-ebft-structured.yaml | 2 +- examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml | 2 +- examples/falcon-h1/falcon-h1-1b-qlora.yaml | 2 +- examples/falcon-h1/falcon-h1-34b-qlora.yaml | 2 +- examples/falcon-h1/falcon-h1-3b-qlora.yaml | 2 +- examples/falcon-h1/falcon-h1-500m-qlora.yaml | 2 +- examples/falcon-h1/falcon-h1-7b-qlora.yaml 
| 2 +- examples/gemma2/qlora.yml | 2 +- examples/gemma2/reward-model.yaml | 2 +- examples/gemma3/gemma-3-1b-qlora.yml | 2 +- examples/gemma3/gemma-3-270m-qlora.yml | 2 +- examples/gemma3/gemma-3-4b-qlora.yml | 3 +-- examples/gemma3/gemma-3-4b-vision-qlora.yml | 3 +-- examples/gemma4/26b-a4b-moe-qlora.yaml | 2 +- examples/gemma4/31b-qlora-flex.yaml | 2 +- examples/gemma4/31b-qlora.yaml | 2 +- examples/gemma4/e2b-vision-lora.yaml | 2 +- examples/glm4/qlora-32b.yaml | 2 +- examples/glm45/glm-45-air-qlora.yaml | 2 +- examples/glm46v/glm-4-6v-flash-ddp.yaml | 2 +- examples/glm46v/glm-4-6v-flash-qlora.yaml | 2 +- examples/glm47-flash/lora.yaml | 2 +- examples/glm47-flash/lora_fsdp.yaml | 2 +- examples/glm47-flash/qlora.yaml | 2 +- examples/glm47-flash/qlora_fsdp.yaml | 2 +- examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml | 2 +- examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml | 2 +- examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml | 2 +- examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml | 2 +- examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml | 2 +- examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml | 2 +- examples/granite4/granite-4.0-tiny-fft.yaml | 2 +- examples/hunyuan/hunyuan-v1-dense-qlora.yaml | 2 +- examples/internvl3_5/internvl3_5-8b-qlora.yml | 3 +-- examples/jamba/qlora.yaml | 2 +- examples/jamba/qlora_deepspeed.yaml | 2 +- examples/jamba/qlora_fsdp_large.yaml | 2 +- examples/kimi-linear/kimi-48b-lora.yaml | 2 +- examples/llama-2/fft_optimized.yml | 2 +- examples/llama-2/gptq-lora.yml | 2 -- examples/llama-2/lisa.yml | 2 +- examples/llama-2/loftq.yml | 2 +- examples/llama-2/lora.yml | 2 +- examples/llama-2/qlora-fsdp.yml | 2 +- examples/llama-2/qlora.yml | 2 +- examples/llama-2/relora.yml | 2 +- examples/llama-3-vision/lora-11b.yaml | 2 +- examples/llama-3/3b-fp8-fsdp2.yaml | 2 +- examples/llama-3/3b-qat-fsdp2.yaml | 2 +- examples/llama-3/3b-qat-mxfp4.yaml | 2 +- examples/llama-3/3b-qat-nvfp4.yaml | 2 +- examples/llama-3/diffusion/pretrain-1b.yaml | 2 +- examples/llama-3/diffusion/sft-1b.yaml | 2 +- examples/llama-3/fft-8b-liger-fsdp.yaml | 2 +- examples/llama-3/fft-8b.yaml | 2 +- examples/llama-3/instruct-dpo-lora-8b.yml | 2 +- examples/llama-3/instruct-lora-8b.yml | 2 +- examples/llama-3/lora-1b-deduplicate-dpo.yml | 2 +- examples/llama-3/lora-1b-deduplicate-sft.yml | 2 +- examples/llama-3/lora-1b-kernels.yml | 2 +- examples/llama-3/lora-1b-ray.yml | 2 +- examples/llama-3/lora-1b-sample-packing-sequentially.yml | 2 +- examples/llama-3/lora-1b.yml | 2 +- examples/llama-3/lora-8b.yml | 2 +- examples/llama-3/opentelemetry-qlora.yml | 1 - examples/llama-3/qlora-1b-gdpo.yaml | 2 +- examples/llama-3/qlora-1b-kto.yaml | 2 +- examples/llama-3/qlora-1b.yml | 2 +- examples/llama-3/qlora-fsdp-405b.yaml | 2 +- examples/llama-3/qlora-fsdp-70b.yaml | 2 +- examples/llama-3/qlora.yml | 2 +- examples/llama-3/sparse-finetuning.yaml | 3 +-- examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml | 2 +- examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml | 2 +- examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml | 2 +- examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml | 2 +- examples/llama-4/scout-qlora-flexattn-fsdp2.yaml | 2 +- examples/llama-4/scout-qlora-single-h100-flex.yaml | 2 +- examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml | 2 +- examples/llava/lora-7b.yaml | 3 +-- examples/magistral/magistral-small-fsdp-qlora.yaml | 2 +- examples/magistral/magistral-small-qlora.yaml | 2 +- examples/magistral/think/magistral-small-think-qlora.yaml | 2 +- 
examples/magistral/vision/magistral-small-vision-24B-qlora.yml | 2 +- examples/mamba/config.yml | 1 - examples/mimo/mimo-7b-qlora.yaml | 2 +- examples/ministral/ministral-small-qlora.yaml | 2 +- examples/ministral3/ministral3-3b-qlora.yaml | 2 +- examples/ministral3/think/ministral3-3b-think-qlora.yaml | 2 +- examples/ministral3/vision/ministral3-3b-vision-qlora.yml | 2 +- examples/mistral-small/mistral-small-3.1-24B-lora.yml | 2 +- examples/mistral/bigstral/bigstral-ds-zero3.yaml | 2 +- examples/mistral/config.yml | 2 +- examples/mistral/dpo/mistral-dpo-qlora.yml | 1 - examples/mistral/lora.yml | 2 +- examples/mistral/mistral-qlora-fsdp.yml | 2 +- examples/mistral/mixtral/mixtral-8x22b-qlora-fsdp.yml | 2 +- examples/mistral/mixtral/mixtral-qlora-fsdp.yml | 2 +- examples/mistral/mixtral/mixtral.yml | 2 +- examples/mistral/mixtral/mixtral_22.yml | 2 +- examples/mistral/mps/lora-mps.yml | 3 +-- examples/mistral/orpo/mistral-qlora-orpo.yml | 2 +- examples/mistral/qlora.yml | 2 +- examples/mistral4/fft-text.yml | 2 +- examples/mistral4/fft-vision.yml | 2 +- examples/mistral4/qlora-text.yml | 2 +- examples/mistral4/qlora-vision.yml | 2 +- examples/nemotron-h/120b-a12b-qlora.yaml | 2 +- examples/nemotron-h/nano-30b-a3b-qlora.yaml | 2 +- examples/nemotron/nemotron-mini-4b-qlora.yaml | 2 +- examples/olmo3/olmo3-7b-qlora.yaml | 2 +- examples/orpheus/finetune.yml | 2 +- examples/phi/phi-ft.yml | 2 +- examples/phi/phi-qlora.yml | 2 +- examples/phi/phi2-ft.yml | 2 +- examples/phi/phi3-ft-fsdp.yml | 2 +- examples/phi/phi3-ft.yml | 2 +- examples/pixtral/lora-12b.yml | 2 +- examples/plano/plano-4b-qlora.yaml | 2 +- examples/qat_nvfp4/Gemma3-12B_baseline.yml | 2 +- examples/qat_nvfp4/Gemma3-12B_qat.yml | 2 +- examples/qat_nvfp4/Math-Gemma3-12B_baseline.yml | 2 +- examples/qat_nvfp4/Math-Gemma3-12B_qat.yml | 2 +- examples/qat_nvfp4/Math-Gemma3-27B_baseline.yml | 2 +- examples/qat_nvfp4/Math-Gemma3-27B_qat.yml | 2 +- examples/qat_nvfp4/Math-Qwen2.5-72B_baseline.yml | 2 +- examples/qat_nvfp4/Math-Qwen2.5-72B_qat.yml | 2 +- examples/qat_nvfp4/Qwen2.5-72B_baseline.yml | 2 +- examples/qat_nvfp4/Qwen2.5-72B_qat.yml | 2 +- examples/qwen2-vl/lora-7b.yaml | 3 +-- examples/qwen2/adamw-pretrain-fsdp2.yaml | 2 +- examples/qwen2/dpo.yaml | 2 +- examples/qwen2/muon-pretrain-fsdp2.yaml | 2 +- examples/qwen2/prm.yaml | 2 +- examples/qwen2/qlora-fsdp.yaml | 2 +- examples/qwen2/reward-model.yaml | 2 +- examples/qwen2_5-vl/lora-7b.yaml | 3 +-- examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml | 2 +- examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml | 2 +- examples/qwen3.5/122b-a10b-moe-qlora.yaml | 2 +- examples/qwen3.5/27b-fft.yaml | 2 +- examples/qwen3.5/27b-qlora-fsdp.yaml | 2 +- examples/qwen3.5/27b-qlora.yaml | 2 +- examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml | 2 +- examples/qwen3.5/35b-a3b-moe-qlora.yaml | 2 +- examples/qwen3.5/35b-a3b-moe-vision-lora.yaml | 2 +- examples/qwen3.5/9b-fft-vision.yaml | 2 +- examples/qwen3.5/9b-lora-vision.yaml | 2 +- examples/qwen3/32b-qlora.yaml | 2 +- examples/qwen3/8b-qat-fsdp2.yml | 2 +- examples/qwen3/qlora-fsdp.yaml | 2 +- examples/seed-oss/seed-oss-36b-qlora.yaml | 2 +- examples/smolvlm2/smolvlm2-2B-lora.yaml | 3 +-- examples/streaming/pretrain.yaml | 2 +- examples/streaming/sft.yaml | 2 +- examples/swanlab/dpo-swanlab-completions.yml | 2 +- examples/swanlab/dpo-swanlab-full-featured.yml | 2 +- examples/swanlab/lora-swanlab-profiling.yml | 2 +- examples/trinity/trinity-nano-preview-qlora.yaml | 2 +- examples/voxtral/voxtral-mini-audio-qlora.yml | 2 +- 
examples/voxtral/voxtral-mini-qlora.yml | 2 +- 222 files changed, 209 insertions(+), 243 deletions(-) diff --git a/examples/LiquidAI/lfm2-350m-fft.yaml b/examples/LiquidAI/lfm2-350m-fft.yaml index 145b56dd1..cd5942206 100644 --- a/examples/LiquidAI/lfm2-350m-fft.yaml +++ b/examples/LiquidAI/lfm2-350m-fft.yaml @@ -39,7 +39,7 @@ tf32: true gradient_checkpointing: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 2 diff --git a/examples/LiquidAI/lfm2-8b-a1b-lora.yaml b/examples/LiquidAI/lfm2-8b-a1b-lora.yaml index 73cbfcce7..4932ea06e 100644 --- a/examples/LiquidAI/lfm2-8b-a1b-lora.yaml +++ b/examples/LiquidAI/lfm2-8b-a1b-lora.yaml @@ -48,7 +48,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 2 diff --git a/examples/LiquidAI/lfm2-vl-lora.yaml b/examples/LiquidAI/lfm2-vl-lora.yaml index 313da8274..9a125da5e 100644 --- a/examples/LiquidAI/lfm2-vl-lora.yaml +++ b/examples/LiquidAI/lfm2-vl-lora.yaml @@ -50,8 +50,7 @@ tf32: true gradient_checkpointing: true logging_steps: 1 -flash_attention: true -eager_attention: +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/alst/llama3-8b-deepspeed-alst.yaml b/examples/alst/llama3-8b-deepspeed-alst.yaml index dea23c5ee..e844c6823 100644 --- a/examples/alst/llama3-8b-deepspeed-alst.yaml +++ b/examples/alst/llama3-8b-deepspeed-alst.yaml @@ -39,7 +39,7 @@ activation_offloading: legacy resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_steps: 100 saves_per_epoch: 1 diff --git a/examples/alst/llama3-8b-fsdp2-alst.yaml b/examples/alst/llama3-8b-fsdp2-alst.yaml index c8a978264..a7da92637 100644 --- a/examples/alst/llama3-8b-fsdp2-alst.yaml +++ b/examples/alst/llama3-8b-fsdp2-alst.yaml @@ -39,7 +39,7 @@ activation_offloading: legacy resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_steps: 100 saves_per_epoch: 1 diff --git a/examples/apertus/apertus-8b-qlora.yaml b/examples/apertus/apertus-8b-qlora.yaml index 521b282da..f43901363 100644 --- a/examples/apertus/apertus-8b-qlora.yaml +++ b/examples/apertus/apertus-8b-qlora.yaml @@ -55,7 +55,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/arcee/afm-4.5b-qlora.yaml b/examples/arcee/afm-4.5b-qlora.yaml index 2cb42cacd..8e70847ad 100644 --- a/examples/arcee/afm-4.5b-qlora.yaml +++ b/examples/arcee/afm-4.5b-qlora.yaml @@ -55,7 +55,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/archived/cerebras/btlm-ft.yml b/examples/archived/cerebras/btlm-ft.yml index c3495d287..5a5f8dc12 100644 --- a/examples/archived/cerebras/btlm-ft.yml +++ b/examples/archived/cerebras/btlm-ft.yml @@ -59,8 +59,7 @@ gradient_checkpointing: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true -sdp_attention: +attn_implementation: flash_attention_2 flash_optimum: gptq_groupsize: diff --git a/examples/archived/cerebras/qlora.yml b/examples/archived/cerebras/qlora.yml index 4598a8338..22f52e682 100644 --- 
a/examples/archived/cerebras/qlora.yml +++ b/examples/archived/cerebras/qlora.yml @@ -39,8 +39,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -xformers_attention: true -flash_attention: +attn_implementation: xformers gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/code-llama/13b/lora.yml b/examples/archived/code-llama/13b/lora.yml index ace94b619..43f623357 100644 --- a/examples/archived/code-llama/13b/lora.yml +++ b/examples/archived/code-llama/13b/lora.yml @@ -45,7 +45,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/code-llama/13b/qlora.yml b/examples/archived/code-llama/13b/qlora.yml index f4ed17af5..086f5e3d8 100644 --- a/examples/archived/code-llama/13b/qlora.yml +++ b/examples/archived/code-llama/13b/qlora.yml @@ -46,7 +46,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/code-llama/34b/lora.yml b/examples/archived/code-llama/34b/lora.yml index 0a1d71467..19aa898be 100644 --- a/examples/archived/code-llama/34b/lora.yml +++ b/examples/archived/code-llama/34b/lora.yml @@ -45,7 +45,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/code-llama/34b/qlora.yml b/examples/archived/code-llama/34b/qlora.yml index ec17bf200..2ec78f0d8 100644 --- a/examples/archived/code-llama/34b/qlora.yml +++ b/examples/archived/code-llama/34b/qlora.yml @@ -46,7 +46,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/code-llama/7b/lora.yml b/examples/archived/code-llama/7b/lora.yml index 174c17d2c..30bc63355 100644 --- a/examples/archived/code-llama/7b/lora.yml +++ b/examples/archived/code-llama/7b/lora.yml @@ -45,7 +45,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/code-llama/7b/qlora.yml b/examples/archived/code-llama/7b/qlora.yml index 08e67d8c2..0c3b38519 100644 --- a/examples/archived/code-llama/7b/qlora.yml +++ b/examples/archived/code-llama/7b/qlora.yml @@ -46,7 +46,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/dbrx/16bit-lora.yaml b/examples/archived/dbrx/16bit-lora.yaml index 05946dfe9..eca58f94c 100644 --- a/examples/archived/dbrx/16bit-lora.yaml +++ b/examples/archived/dbrx/16bit-lora.yaml @@ -52,7 +52,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/archived/dbrx/8bit-lora.yaml b/examples/archived/dbrx/8bit-lora.yaml index f159bf7fa..59f5241b4 100644 --- a/examples/archived/dbrx/8bit-lora.yaml +++ b/examples/archived/dbrx/8bit-lora.yaml @@ -55,7 +55,7 @@ 
gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/archived/dbrx/fft-ds-zero3.yaml b/examples/archived/dbrx/fft-ds-zero3.yaml index 13cd0d997..2cb3e6da1 100644 --- a/examples/archived/dbrx/fft-ds-zero3.yaml +++ b/examples/archived/dbrx/fft-ds-zero3.yaml @@ -39,7 +39,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml index 3223ec19a..b125e9e3f 100644 --- a/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml +++ b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml @@ -45,7 +45,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/archived/falcon/config-7b-lora.yml b/examples/archived/falcon/config-7b-lora.yml index f4fedbede..71dd572b3 100644 --- a/examples/archived/falcon/config-7b-lora.yml +++ b/examples/archived/falcon/config-7b-lora.yml @@ -43,8 +43,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -xformers_attention: true -flash_attention: +attn_implementation: xformers gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/falcon/config-7b-qlora.yml b/examples/archived/falcon/config-7b-qlora.yml index a44cc40a6..edd6550a7 100644 --- a/examples/archived/falcon/config-7b-qlora.yml +++ b/examples/archived/falcon/config-7b-qlora.yml @@ -73,8 +73,7 @@ early_stopping_patience: 3 resume_from_checkpoint: auto_resume_from_checkpoints: true logging_steps: 1 -xformers_attention: true -flash_attention: +attn_implementation: xformers gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/falcon/config-7b.yml b/examples/archived/falcon/config-7b.yml index 5481fb236..6da39d7ab 100644 --- a/examples/archived/falcon/config-7b.yml +++ b/examples/archived/falcon/config-7b.yml @@ -40,8 +40,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -xformers_attention: true -flash_attention: +attn_implementation: xformers gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/gemma/qlora.yml b/examples/archived/gemma/qlora.yml index 80829b3c9..5b5ec4a9f 100644 --- a/examples/archived/gemma/qlora.yml +++ b/examples/archived/gemma/qlora.yml @@ -47,7 +47,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/gptj/qlora.yml b/examples/archived/gptj/qlora.yml index 6348566c2..7e10adeaa 100644 --- a/examples/archived/gptj/qlora.yml +++ b/examples/archived/gptj/qlora.yml @@ -36,8 +36,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -xformers_attention: true -flash_attention: +attn_implementation: xformers gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/jeopardy-bot/config.yml b/examples/archived/jeopardy-bot/config.yml index ab1d19784..90ca3b4bc 100644 --- a/examples/archived/jeopardy-bot/config.yml +++ b/examples/archived/jeopardy-bot/config.yml @@ 
-37,8 +37,7 @@ bf16: auto tf32: true resume_from_checkpoint: logging_steps: 5 -xformers_attention: true -flash_attention: +attn_implementation: xformers gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/mpt-7b/config.yml b/examples/archived/mpt-7b/config.yml index 1fff51b6e..588981bf7 100644 --- a/examples/archived/mpt-7b/config.yml +++ b/examples/archived/mpt-7b/config.yml @@ -39,7 +39,6 @@ bf16: auto tf32: true resume_from_checkpoint: logging_steps: 5 -flash_attention: gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/openllama-3b/config.yml b/examples/archived/openllama-3b/config.yml index 63056ed6d..14104ff4b 100644 --- a/examples/archived/openllama-3b/config.yml +++ b/examples/archived/openllama-3b/config.yml @@ -39,7 +39,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/openllama-3b/lora.yml b/examples/archived/openllama-3b/lora.yml index b70821ce2..30d3888f1 100644 --- a/examples/archived/openllama-3b/lora.yml +++ b/examples/archived/openllama-3b/lora.yml @@ -47,7 +47,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/openllama-3b/qlora.yml b/examples/archived/openllama-3b/qlora.yml index a34f2964b..fc9d1d703 100644 --- a/examples/archived/openllama-3b/qlora.yml +++ b/examples/archived/openllama-3b/qlora.yml @@ -40,7 +40,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/qwen/lora.yml b/examples/archived/qwen/lora.yml index 29de25611..362a848a8 100644 --- a/examples/archived/qwen/lora.yml +++ b/examples/archived/qwen/lora.yml @@ -47,7 +47,6 @@ tf32: false gradient_checkpointing: false resume_from_checkpoint: logging_steps: 1 -flash_attention: warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/qwen/qlora.yml b/examples/archived/qwen/qlora.yml index d46669444..bce3012e7 100644 --- a/examples/archived/qwen/qlora.yml +++ b/examples/archived/qwen/qlora.yml @@ -47,7 +47,6 @@ tf32: false gradient_checkpointing: false resume_from_checkpoint: logging_steps: 1 -flash_attention: warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/qwen/qwen2-moe-lora.yaml b/examples/archived/qwen/qwen2-moe-lora.yaml index 1d5e1b524..97c0d51a6 100644 --- a/examples/archived/qwen/qwen2-moe-lora.yaml +++ b/examples/archived/qwen/qwen2-moe-lora.yaml @@ -43,7 +43,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/qwen/qwen2-moe-qlora.yaml b/examples/archived/qwen/qwen2-moe-qlora.yaml index 08731441b..a16089eed 100644 --- a/examples/archived/qwen/qwen2-moe-qlora.yaml +++ b/examples/archived/qwen/qwen2-moe-qlora.yaml @@ -46,7 +46,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/redpajama/config-3b.yml b/examples/archived/redpajama/config-3b.yml 
index c5b229c3d..676f31476 100644 --- a/examples/archived/redpajama/config-3b.yml +++ b/examples/archived/redpajama/config-3b.yml @@ -40,7 +40,6 @@ bf16: auto tf32: true resume_from_checkpoint: logging_steps: 5 -flash_attention: gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/replit-3b/config-lora.yml b/examples/archived/replit-3b/config-lora.yml index d8561762c..b0a0c9089 100644 --- a/examples/archived/replit-3b/config-lora.yml +++ b/examples/archived/replit-3b/config-lora.yml @@ -38,7 +38,6 @@ tf32: true gradient_checkpointing: resume_from_checkpoint: logging_steps: 1 -flash_attention: gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/stablelm-2/1.6b/fft.yml b/examples/archived/stablelm-2/1.6b/fft.yml index 585888f43..05f59544c 100644 --- a/examples/archived/stablelm-2/1.6b/fft.yml +++ b/examples/archived/stablelm-2/1.6b/fft.yml @@ -44,7 +44,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 flash_attn_cross_entropy: false flash_attn_rms_norm: true flash_attn_fuse_mlp: true diff --git a/examples/archived/stablelm-2/1.6b/lora.yml b/examples/archived/stablelm-2/1.6b/lora.yml index 6d358bdd8..1edb56e0c 100644 --- a/examples/archived/stablelm-2/1.6b/lora.yml +++ b/examples/archived/stablelm-2/1.6b/lora.yml @@ -47,7 +47,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 flash_attn_cross_entropy: false flash_attn_rms_norm: true diff --git a/examples/archived/starcoder2/qlora.yml b/examples/archived/starcoder2/qlora.yml index fecf98d23..0fd0f453c 100644 --- a/examples/archived/starcoder2/qlora.yml +++ b/examples/archived/starcoder2/qlora.yml @@ -46,7 +46,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/tiny-llama/lora-mps.yml b/examples/archived/tiny-llama/lora-mps.yml index 125090a78..bf3292c35 100644 --- a/examples/archived/tiny-llama/lora-mps.yml +++ b/examples/archived/tiny-llama/lora-mps.yml @@ -47,7 +47,6 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: false warmup_ratio: 0.1 evals_per_epoch: 0 diff --git a/examples/archived/tiny-llama/lora.yml b/examples/archived/tiny-llama/lora.yml index 817481e18..a12d63746 100644 --- a/examples/archived/tiny-llama/lora.yml +++ b/examples/archived/tiny-llama/lora.yml @@ -45,7 +45,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/tiny-llama/pretrain.yml b/examples/archived/tiny-llama/pretrain.yml index f15c6ce19..4d1686138 100644 --- a/examples/archived/tiny-llama/pretrain.yml +++ b/examples/archived/tiny-llama/pretrain.yml @@ -36,7 +36,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/archived/tiny-llama/qlora.yml b/examples/archived/tiny-llama/qlora.yml index d3ff59cb8..b1adcb2e6 100644 --- a/examples/archived/tiny-llama/qlora.yml +++ b/examples/archived/tiny-llama/qlora.yml @@ -47,7 +47,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: 
logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml b/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml index fc09a1e7b..d548032b9 100644 --- a/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml +++ b/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml @@ -71,8 +71,7 @@ early_stopping_patience: 3 resume_from_checkpoint: auto_resume_from_checkpoints: true logging_steps: 1 -xformers_attention: true -flash_attention: +attn_implementation: xformers gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/yi-34B-chat/qlora.yml b/examples/archived/yi-34B-chat/qlora.yml index ba8d12fc8..5d3d54dc6 100644 --- a/examples/archived/yi-34B-chat/qlora.yml +++ b/examples/archived/yi-34B-chat/qlora.yml @@ -10,7 +10,7 @@ load_in_4bit: true sequence_len: 1024 bf16: auto tf32: false -flash_attention: true +attn_implementation: flash_attention_2 special_tokens: bos_token: "<|startoftext|>" eos_token: "<|endoftext|>" diff --git a/examples/cohere/command-r-7b-qlora.yml b/examples/cohere/command-r-7b-qlora.yml index b4741636b..c4d03b0ec 100644 --- a/examples/cohere/command-r-7b-qlora.yml +++ b/examples/cohere/command-r-7b-qlora.yml @@ -48,7 +48,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml b/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml index 97d1bb6b3..c36b0e74a 100644 --- a/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml +++ b/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml @@ -45,7 +45,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml b/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml index b80cc5bc0..2b2aafd75 100644 --- a/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml +++ b/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml @@ -45,7 +45,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/deepseek-v2/fft-fsdp-16b.yaml b/examples/deepseek-v2/fft-fsdp-16b.yaml index 6e936da16..2eac9aea3 100644 --- a/examples/deepseek-v2/fft-fsdp-16b.yaml +++ b/examples/deepseek-v2/fft-fsdp-16b.yaml @@ -35,7 +35,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 2 diff --git a/examples/deepseek-v2/qlora-fsdp-2_5.yaml b/examples/deepseek-v2/qlora-fsdp-2_5.yaml index aab5034a0..0e23a0266 100644 --- a/examples/deepseek-v2/qlora-fsdp-2_5.yaml +++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml @@ -59,7 +59,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 2 diff --git a/examples/devstral/devstral-small-qlora.yml b/examples/devstral/devstral-small-qlora.yml index ca8e8e043..4a359a2e6 100644 --- a/examples/devstral/devstral-small-qlora.yml +++ b/examples/devstral/devstral-small-qlora.yml @@ -51,7 +51,7 @@ tf32: false 
gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 scaling_softmax: true loss_watchdog_threshold: 5.0 diff --git a/examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml b/examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml index f10dc9bd2..a99a6bef8 100644 --- a/examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml +++ b/examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml @@ -29,7 +29,7 @@ output_dir: ./outputs/ndp-out/ sequence_len: 2048 sample_packing: true -flash_attention: true +attn_implementation: flash_attention_2 gradient_accumulation_steps: 1 micro_batch_size: 1 diff --git a/examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml b/examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml index 584a33f44..a12b524ed 100644 --- a/examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml +++ b/examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml @@ -26,7 +26,7 @@ output_dir: ./outputs/ndp-out/ sequence_len: 8192 sample_packing: true -flash_attention: true +attn_implementation: flash_attention_2 gradient_accumulation_steps: 1 micro_batch_size: 1 # must be 1 when using context parallel diff --git a/examples/eaft/eaft-example.yml b/examples/eaft/eaft-example.yml index fed4179d2..b4b13a14c 100644 --- a/examples/eaft/eaft-example.yml +++ b/examples/eaft/eaft-example.yml @@ -65,8 +65,7 @@ early_stopping_patience: resume_from_checkpoint: local_rank: logging_steps: 1 -xformers_attention: -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 weight_decay: 0.0 diff --git a/examples/ebft/llama-1b-ebft-opencode-novllm.yaml b/examples/ebft/llama-1b-ebft-opencode-novllm.yaml index 0891033f0..7d7edad33 100644 --- a/examples/ebft/llama-1b-ebft-opencode-novllm.yaml +++ b/examples/ebft/llama-1b-ebft-opencode-novllm.yaml @@ -46,7 +46,7 @@ lora_dropout: 0.05 lora_target_linear: true bf16: auto -flash_attention: true +attn_implementation: flash_attention_2 gradient_checkpointing: true special_tokens: diff --git a/examples/ebft/llama-1b-ebft-opencode.yaml b/examples/ebft/llama-1b-ebft-opencode.yaml index d0d1069d8..c77c36677 100644 --- a/examples/ebft/llama-1b-ebft-opencode.yaml +++ b/examples/ebft/llama-1b-ebft-opencode.yaml @@ -66,7 +66,7 @@ lora_target_linear: true # --- Hardware --- bf16: auto -flash_attention: true +attn_implementation: flash_attention_2 gradient_checkpointing: true special_tokens: diff --git a/examples/ebft/llama-1b-ebft-strided-structured.yaml b/examples/ebft/llama-1b-ebft-strided-structured.yaml index 8ba63b64b..02e89dea0 100644 --- a/examples/ebft/llama-1b-ebft-strided-structured.yaml +++ b/examples/ebft/llama-1b-ebft-strided-structured.yaml @@ -47,8 +47,7 @@ lora_dropout: 0.05 lora_target_linear: true bf16: auto -flash_attention: false # strided EBFT overrides to flex_attention (or eager fallback) at runtime -flex_attention: true # fused flex_attention kernel compiles itself; don't set torch_compile: true +attn_implementation: flex_attention # (full-model compile conflicts with gradient checkpointing + flex_attention) gradient_checkpointing: true gradient_checkpointing_kwargs: diff --git a/examples/ebft/llama-1b-ebft-strided.yaml b/examples/ebft/llama-1b-ebft-strided.yaml index c9519f160..e3cfe8040 100644 --- a/examples/ebft/llama-1b-ebft-strided.yaml +++ b/examples/ebft/llama-1b-ebft-strided.yaml @@ -46,7 +46,6 @@ lora_dropout: 0.05 lora_target_linear: true bf16: auto -flash_attention: false # strided EBFT overrides to flex_attention (or eager fallback) at runtime 
gradient_checkpointing: true special_tokens: diff --git a/examples/ebft/llama-3b-ebft-strided-fft.yaml b/examples/ebft/llama-3b-ebft-strided-fft.yaml index 5695efa40..e39d3bcfa 100644 --- a/examples/ebft/llama-3b-ebft-strided-fft.yaml +++ b/examples/ebft/llama-3b-ebft-strided-fft.yaml @@ -48,7 +48,6 @@ lora_target_linear: true bf16: auto torch_dtype: bfloat16 -flash_attention: false gradient_checkpointing: true torch_compile: true gradient_checkpointing_kwargs: diff --git a/examples/ebft/llama-8b-ebft-strided-fft.yaml b/examples/ebft/llama-8b-ebft-strided-fft.yaml index 8cf962849..caed98085 100644 --- a/examples/ebft/llama-8b-ebft-strided-fft.yaml +++ b/examples/ebft/llama-8b-ebft-strided-fft.yaml @@ -41,7 +41,6 @@ warmup_steps: 10 weight_decay: 0.01 bf16: auto -flash_attention: false # strided EBFT uses flex_attention at runtime gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false diff --git a/examples/ebft/qwen35-4b-ebft-structured-async.yaml b/examples/ebft/qwen35-4b-ebft-structured-async.yaml index 759a31730..daa77d6f6 100644 --- a/examples/ebft/qwen35-4b-ebft-structured-async.yaml +++ b/examples/ebft/qwen35-4b-ebft-structured-async.yaml @@ -72,7 +72,7 @@ lora_dropout: 0.0 lora_target_modules: ".*\\.layers\\.(3|7|11|15|19|23|27|31)\\.self_attn\\.(q|k|v|o)_proj|.*\\.mlp\\.(gate|up|down)_proj" bf16: auto -flash_attention: true +attn_implementation: flash_attention_2 gradient_checkpointing: true special_tokens: diff --git a/examples/ebft/qwen35-4b-ebft-structured.yaml b/examples/ebft/qwen35-4b-ebft-structured.yaml index 9108e87e9..d1b2a72f2 100644 --- a/examples/ebft/qwen35-4b-ebft-structured.yaml +++ b/examples/ebft/qwen35-4b-ebft-structured.yaml @@ -63,7 +63,7 @@ lora_dropout: 0.0 lora_target_modules: ".*\\.layers\\.(3|7|11|15|19|23|27|31)\\.self_attn\\.(q|k|v|o)_proj|.*\\.mlp\\.(gate|up|down)_proj" bf16: auto -flash_attention: true +attn_implementation: flash_attention_2 gradient_checkpointing: true special_tokens: diff --git a/examples/ebft/qwen35-9b-ebft-structured.yaml b/examples/ebft/qwen35-9b-ebft-structured.yaml index e79fb5fbf..ad3b8538e 100644 --- a/examples/ebft/qwen35-9b-ebft-structured.yaml +++ b/examples/ebft/qwen35-9b-ebft-structured.yaml @@ -68,7 +68,7 @@ lora_dropout: 0.0 lora_target_modules: ".*\\.layers\\.(3|7|11|15|19|23|27|31)\\.self_attn\\.(q|k|v|o)_proj|.*\\.mlp\\.(gate|up|down)_proj" bf16: auto -flash_attention: true +attn_implementation: flash_attention_2 gradient_checkpointing: true special_tokens: diff --git a/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml b/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml index 2473179f0..f59f0df5c 100644 --- a/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml +++ b/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml @@ -62,7 +62,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/falcon-h1/falcon-h1-1b-qlora.yaml b/examples/falcon-h1/falcon-h1-1b-qlora.yaml index bfb7836ef..8c3eb080d 100644 --- a/examples/falcon-h1/falcon-h1-1b-qlora.yaml +++ b/examples/falcon-h1/falcon-h1-1b-qlora.yaml @@ -61,7 +61,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/falcon-h1/falcon-h1-34b-qlora.yaml b/examples/falcon-h1/falcon-h1-34b-qlora.yaml index 80a9d45b5..28e7de956 100644 --- 
a/examples/falcon-h1/falcon-h1-34b-qlora.yaml +++ b/examples/falcon-h1/falcon-h1-34b-qlora.yaml @@ -62,7 +62,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/falcon-h1/falcon-h1-3b-qlora.yaml b/examples/falcon-h1/falcon-h1-3b-qlora.yaml index 02be8ac5d..71b38e2f7 100644 --- a/examples/falcon-h1/falcon-h1-3b-qlora.yaml +++ b/examples/falcon-h1/falcon-h1-3b-qlora.yaml @@ -62,7 +62,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/falcon-h1/falcon-h1-500m-qlora.yaml b/examples/falcon-h1/falcon-h1-500m-qlora.yaml index b112d5d85..91602ae71 100644 --- a/examples/falcon-h1/falcon-h1-500m-qlora.yaml +++ b/examples/falcon-h1/falcon-h1-500m-qlora.yaml @@ -62,7 +62,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/falcon-h1/falcon-h1-7b-qlora.yaml b/examples/falcon-h1/falcon-h1-7b-qlora.yaml index c5505873d..cc7e8f6cd 100644 --- a/examples/falcon-h1/falcon-h1-7b-qlora.yaml +++ b/examples/falcon-h1/falcon-h1-7b-qlora.yaml @@ -62,7 +62,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/gemma2/qlora.yml b/examples/gemma2/qlora.yml index 8a295a1f8..b2fca74da 100644 --- a/examples/gemma2/qlora.yml +++ b/examples/gemma2/qlora.yml @@ -53,7 +53,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/gemma2/reward-model.yaml b/examples/gemma2/reward-model.yaml index 67b1228b2..f48bff626 100644 --- a/examples/gemma2/reward-model.yaml +++ b/examples/gemma2/reward-model.yaml @@ -43,7 +43,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/gemma3/gemma-3-1b-qlora.yml b/examples/gemma3/gemma-3-1b-qlora.yml index 4bcbf09f4..95b99a0da 100644 --- a/examples/gemma3/gemma-3-1b-qlora.yml +++ b/examples/gemma3/gemma-3-1b-qlora.yml @@ -62,7 +62,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/gemma3/gemma-3-270m-qlora.yml b/examples/gemma3/gemma-3-270m-qlora.yml index 1f247ab05..800a88a1b 100644 --- a/examples/gemma3/gemma-3-270m-qlora.yml +++ b/examples/gemma3/gemma-3-270m-qlora.yml @@ -62,7 +62,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/gemma3/gemma-3-4b-qlora.yml b/examples/gemma3/gemma-3-4b-qlora.yml index 5d939da19..e7c43ddef 100644 --- a/examples/gemma3/gemma-3-4b-qlora.yml +++ b/examples/gemma3/gemma-3-4b-qlora.yml @@ -58,8 +58,7 @@ gradient_checkpointing: true gradient_checkpointing_kwargs: 
use_reentrant: false logging_steps: 1 -flash_attention: true -eager_attention: +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/gemma3/gemma-3-4b-vision-qlora.yml b/examples/gemma3/gemma-3-4b-vision-qlora.yml index a12e84bee..790d9543a 100644 --- a/examples/gemma3/gemma-3-4b-vision-qlora.yml +++ b/examples/gemma3/gemma-3-4b-vision-qlora.yml @@ -55,8 +55,7 @@ gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false logging_steps: 1 -flash_attention: true -eager_attention: +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/gemma4/26b-a4b-moe-qlora.yaml b/examples/gemma4/26b-a4b-moe-qlora.yaml index e7bdb6f46..cdc70ef4a 100644 --- a/examples/gemma4/26b-a4b-moe-qlora.yaml +++ b/examples/gemma4/26b-a4b-moe-qlora.yaml @@ -84,7 +84,7 @@ activation_offloading: true logging_steps: 1 # FA2 not supported -sdp_attention: true +attn_implementation: sdpa warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/gemma4/31b-qlora-flex.yaml b/examples/gemma4/31b-qlora-flex.yaml index 8456c9c13..87221c515 100644 --- a/examples/gemma4/31b-qlora-flex.yaml +++ b/examples/gemma4/31b-qlora-flex.yaml @@ -62,7 +62,7 @@ activation_offloading: true logging_steps: 1 # FA not supported -flex_attention: true +attn_implementation: flex_attention warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/gemma4/31b-qlora.yaml b/examples/gemma4/31b-qlora.yaml index 42086a43c..4a633436e 100644 --- a/examples/gemma4/31b-qlora.yaml +++ b/examples/gemma4/31b-qlora.yaml @@ -60,7 +60,7 @@ activation_offloading: true logging_steps: 1 # FA not supported -sdp_attention: true +attn_implementation: sdpa warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/gemma4/e2b-vision-lora.yaml b/examples/gemma4/e2b-vision-lora.yaml index c779aaea5..ae90bc1cb 100644 --- a/examples/gemma4/e2b-vision-lora.yaml +++ b/examples/gemma4/e2b-vision-lora.yaml @@ -50,7 +50,7 @@ gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false logging_steps: 1 -sdp_attention: true +attn_implementation: sdpa warmup_ratio: 0.1 weight_decay: 0.0 diff --git a/examples/glm4/qlora-32b.yaml b/examples/glm4/qlora-32b.yaml index 832abde05..151820924 100644 --- a/examples/glm4/qlora-32b.yaml +++ b/examples/glm4/qlora-32b.yaml @@ -50,7 +50,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 diff --git a/examples/glm45/glm-45-air-qlora.yaml b/examples/glm45/glm-45-air-qlora.yaml index accb8898f..5723d3c45 100644 --- a/examples/glm45/glm-45-air-qlora.yaml +++ b/examples/glm45/glm-45-air-qlora.yaml @@ -55,7 +55,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/glm46v/glm-4-6v-flash-ddp.yaml b/examples/glm46v/glm-4-6v-flash-ddp.yaml index c67ac5e28..274f041a3 100644 --- a/examples/glm46v/glm-4-6v-flash-ddp.yaml +++ b/examples/glm46v/glm-4-6v-flash-ddp.yaml @@ -45,7 +45,7 @@ gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false logging_steps: 1 -sdp_attention: true +attn_implementation: sdpa warmup_ratio: 0.1 evals_per_epoch: 0 diff --git a/examples/glm46v/glm-4-6v-flash-qlora.yaml b/examples/glm46v/glm-4-6v-flash-qlora.yaml index 287944ae8..9fe8d6e43 100644 --- 
a/examples/glm46v/glm-4-6v-flash-qlora.yaml +++ b/examples/glm46v/glm-4-6v-flash-qlora.yaml @@ -42,7 +42,7 @@ tf32: false gradient_checkpointing: true logging_steps: 1 -sdp_attention: true +attn_implementation: sdpa warmup_ratio: 0.1 evals_per_epoch: 0 diff --git a/examples/glm47-flash/lora.yaml b/examples/glm47-flash/lora.yaml index 2586babb7..5f3de36e9 100644 --- a/examples/glm47-flash/lora.yaml +++ b/examples/glm47-flash/lora.yaml @@ -58,7 +58,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/glm47-flash/lora_fsdp.yaml b/examples/glm47-flash/lora_fsdp.yaml index bee20bf02..cf1d2de55 100644 --- a/examples/glm47-flash/lora_fsdp.yaml +++ b/examples/glm47-flash/lora_fsdp.yaml @@ -57,7 +57,7 @@ tf32: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/glm47-flash/qlora.yaml b/examples/glm47-flash/qlora.yaml index 834c46af8..a05bf54d2 100644 --- a/examples/glm47-flash/qlora.yaml +++ b/examples/glm47-flash/qlora.yaml @@ -58,7 +58,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/glm47-flash/qlora_fsdp.yaml b/examples/glm47-flash/qlora_fsdp.yaml index 0bb87813f..9ad5a6212 100644 --- a/examples/glm47-flash/qlora_fsdp.yaml +++ b/examples/glm47-flash/qlora_fsdp.yaml @@ -57,7 +57,7 @@ tf32: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml b/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml index b7082f986..512784e50 100644 --- a/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml +++ b/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml @@ -47,7 +47,7 @@ learning_rate: 2e-5 bf16: true tf32: true -flash_attention: true +attn_implementation: flash_attention_2 attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true diff --git a/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml b/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml index b718ff2eb..e36cd5192 100644 --- a/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml +++ b/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml @@ -43,7 +43,7 @@ learning_rate: 2e-5 bf16: true tf32: true -flash_attention: true +attn_implementation: flash_attention_2 attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true diff --git a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml index af1c93bc0..cd85460d8 100644 --- a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml +++ b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml @@ -44,7 +44,7 @@ learning_rate: 2e-5 bf16: true tf32: true -flash_attention: true +attn_implementation: flash_attention_2 attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true diff --git a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml index 894ba99b8..2ebfd1a80 100644 --- 
a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml +++ b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml @@ -43,7 +43,7 @@ learning_rate: 2e-5 bf16: true tf32: true -flash_attention: true +attn_implementation: flash_attention_2 attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true diff --git a/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml b/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml index 7c4f97846..dd632e4a0 100644 --- a/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml +++ b/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml @@ -56,7 +56,7 @@ learning_rate: 2e-4 bf16: true tf32: true -flash_attention: true +attn_implementation: flash_attention_2 attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true diff --git a/examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml b/examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml index cbb9efc8e..d57f9501d 100644 --- a/examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml +++ b/examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml @@ -56,7 +56,7 @@ learning_rate: 2e-4 bf16: true tf32: true -flash_attention: true +attn_implementation: flash_attention_2 attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true diff --git a/examples/granite4/granite-4.0-tiny-fft.yaml b/examples/granite4/granite-4.0-tiny-fft.yaml index 7ff8207ae..fd7d2a312 100644 --- a/examples/granite4/granite-4.0-tiny-fft.yaml +++ b/examples/granite4/granite-4.0-tiny-fft.yaml @@ -36,7 +36,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/hunyuan/hunyuan-v1-dense-qlora.yaml b/examples/hunyuan/hunyuan-v1-dense-qlora.yaml index a94345a61..1ae6b000d 100644 --- a/examples/hunyuan/hunyuan-v1-dense-qlora.yaml +++ b/examples/hunyuan/hunyuan-v1-dense-qlora.yaml @@ -55,7 +55,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/internvl3_5/internvl3_5-8b-qlora.yml b/examples/internvl3_5/internvl3_5-8b-qlora.yml index 9a72d078a..2d924c6f1 100644 --- a/examples/internvl3_5/internvl3_5-8b-qlora.yml +++ b/examples/internvl3_5/internvl3_5-8b-qlora.yml @@ -50,8 +50,7 @@ tf32: true gradient_checkpointing: true logging_steps: 1 -flash_attention: true -eager_attention: +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/jamba/qlora.yaml b/examples/jamba/qlora.yaml index 538ed3a10..f625fb6f5 100644 --- a/examples/jamba/qlora.yaml +++ b/examples/jamba/qlora.yaml @@ -47,7 +47,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/jamba/qlora_deepspeed.yaml b/examples/jamba/qlora_deepspeed.yaml index b288635e7..8ec74f905 100644 --- a/examples/jamba/qlora_deepspeed.yaml +++ b/examples/jamba/qlora_deepspeed.yaml @@ -46,7 +46,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 
 evals_per_epoch:
diff --git a/examples/jamba/qlora_fsdp_large.yaml b/examples/jamba/qlora_fsdp_large.yaml
index 4db889fbc..76cc0ef18 100644
--- a/examples/jamba/qlora_fsdp_large.yaml
+++ b/examples/jamba/qlora_fsdp_large.yaml
@@ -44,7 +44,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: true
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/kimi-linear/kimi-48b-lora.yaml b/examples/kimi-linear/kimi-48b-lora.yaml
index 8e855dd72..befa29891 100644
--- a/examples/kimi-linear/kimi-48b-lora.yaml
+++ b/examples/kimi-linear/kimi-48b-lora.yaml
@@ -65,7 +65,7 @@ early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/llama-2/fft_optimized.yml b/examples/llama-2/fft_optimized.yml
index ea119348e..7af25dd17 100644
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -42,7 +42,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 flash_attn_fuse_mlp: true
diff --git a/examples/llama-2/gptq-lora.yml b/examples/llama-2/gptq-lora.yml
index de1caaa05..c4073b80a 100644
--- a/examples/llama-2/gptq-lora.yml
+++ b/examples/llama-2/gptq-lora.yml
@@ -53,8 +53,6 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention:
-sdp_attention:
 flash_optimum:
 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-2/lisa.yml b/examples/llama-2/lisa.yml
index d21c01a49..40ba6d0d0 100644
--- a/examples/llama-2/lisa.yml
+++ b/examples/llama-2/lisa.yml
@@ -46,7 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 flash_attn_fuse_mlp: true
diff --git a/examples/llama-2/loftq.yml b/examples/llama-2/loftq.yml
index 619e5bcce..f1562ec29 100644
--- a/examples/llama-2/loftq.yml
+++ b/examples/llama-2/loftq.yml
@@ -45,7 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-2/lora.yml b/examples/llama-2/lora.yml
index 0a677f11a..8c2242b71 100644
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -45,7 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-2/qlora-fsdp.yml b/examples/llama-2/qlora-fsdp.yml
index 1e7064de8..102eb7af7 100644
--- a/examples/llama-2/qlora-fsdp.yml
+++ b/examples/llama-2/qlora-fsdp.yml
@@ -48,7 +48,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-2/qlora.yml b/examples/llama-2/qlora.yml
index 327d88c15..87e710792 100644
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -46,7 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-2/relora.yml b/examples/llama-2/relora.yml
index fabdf0e0f..8e3df58bf 100644
--- a/examples/llama-2/relora.yml
+++ b/examples/llama-2/relora.yml
@@ -51,7 +51,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3-vision/lora-11b.yaml b/examples/llama-3-vision/lora-11b.yaml
index adbb61643..4e5eb4c4e 100644
--- a/examples/llama-3-vision/lora-11b.yaml
+++ b/examples/llama-3-vision/lora-11b.yaml
@@ -50,7 +50,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1
 # flash_attention: true # use for text-only mode
-sdp_attention: true
+attn_implementation: sdpa

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/llama-3/3b-fp8-fsdp2.yaml b/examples/llama-3/3b-fp8-fsdp2.yaml
index 57b308abd..cfc15870f 100644
--- a/examples/llama-3/3b-fp8-fsdp2.yaml
+++ b/examples/llama-3/3b-fp8-fsdp2.yaml
@@ -25,7 +25,7 @@ sample_packing: true
 pad_to_sequence_len: true
 sequence_len: 512

-flex_attention: true
+attn_implementation: flex_attention
 flex_attn_compile_kwargs:
   dynamic: false
   mode: max-autotune-no-cudagraphs
diff --git a/examples/llama-3/3b-qat-fsdp2.yaml b/examples/llama-3/3b-qat-fsdp2.yaml
index 0c5a87891..99c975351 100644
--- a/examples/llama-3/3b-qat-fsdp2.yaml
+++ b/examples/llama-3/3b-qat-fsdp2.yaml
@@ -26,7 +26,7 @@ dataset_prepared_path: ./outputs/qat_out/dataset_prepared
 sample_packing: false
 sequence_len: 8192

-flash_attention: true
+attn_implementation: flash_attention_2

 qat:
   activation_dtype: int8
diff --git a/examples/llama-3/3b-qat-mxfp4.yaml b/examples/llama-3/3b-qat-mxfp4.yaml
index 7ae941e9e..4e9f64685 100644
--- a/examples/llama-3/3b-qat-mxfp4.yaml
+++ b/examples/llama-3/3b-qat-mxfp4.yaml
@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out/
 dataset_prepared_path: ./outputs/dataset_prepared
 sequence_len: 2048

-flash_attention: true
+attn_implementation: flash_attention_2

 qat:
   activation_dtype: mxfp4
diff --git a/examples/llama-3/3b-qat-nvfp4.yaml b/examples/llama-3/3b-qat-nvfp4.yaml
index 1ec809bbe..77cf2b19b 100644
--- a/examples/llama-3/3b-qat-nvfp4.yaml
+++ b/examples/llama-3/3b-qat-nvfp4.yaml
@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out/
 dataset_prepared_path: ./outputs/dataset_prepared
 sequence_len: 8192

-flash_attention: true
+attn_implementation: flash_attention_2

 qat:
   activation_dtype: nvfp4
diff --git a/examples/llama-3/diffusion/pretrain-1b.yaml b/examples/llama-3/diffusion/pretrain-1b.yaml
index 8d05e4c60..1b488db7a 100644
--- a/examples/llama-3/diffusion/pretrain-1b.yaml
+++ b/examples/llama-3/diffusion/pretrain-1b.yaml
@@ -35,7 +35,7 @@ warmup_ratio: 0.1
 optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 3e-4
-sdp_attention: true
+attn_implementation: sdpa

 bf16: auto
 tf32: true
diff --git a/examples/llama-3/diffusion/sft-1b.yaml b/examples/llama-3/diffusion/sft-1b.yaml
index f3b29a809..b6de76af3 100644
--- a/examples/llama-3/diffusion/sft-1b.yaml
+++ b/examples/llama-3/diffusion/sft-1b.yaml
@@ -41,7 +41,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:

-sdp_attention: true
+attn_implementation: sdpa

 logging_steps: 1
 save_strategy: best
diff --git a/examples/llama-3/fft-8b-liger-fsdp.yaml b/examples/llama-3/fft-8b-liger-fsdp.yaml
index a655b97a9..b96bc920e 100644
--- a/examples/llama-3/fft-8b-liger-fsdp.yaml
+++ b/examples/llama-3/fft-8b-liger-fsdp.yaml
@@ -49,7 +49,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 2
diff --git a/examples/llama-3/fft-8b.yaml b/examples/llama-3/fft-8b.yaml
index c72ec6662..3e2809196 100644
--- a/examples/llama-3/fft-8b.yaml
+++ b/examples/llama-3/fft-8b.yaml
@@ -34,7 +34,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 2
diff --git a/examples/llama-3/instruct-dpo-lora-8b.yml b/examples/llama-3/instruct-dpo-lora-8b.yml
index cf823353b..b49ace2ed 100644
--- a/examples/llama-3/instruct-dpo-lora-8b.yml
+++ b/examples/llama-3/instruct-dpo-lora-8b.yml
@@ -65,7 +65,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/instruct-lora-8b.yml b/examples/llama-3/instruct-lora-8b.yml
index 401df1d72..1c61ce9e4 100644
--- a/examples/llama-3/instruct-lora-8b.yml
+++ b/examples/llama-3/instruct-lora-8b.yml
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/lora-1b-deduplicate-dpo.yml b/examples/llama-3/lora-1b-deduplicate-dpo.yml
index 2897636f4..2be72c4d0 100644
--- a/examples/llama-3/lora-1b-deduplicate-dpo.yml
+++ b/examples/llama-3/lora-1b-deduplicate-dpo.yml
@@ -77,7 +77,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/lora-1b-deduplicate-sft.yml b/examples/llama-3/lora-1b-deduplicate-sft.yml
index c5190d892..ad21cb266 100644
--- a/examples/llama-3/lora-1b-deduplicate-sft.yml
+++ b/examples/llama-3/lora-1b-deduplicate-sft.yml
@@ -53,7 +53,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/lora-1b-kernels.yml b/examples/llama-3/lora-1b-kernels.yml
index 0bcf46b17..b0914f87a 100644
--- a/examples/llama-3/lora-1b-kernels.yml
+++ b/examples/llama-3/lora-1b-kernels.yml
@@ -54,7 +54,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/llama-3/lora-1b-ray.yml b/examples/llama-3/lora-1b-ray.yml
index 46c83348e..a3aa1cf5e 100644
--- a/examples/llama-3/lora-1b-ray.yml
+++ b/examples/llama-3/lora-1b-ray.yml
@@ -48,7 +48,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/llama-3/lora-1b-sample-packing-sequentially.yml b/examples/llama-3/lora-1b-sample-packing-sequentially.yml
index dba78597b..f6c24bc74 100644
--- a/examples/llama-3/lora-1b-sample-packing-sequentially.yml
+++ b/examples/llama-3/lora-1b-sample-packing-sequentially.yml
@@ -55,7 +55,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/lora-1b.yml b/examples/llama-3/lora-1b.yml
index 2ae2f0056..d01c618bc 100644
--- a/examples/llama-3/lora-1b.yml
+++ b/examples/llama-3/lora-1b.yml
@@ -49,7 +49,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/llama-3/lora-8b.yml b/examples/llama-3/lora-8b.yml
index d72b6527d..90084ec95 100644
--- a/examples/llama-3/lora-8b.yml
+++ b/examples/llama-3/lora-8b.yml
@@ -49,7 +49,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/opentelemetry-qlora.yml b/examples/llama-3/opentelemetry-qlora.yml
index d8ce7b1ec..0c9995dae 100644
--- a/examples/llama-3/opentelemetry-qlora.yml
+++ b/examples/llama-3/opentelemetry-qlora.yml
@@ -39,7 +39,6 @@ tf32: false
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: false

 warmup_ratio: 0.1
 evals_per_epoch: 2
diff --git a/examples/llama-3/qlora-1b-gdpo.yaml b/examples/llama-3/qlora-1b-gdpo.yaml
index d806fcf26..f754a6887 100644
--- a/examples/llama-3/qlora-1b-gdpo.yaml
+++ b/examples/llama-3/qlora-1b-gdpo.yaml
@@ -56,7 +56,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false

-flash_attention: true
+attn_implementation: flash_attention_2
 logging_steps: 1
 save_steps: 50
 save_safetensors: true
diff --git a/examples/llama-3/qlora-1b-kto.yaml b/examples/llama-3/qlora-1b-kto.yaml
index a6a84e7b1..18c240d97 100644
--- a/examples/llama-3/qlora-1b-kto.yaml
+++ b/examples/llama-3/qlora-1b-kto.yaml
@@ -53,7 +53,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/qlora-1b.yml b/examples/llama-3/qlora-1b.yml
index 1e4f97438..d1e5e18ae 100644
--- a/examples/llama-3/qlora-1b.yml
+++ b/examples/llama-3/qlora-1b.yml
@@ -51,7 +51,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/llama-3/qlora-fsdp-405b.yaml b/examples/llama-3/qlora-fsdp-405b.yaml
index 5c236f2cf..b801af845 100644
--- a/examples/llama-3/qlora-fsdp-405b.yaml
+++ b/examples/llama-3/qlora-fsdp-405b.yaml
@@ -38,7 +38,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: true
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/qlora-fsdp-70b.yaml b/examples/llama-3/qlora-fsdp-70b.yaml
index c052bc19d..5ce774e18 100644
--- a/examples/llama-3/qlora-fsdp-70b.yaml
+++ b/examples/llama-3/qlora-fsdp-70b.yaml
@@ -48,7 +48,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/qlora.yml b/examples/llama-3/qlora.yml
index a8f47a0e2..fad507cd9 100644
--- a/examples/llama-3/qlora.yml
+++ b/examples/llama-3/qlora.yml
@@ -46,7 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/sparse-finetuning.yaml b/examples/llama-3/sparse-finetuning.yaml
index 348756b70..0ce4aa03d 100644
--- a/examples/llama-3/sparse-finetuning.yaml
+++ b/examples/llama-3/sparse-finetuning.yaml
@@ -44,8 +44,7 @@ gradient_checkpointing_kwargs:
 early_stopping_patience:
 resume_from_checkpoint:
 logging_steps: 1
-xformers_attention:
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 2
diff --git a/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml b/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml
index b20f79758..2c701a2aa 100644
--- a/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml
+++ b/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml
@@ -60,7 +60,7 @@ bf16: true
 tf32: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 gradient_checkpointing: offload
 gradient_checkpointing_kwargs:
diff --git a/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml b/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml
index 40449009c..8197d1629 100644
--- a/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml
+++ b/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml
@@ -67,7 +67,7 @@ bf16: true
 tf32: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml b/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml
index abdc51378..2dcff36cd 100644
--- a/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml
+++ b/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml
@@ -70,7 +70,7 @@ bf16: true
 tf32: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 gradient_checkpointing: offload
 gradient_checkpointing_kwargs:
diff --git a/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml b/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml
index 4136dc14a..de7ae5f50 100644
--- a/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml
+++ b/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml
@@ -62,7 +62,7 @@ bf16: true
 tf32: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml b/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml
index 02c04c691..c5343fa2e 100644
--- a/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml
+++ b/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml
@@ -59,7 +59,7 @@ bf16: true
 tf32: true
 logging_steps: 1

-flex_attention: true
+attn_implementation: flex_attention
 flex_attn_compile_kwargs:
   dynamic: false
   mode: max-autotune-no-cudagraphs
diff --git a/examples/llama-4/scout-qlora-single-h100-flex.yaml b/examples/llama-4/scout-qlora-single-h100-flex.yaml
index 33a691189..00491c3b1 100644
--- a/examples/llama-4/scout-qlora-single-h100-flex.yaml
+++ b/examples/llama-4/scout-qlora-single-h100-flex.yaml
@@ -64,7 +64,7 @@ bf16: true
 tf32: true
 torch_compile: true

-flex_attention: true
+attn_implementation: flex_attention
 flex_attn_compile_kwargs:
   dynamic: false
   mode: max-autotune-no-cudagraphs
diff --git a/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml b/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml
index 5972c2ae3..9b3e089b5 100644
--- a/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml
+++ b/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml
@@ -61,7 +61,7 @@ bf16: true
 tf32: true
 logging_steps: 1

-flex_attention: true
+attn_implementation: flex_attention
 flex_attn_compile_kwargs:
   dynamic: false
   mode: max-autotune-no-cudagraphs
diff --git a/examples/llava/lora-7b.yaml b/examples/llava/lora-7b.yaml
index 77ef7474d..56b48fda9 100644
--- a/examples/llava/lora-7b.yaml
+++ b/examples/llava/lora-7b.yaml
@@ -45,8 +45,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
-eager_attention:
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/magistral/magistral-small-fsdp-qlora.yaml b/examples/magistral/magistral-small-fsdp-qlora.yaml
index d46c49fe0..f31ca7326 100644
--- a/examples/magistral/magistral-small-fsdp-qlora.yaml
+++ b/examples/magistral/magistral-small-fsdp-qlora.yaml
@@ -59,7 +59,7 @@ tf32: false
 gradient_checkpointing:
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/magistral/magistral-small-qlora.yaml b/examples/magistral/magistral-small-qlora.yaml
index 188924d39..90f6b6f91 100644
--- a/examples/magistral/magistral-small-qlora.yaml
+++ b/examples/magistral/magistral-small-qlora.yaml
@@ -58,7 +58,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/magistral/think/magistral-small-think-qlora.yaml b/examples/magistral/think/magistral-small-think-qlora.yaml
index b715b3156..85abe18da 100644
--- a/examples/magistral/think/magistral-small-think-qlora.yaml
+++ b/examples/magistral/think/magistral-small-think-qlora.yaml
@@ -58,7 +58,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/magistral/vision/magistral-small-vision-24B-qlora.yml b/examples/magistral/vision/magistral-small-vision-24B-qlora.yml
index 397db383e..abd244647 100644
--- a/examples/magistral/vision/magistral-small-vision-24B-qlora.yml
+++ b/examples/magistral/vision/magistral-small-vision-24B-qlora.yml
@@ -53,7 +53,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/mamba/config.yml b/examples/mamba/config.yml
index 5f36595a3..0c39768d8 100644
--- a/examples/mamba/config.yml
+++ b/examples/mamba/config.yml
@@ -39,7 +39,6 @@ tf32: true
 gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/mimo/mimo-7b-qlora.yaml b/examples/mimo/mimo-7b-qlora.yaml
index 689213bcd..7ced584e1 100644
--- a/examples/mimo/mimo-7b-qlora.yaml
+++ b/examples/mimo/mimo-7b-qlora.yaml
@@ -58,7 +58,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/ministral/ministral-small-qlora.yaml b/examples/ministral/ministral-small-qlora.yaml
index 0d5300ef6..4c3bdfe94 100644
--- a/examples/ministral/ministral-small-qlora.yaml
+++ b/examples/ministral/ministral-small-qlora.yaml
@@ -58,7 +58,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/ministral3/ministral3-3b-qlora.yaml b/examples/ministral3/ministral3-3b-qlora.yaml
index b369c9d41..49eec882f 100644
--- a/examples/ministral3/ministral3-3b-qlora.yaml
+++ b/examples/ministral3/ministral3-3b-qlora.yaml
@@ -58,7 +58,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
 scaling_softmax: true

 warmup_ratio: 0.1
diff --git a/examples/ministral3/think/ministral3-3b-think-qlora.yaml b/examples/ministral3/think/ministral3-3b-think-qlora.yaml
index 987c0bd54..508575cac 100644
--- a/examples/ministral3/think/ministral3-3b-think-qlora.yaml
+++ b/examples/ministral3/think/ministral3-3b-think-qlora.yaml
@@ -58,7 +58,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/ministral3/vision/ministral3-3b-vision-qlora.yml b/examples/ministral3/vision/ministral3-3b-vision-qlora.yml
index 0a0fdce4a..f1430ba53 100644
--- a/examples/ministral3/vision/ministral3-3b-vision-qlora.yml
+++ b/examples/ministral3/vision/ministral3-3b-vision-qlora.yml
@@ -53,7 +53,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/mistral-small/mistral-small-3.1-24B-lora.yml b/examples/mistral-small/mistral-small-3.1-24B-lora.yml
index d45d13ac6..4d3f78a13 100644
--- a/examples/mistral-small/mistral-small-3.1-24B-lora.yml
+++ b/examples/mistral-small/mistral-small-3.1-24B-lora.yml
@@ -51,7 +51,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/mistral/bigstral/bigstral-ds-zero3.yaml b/examples/mistral/bigstral/bigstral-ds-zero3.yaml
index a8dc36216..4648ae4b4 100644
--- a/examples/mistral/bigstral/bigstral-ds-zero3.yaml
+++ b/examples/mistral/bigstral/bigstral-ds-zero3.yaml
@@ -42,7 +42,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 save_total_limit: 1
 save_steps:
diff --git a/examples/mistral/config.yml b/examples/mistral/config.yml
index e74162537..aa1066733 100644
--- a/examples/mistral/config.yml
+++ b/examples/mistral/config.yml
@@ -36,7 +36,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/mistral/dpo/mistral-dpo-qlora.yml b/examples/mistral/dpo/mistral-dpo-qlora.yml
index 8fea14a0f..604eada74 100644
--- a/examples/mistral/dpo/mistral-dpo-qlora.yml
+++ b/examples/mistral/dpo/mistral-dpo-qlora.yml
@@ -71,7 +71,6 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: false

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/mistral/lora.yml b/examples/mistral/lora.yml
index 757287f19..b157fcc21 100644
--- a/examples/mistral/lora.yml
+++ b/examples/mistral/lora.yml
@@ -54,7 +54,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/mistral/mistral-qlora-fsdp.yml b/examples/mistral/mistral-qlora-fsdp.yml
index 8e1f03d24..27d8be3cd 100644
--- a/examples/mistral/mistral-qlora-fsdp.yml
+++ b/examples/mistral/mistral-qlora-fsdp.yml
@@ -51,7 +51,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/mistral/mixtral/mixtral-8x22b-qlora-fsdp.yml b/examples/mistral/mixtral/mixtral-8x22b-qlora-fsdp.yml
index dc7bd9c37..1b66de8f0 100644
--- a/examples/mistral/mixtral/mixtral-8x22b-qlora-fsdp.yml
+++ b/examples/mistral/mixtral/mixtral-8x22b-qlora-fsdp.yml
@@ -49,7 +49,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/mistral/mixtral/mixtral-qlora-fsdp.yml b/examples/mistral/mixtral/mixtral-qlora-fsdp.yml
index 5151e1292..bd7c8620e 100644
--- a/examples/mistral/mixtral/mixtral-qlora-fsdp.yml
+++ b/examples/mistral/mixtral/mixtral-qlora-fsdp.yml
@@ -51,7 +51,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/mistral/mixtral/mixtral.yml b/examples/mistral/mixtral/mixtral.yml
index d1981a699..b493ed317 100644
--- a/examples/mistral/mixtral/mixtral.yml
+++ b/examples/mistral/mixtral/mixtral.yml
@@ -69,7 +69,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/mistral/mixtral/mixtral_22.yml b/examples/mistral/mixtral/mixtral_22.yml
index 0b606b7d7..3b87af04e 100644
--- a/examples/mistral/mixtral/mixtral_22.yml
+++ b/examples/mistral/mixtral/mixtral_22.yml
@@ -40,7 +40,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 save_total_limit: 1
 save_steps:
diff --git a/examples/mistral/mps/lora-mps.yml b/examples/mistral/mps/lora-mps.yml
index 07ce191dc..1b8021085 100644
--- a/examples/mistral/mps/lora-mps.yml
+++ b/examples/mistral/mps/lora-mps.yml
@@ -53,8 +53,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: false
-sdp_attention: true
+attn_implementation: sdpa

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/mistral/orpo/mistral-qlora-orpo.yml b/examples/mistral/orpo/mistral-qlora-orpo.yml
index 850d286f3..d1c0065e5 100644
--- a/examples/mistral/orpo/mistral-qlora-orpo.yml
+++ b/examples/mistral/orpo/mistral-qlora-orpo.yml
@@ -59,7 +59,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/mistral/qlora.yml b/examples/mistral/qlora.yml
index 2a7495e95..4fa82d11e 100644
--- a/examples/mistral/qlora.yml
+++ b/examples/mistral/qlora.yml
@@ -54,7 +54,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/mistral4/fft-text.yml b/examples/mistral4/fft-text.yml
index 3acb5b2ed..2cdab6a42 100644
--- a/examples/mistral4/fft-text.yml
+++ b/examples/mistral4/fft-text.yml
@@ -40,7 +40,7 @@ bf16: true
 tf32: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/mistral4/fft-vision.yml b/examples/mistral4/fft-vision.yml
index baff37fe4..22262c55a 100644
--- a/examples/mistral4/fft-vision.yml
+++ b/examples/mistral4/fft-vision.yml
@@ -39,7 +39,7 @@ bf16: true
 tf32: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/mistral4/qlora-text.yml b/examples/mistral4/qlora-text.yml
index ae0cdcead..887ce6da0 100644
--- a/examples/mistral4/qlora-text.yml
+++ b/examples/mistral4/qlora-text.yml
@@ -50,7 +50,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/mistral4/qlora-vision.yml b/examples/mistral4/qlora-vision.yml
index a80d166dd..d01f8e85b 100644
--- a/examples/mistral4/qlora-vision.yml
+++ b/examples/mistral4/qlora-vision.yml
@@ -55,7 +55,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/nemotron-h/120b-a12b-qlora.yaml b/examples/nemotron-h/120b-a12b-qlora.yaml
index 03e6d3b5e..1174cec21 100644
--- a/examples/nemotron-h/120b-a12b-qlora.yaml
+++ b/examples/nemotron-h/120b-a12b-qlora.yaml
@@ -72,7 +72,7 @@ gradient_checkpointing_kwargs:

 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 2
diff --git a/examples/nemotron-h/nano-30b-a3b-qlora.yaml b/examples/nemotron-h/nano-30b-a3b-qlora.yaml
index 3994ab08e..206bd5df8 100644
--- a/examples/nemotron-h/nano-30b-a3b-qlora.yaml
+++ b/examples/nemotron-h/nano-30b-a3b-qlora.yaml
@@ -73,7 +73,7 @@ gradient_checkpointing_kwargs:

 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/nemotron/nemotron-mini-4b-qlora.yaml b/examples/nemotron/nemotron-mini-4b-qlora.yaml
index e796c149c..3f3772071 100644
--- a/examples/nemotron/nemotron-mini-4b-qlora.yaml
+++ b/examples/nemotron/nemotron-mini-4b-qlora.yaml
@@ -48,7 +48,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/olmo3/olmo3-7b-qlora.yaml b/examples/olmo3/olmo3-7b-qlora.yaml
index de2bf1d3d..b494699e0 100644
--- a/examples/olmo3/olmo3-7b-qlora.yaml
+++ b/examples/olmo3/olmo3-7b-qlora.yaml
@@ -55,7 +55,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/orpheus/finetune.yml b/examples/orpheus/finetune.yml
index f4bc8054e..86a488c84 100644
--- a/examples/orpheus/finetune.yml
+++ b/examples/orpheus/finetune.yml
@@ -41,7 +41,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 5
diff --git a/examples/phi/phi-ft.yml b/examples/phi/phi-ft.yml
index 717a45929..c16b15d8a 100644
--- a/examples/phi/phi-ft.yml
+++ b/examples/phi/phi-ft.yml
@@ -48,7 +48,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: True
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/phi/phi-qlora.yml b/examples/phi/phi-qlora.yml
index 0fe1abea5..ac4970355 100644
--- a/examples/phi/phi-qlora.yml
+++ b/examples/phi/phi-qlora.yml
@@ -51,7 +51,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: True
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/phi/phi2-ft.yml b/examples/phi/phi2-ft.yml
index e470c0d24..5702cc9b8 100644
--- a/examples/phi/phi2-ft.yml
+++ b/examples/phi/phi2-ft.yml
@@ -48,7 +48,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: True
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/phi/phi3-ft-fsdp.yml b/examples/phi/phi3-ft-fsdp.yml
index 1793737b5..49d3e44cb 100644
--- a/examples/phi/phi3-ft-fsdp.yml
+++ b/examples/phi/phi3-ft-fsdp.yml
@@ -49,7 +49,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/phi/phi3-ft.yml b/examples/phi/phi3-ft.yml
index 0b204963c..d36317f7b 100644
--- a/examples/phi/phi3-ft.yml
+++ b/examples/phi/phi3-ft.yml
@@ -44,7 +44,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: True
 early_stopping_patience: 3
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 eval_steps: 1000
 save_steps: 5000
diff --git a/examples/pixtral/lora-12b.yml b/examples/pixtral/lora-12b.yml
index 0e6489914..2e36688a1 100644
--- a/examples/pixtral/lora-12b.yml
+++ b/examples/pixtral/lora-12b.yml
@@ -45,7 +45,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/plano/plano-4b-qlora.yaml b/examples/plano/plano-4b-qlora.yaml
index 106e44205..30e0c36ff 100644
--- a/examples/plano/plano-4b-qlora.yaml
+++ b/examples/plano/plano-4b-qlora.yaml
@@ -56,7 +56,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/qat_nvfp4/Gemma3-12B_baseline.yml b/examples/qat_nvfp4/Gemma3-12B_baseline.yml
index be4e86635..e1c7e998a 100644
--- a/examples/qat_nvfp4/Gemma3-12B_baseline.yml
+++ b/examples/qat_nvfp4/Gemma3-12B_baseline.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/out_gemma/

 sequence_len: 8096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 wandb_entity:
 wandb_watch:
diff --git a/examples/qat_nvfp4/Gemma3-12B_qat.yml b/examples/qat_nvfp4/Gemma3-12B_qat.yml
index 7fa81163f..061fd6061 100644
--- a/examples/qat_nvfp4/Gemma3-12B_qat.yml
+++ b/examples/qat_nvfp4/Gemma3-12B_qat.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out_gemma/

 sequence_len: 8096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 qat:
   activation_dtype: nvfp4
diff --git a/examples/qat_nvfp4/Math-Gemma3-12B_baseline.yml b/examples/qat_nvfp4/Math-Gemma3-12B_baseline.yml
index 9f209515b..f11f604b4 100644
--- a/examples/qat_nvfp4/Math-Gemma3-12B_baseline.yml
+++ b/examples/qat_nvfp4/Math-Gemma3-12B_baseline.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/out_math_gemma/

 sequence_len: 4096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 wandb_entity:
 wandb_watch:
diff --git a/examples/qat_nvfp4/Math-Gemma3-12B_qat.yml b/examples/qat_nvfp4/Math-Gemma3-12B_qat.yml
index ef7e754be..f9c71321e 100644
--- a/examples/qat_nvfp4/Math-Gemma3-12B_qat.yml
+++ b/examples/qat_nvfp4/Math-Gemma3-12B_qat.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out_math_gemma/

 sequence_len: 4096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 qat:
   activation_dtype: nvfp4
diff --git a/examples/qat_nvfp4/Math-Gemma3-27B_baseline.yml b/examples/qat_nvfp4/Math-Gemma3-27B_baseline.yml
index 3a262d342..de8bc1807 100644
--- a/examples/qat_nvfp4/Math-Gemma3-27B_baseline.yml
+++ b/examples/qat_nvfp4/Math-Gemma3-27B_baseline.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/out_math_gemma27/

 sequence_len: 4096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 wandb_entity:
 wandb_watch:
diff --git a/examples/qat_nvfp4/Math-Gemma3-27B_qat.yml b/examples/qat_nvfp4/Math-Gemma3-27B_qat.yml
index 87016ae9c..c77060ee2 100644
--- a/examples/qat_nvfp4/Math-Gemma3-27B_qat.yml
+++ b/examples/qat_nvfp4/Math-Gemma3-27B_qat.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out_math_gemma27/

 sequence_len: 4096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 qat:
   activation_dtype: nvfp4
diff --git a/examples/qat_nvfp4/Math-Qwen2.5-72B_baseline.yml b/examples/qat_nvfp4/Math-Qwen2.5-72B_baseline.yml
index efec25c54..487fc8e4e 100644
--- a/examples/qat_nvfp4/Math-Qwen2.5-72B_baseline.yml
+++ b/examples/qat_nvfp4/Math-Qwen2.5-72B_baseline.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/out_math_72b/

 sequence_len: 4096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 wandb_entity:
 wandb_watch:
diff --git a/examples/qat_nvfp4/Math-Qwen2.5-72B_qat.yml b/examples/qat_nvfp4/Math-Qwen2.5-72B_qat.yml
index 427d7af52..12812d859 100644
--- a/examples/qat_nvfp4/Math-Qwen2.5-72B_qat.yml
+++ b/examples/qat_nvfp4/Math-Qwen2.5-72B_qat.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out_math_72b/

 sequence_len: 4096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 qat:
   activation_dtype: nvfp4
diff --git a/examples/qat_nvfp4/Qwen2.5-72B_baseline.yml b/examples/qat_nvfp4/Qwen2.5-72B_baseline.yml
index e1eaba61f..c52fd6b0a 100644
--- a/examples/qat_nvfp4/Qwen2.5-72B_baseline.yml
+++ b/examples/qat_nvfp4/Qwen2.5-72B_baseline.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/out_qwen72b/

 sequence_len: 8096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 wandb_entity:
 wandb_watch:
diff --git a/examples/qat_nvfp4/Qwen2.5-72B_qat.yml b/examples/qat_nvfp4/Qwen2.5-72B_qat.yml
index dad7e5422..cc67107c0 100644
--- a/examples/qat_nvfp4/Qwen2.5-72B_qat.yml
+++ b/examples/qat_nvfp4/Qwen2.5-72B_qat.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out_qwen72b/

 sequence_len: 8096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 qat:
   activation_dtype: nvfp4
diff --git a/examples/qwen2-vl/lora-7b.yaml b/examples/qwen2-vl/lora-7b.yaml
index 285a35cbb..d9bc4826b 100644
--- a/examples/qwen2-vl/lora-7b.yaml
+++ b/examples/qwen2-vl/lora-7b.yaml
@@ -46,8 +46,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
-eager_attention:
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/qwen2/adamw-pretrain-fsdp2.yaml b/examples/qwen2/adamw-pretrain-fsdp2.yaml
index 43fb17aab..4129338db 100644
--- a/examples/qwen2/adamw-pretrain-fsdp2.yaml
+++ b/examples/qwen2/adamw-pretrain-fsdp2.yaml
@@ -49,7 +49,7 @@ tf32: false
 gradient_checkpointing: false
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_steps: 10
 evals_per_epoch: 0
diff --git a/examples/qwen2/dpo.yaml b/examples/qwen2/dpo.yaml
index 3e87766d6..6096053fd 100644
--- a/examples/qwen2/dpo.yaml
+++ b/examples/qwen2/dpo.yaml
@@ -48,7 +48,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen2/muon-pretrain-fsdp2.yaml b/examples/qwen2/muon-pretrain-fsdp2.yaml
index 35c0b71f4..40dcff7be 100644
--- a/examples/qwen2/muon-pretrain-fsdp2.yaml
+++ b/examples/qwen2/muon-pretrain-fsdp2.yaml
@@ -49,7 +49,7 @@ tf32: false
 gradient_checkpointing: false
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_steps: 10
 evals_per_epoch: 0
diff --git a/examples/qwen2/prm.yaml b/examples/qwen2/prm.yaml
index a709a598d..1b3579fd4 100644
--- a/examples/qwen2/prm.yaml
+++ b/examples/qwen2/prm.yaml
@@ -47,7 +47,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch:
diff --git a/examples/qwen2/qlora-fsdp.yaml b/examples/qwen2/qlora-fsdp.yaml
index 337619b61..7bb035c3a 100644
--- a/examples/qwen2/qlora-fsdp.yaml
+++ b/examples/qwen2/qlora-fsdp.yaml
@@ -47,7 +47,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen2/reward-model.yaml b/examples/qwen2/reward-model.yaml
index 08b8b4552..b7039cba0 100644
--- a/examples/qwen2/reward-model.yaml
+++ b/examples/qwen2/reward-model.yaml
@@ -42,7 +42,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch:
diff --git a/examples/qwen2_5-vl/lora-7b.yaml b/examples/qwen2_5-vl/lora-7b.yaml
index 7d499d841..e78aac78b 100644
--- a/examples/qwen2_5-vl/lora-7b.yaml
+++ b/examples/qwen2_5-vl/lora-7b.yaml
@@ -46,8 +46,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
-eager_attention:
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml b/examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml
index f63b1d1ce..e8e7e08c7 100644
--- a/examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml
+++ b/examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml
@@ -68,7 +68,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml b/examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml
index f66bcd370..47842c561 100644
--- a/examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml
+++ b/examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml
@@ -65,7 +65,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen3.5/122b-a10b-moe-qlora.yaml b/examples/qwen3.5/122b-a10b-moe-qlora.yaml
index 4447cf73c..f2675c7d7 100644
--- a/examples/qwen3.5/122b-a10b-moe-qlora.yaml
+++ b/examples/qwen3.5/122b-a10b-moe-qlora.yaml
@@ -65,7 +65,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen3.5/27b-fft.yaml b/examples/qwen3.5/27b-fft.yaml
index 9f875ec26..ab206b772 100644
--- a/examples/qwen3.5/27b-fft.yaml
+++ b/examples/qwen3.5/27b-fft.yaml
@@ -50,7 +50,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen3.5/27b-qlora-fsdp.yaml b/examples/qwen3.5/27b-qlora-fsdp.yaml
index 79b87a32f..7a5423c77 100644
--- a/examples/qwen3.5/27b-qlora-fsdp.yaml
+++ b/examples/qwen3.5/27b-qlora-fsdp.yaml
@@ -61,7 +61,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen3.5/27b-qlora.yaml b/examples/qwen3.5/27b-qlora.yaml
index 18c0af95b..2401a4865 100644
--- a/examples/qwen3.5/27b-qlora.yaml
+++ b/examples/qwen3.5/27b-qlora.yaml
@@ -61,7 +61,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml b/examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml
index ad17366cb..2fb7f15f8 100644
--- a/examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml
+++ b/examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml
@@ -65,7 +65,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen3.5/35b-a3b-moe-qlora.yaml b/examples/qwen3.5/35b-a3b-moe-qlora.yaml
index 22468a178..a6afc1aa2 100644
--- a/examples/qwen3.5/35b-a3b-moe-qlora.yaml
+++ b/examples/qwen3.5/35b-a3b-moe-qlora.yaml
@@ -75,7 +75,7 @@ gradient_checkpointing: true
 activation_offloading: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen3.5/35b-a3b-moe-vision-lora.yaml b/examples/qwen3.5/35b-a3b-moe-vision-lora.yaml
index a7c85f785..7cfad3290 100644
--- a/examples/qwen3.5/35b-a3b-moe-vision-lora.yaml
+++ b/examples/qwen3.5/35b-a3b-moe-vision-lora.yaml
@@ -50,7 +50,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 weight_decay: 0.0
diff --git a/examples/qwen3.5/9b-fft-vision.yaml b/examples/qwen3.5/9b-fft-vision.yaml
index b6aeb859d..e8427b884 100644
--- a/examples/qwen3.5/9b-fft-vision.yaml
+++ b/examples/qwen3.5/9b-fft-vision.yaml
@@ -40,7 +40,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/qwen3.5/9b-lora-vision.yaml b/examples/qwen3.5/9b-lora-vision.yaml
index 1c3717724..9c2b9397e 100644
--- a/examples/qwen3.5/9b-lora-vision.yaml
+++ b/examples/qwen3.5/9b-lora-vision.yaml
@@ -58,7 +58,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/qwen3/32b-qlora.yaml b/examples/qwen3/32b-qlora.yaml
index f4a4f2816..dd5dd696e 100644
--- a/examples/qwen3/32b-qlora.yaml
+++ b/examples/qwen3/32b-qlora.yaml
@@ -60,7 +60,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen3/8b-qat-fsdp2.yml b/examples/qwen3/8b-qat-fsdp2.yml
index cfbe5a4b7..3c9607a9a 100644
--- a/examples/qwen3/8b-qat-fsdp2.yml
+++ b/examples/qwen3/8b-qat-fsdp2.yml
@@ -23,7 +23,7 @@ output_dir: ./outputs/qat_out/

 sequence_len: 2048
 sample_packing: true

-flex_attention: true
+attn_implementation: flex_attention
 flex_attn_compile_kwargs:

diff --git a/examples/qwen3/qlora-fsdp.yaml b/examples/qwen3/qlora-fsdp.yaml
index e4d584dc7..a3852d457 100644
--- a/examples/qwen3/qlora-fsdp.yaml
+++ b/examples/qwen3/qlora-fsdp.yaml
@@ -46,7 +46,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/seed-oss/seed-oss-36b-qlora.yaml b/examples/seed-oss/seed-oss-36b-qlora.yaml
index 00e7cf3eb..a8423f851 100644
--- a/examples/seed-oss/seed-oss-36b-qlora.yaml
+++ b/examples/seed-oss/seed-oss-36b-qlora.yaml
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/smolvlm2/smolvlm2-2B-lora.yaml b/examples/smolvlm2/smolvlm2-2B-lora.yaml
index 1aeff408d..4cd8d5b0d 100644
--- a/examples/smolvlm2/smolvlm2-2B-lora.yaml
+++ b/examples/smolvlm2/smolvlm2-2B-lora.yaml
@@ -45,8 +45,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
-eager_attention:
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/streaming/pretrain.yaml b/examples/streaming/pretrain.yaml
index bc8edefd6..a0d8b17c0 100644
--- a/examples/streaming/pretrain.yaml
+++ b/examples/streaming/pretrain.yaml
@@ -20,7 +20,7 @@ output_dir: ./outputs/smollm2-135m-pretrain-streaming
 sequence_len: 1024
 sample_packing: true
 pretrain_multipack_attn: true # Prevent cross-attention between packed sequences
-flash_attention: true
+attn_implementation: flash_attention_2

 # Batch size settings
 gradient_accumulation_steps: 8
diff --git a/examples/streaming/sft.yaml b/examples/streaming/sft.yaml
index 47b9f493f..4a43c34eb 100644
--- a/examples/streaming/sft.yaml
+++ b/examples/streaming/sft.yaml
@@ -18,7 +18,7 @@ output_dir: ./outputs/smollm2-135m-sft-streaming
 # Sequence and packing settings
 sequence_len: 1024
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 # Batch size settings
 gradient_accumulation_steps: 4
diff --git a/examples/swanlab/dpo-swanlab-completions.yml b/examples/swanlab/dpo-swanlab-completions.yml
index 5615ca638..fb21dbbba 100644
--- a/examples/swanlab/dpo-swanlab-completions.yml
+++ b/examples/swanlab/dpo-swanlab-completions.yml
@@ -78,7 +78,7 @@ tf32: false

 # Performance
 gradient_checkpointing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 # Checkpointing and Logging
 logging_steps: 1
diff --git a/examples/swanlab/dpo-swanlab-full-featured.yml b/examples/swanlab/dpo-swanlab-full-featured.yml
index c25178c63..ac52e6a85 100644
--- a/examples/swanlab/dpo-swanlab-full-featured.yml
+++ b/examples/swanlab/dpo-swanlab-full-featured.yml
@@ -102,7 +102,7 @@ bf16: auto
 tf32: false
 gradient_checkpointing: true

-flash_attention: true
+attn_implementation: flash_attention_2

 # ============================================================================
 # Checkpointing and Logging
diff --git a/examples/swanlab/lora-swanlab-profiling.yml b/examples/swanlab/lora-swanlab-profiling.yml
index 1255105a6..3dff6e315 100644
--- a/examples/swanlab/lora-swanlab-profiling.yml
+++ b/examples/swanlab/lora-swanlab-profiling.yml
@@ -59,7 +59,7 @@ tf32: false

 # Performance
 gradient_checkpointing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 # Checkpointing and Logging
 logging_steps: 1
diff --git a/examples/trinity/trinity-nano-preview-qlora.yaml b/examples/trinity/trinity-nano-preview-qlora.yaml
index d8bf9f073..52c0c0c60 100644
--- a/examples/trinity/trinity-nano-preview-qlora.yaml
+++ b/examples/trinity/trinity-nano-preview-qlora.yaml
@@ -58,7 +58,7 @@ gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
 # flash_attention: true # Not supported
-sdp_attention: true
+attn_implementation: sdpa

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/voxtral/voxtral-mini-audio-qlora.yml b/examples/voxtral/voxtral-mini-audio-qlora.yml
index 59150c4ca..cfa351ccd 100644
--- a/examples/voxtral/voxtral-mini-audio-qlora.yml
+++ b/examples/voxtral/voxtral-mini-audio-qlora.yml
@@ -70,7 +70,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/voxtral/voxtral-mini-qlora.yml b/examples/voxtral/voxtral-mini-qlora.yml
index bdbc5f867..61e8933d0 100644
--- a/examples/voxtral/voxtral-mini-qlora.yml
+++ b/examples/voxtral/voxtral-mini-qlora.yml
@@ -64,7 +64,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: