From 39226623d262327ca62fa384f860922fd8797f84 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 23 Apr 2026 22:15:07 +0000 Subject: [PATCH] migrate example configs to canonical attn_implementation --- examples/LiquidAI/lfm2-350m-fft.yaml | 2 +- examples/LiquidAI/lfm2-8b-a1b-lora.yaml | 2 +- examples/LiquidAI/lfm2-vl-lora.yaml | 3 +-- examples/alst/llama3-8b-deepspeed-alst.yaml | 2 +- examples/alst/llama3-8b-fsdp2-alst.yaml | 2 +- examples/apertus/apertus-8b-qlora.yaml | 2 +- examples/arcee/afm-4.5b-qlora.yaml | 2 +- examples/archived/cerebras/btlm-ft.yml | 3 +-- examples/archived/cerebras/qlora.yml | 3 +-- examples/archived/code-llama/13b/lora.yml | 2 +- examples/archived/code-llama/13b/qlora.yml | 2 +- examples/archived/code-llama/34b/lora.yml | 2 +- examples/archived/code-llama/34b/qlora.yml | 2 +- examples/archived/code-llama/7b/lora.yml | 2 +- examples/archived/code-llama/7b/qlora.yml | 2 +- examples/archived/dbrx/16bit-lora.yaml | 2 +- examples/archived/dbrx/8bit-lora.yaml | 2 +- examples/archived/dbrx/fft-ds-zero3.yaml | 2 +- examples/archived/deepcoder/deepcoder-14B-preview-lora.yml | 2 +- examples/archived/falcon/config-7b-lora.yml | 3 +-- examples/archived/falcon/config-7b-qlora.yml | 3 +-- examples/archived/falcon/config-7b.yml | 3 +-- examples/archived/gemma/qlora.yml | 2 +- examples/archived/gptj/qlora.yml | 3 +-- examples/archived/jeopardy-bot/config.yml | 3 +-- examples/archived/mpt-7b/config.yml | 1 - examples/archived/openllama-3b/config.yml | 2 +- examples/archived/openllama-3b/lora.yml | 2 +- examples/archived/openllama-3b/qlora.yml | 2 +- examples/archived/qwen/lora.yml | 1 - examples/archived/qwen/qlora.yml | 1 - examples/archived/qwen/qwen2-moe-lora.yaml | 2 +- examples/archived/qwen/qwen2-moe-qlora.yaml | 2 +- examples/archived/redpajama/config-3b.yml | 1 - examples/archived/replit-3b/config-lora.yml | 1 - examples/archived/stablelm-2/1.6b/fft.yml | 2 +- examples/archived/stablelm-2/1.6b/lora.yml | 2 +- examples/archived/starcoder2/qlora.yml | 2 +- examples/archived/tiny-llama/lora-mps.yml | 1 - examples/archived/tiny-llama/lora.yml | 2 +- examples/archived/tiny-llama/pretrain.yml | 2 +- examples/archived/tiny-llama/qlora.yml | 2 +- examples/archived/xgen-7b/xgen-7b-8k-qlora.yml | 3 +-- examples/archived/yi-34B-chat/qlora.yml | 2 +- examples/cohere/command-r-7b-qlora.yml | 2 +- examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml | 2 +- examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml | 2 +- examples/deepseek-v2/fft-fsdp-16b.yaml | 2 +- examples/deepseek-v2/qlora-fsdp-2_5.yaml | 2 +- examples/devstral/devstral-small-qlora.yml | 2 +- examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml | 2 +- examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml | 2 +- examples/eaft/eaft-example.yml | 3 +-- examples/ebft/llama-1b-ebft-opencode-novllm.yaml | 2 +- examples/ebft/llama-1b-ebft-opencode.yaml | 2 +- examples/ebft/llama-1b-ebft-strided-structured.yaml | 3 +-- examples/ebft/llama-1b-ebft-strided.yaml | 1 - examples/ebft/llama-3b-ebft-strided-fft.yaml | 1 - examples/ebft/llama-8b-ebft-strided-fft.yaml | 1 - examples/ebft/qwen35-4b-ebft-structured-async.yaml | 2 +- examples/ebft/qwen35-4b-ebft-structured.yaml | 2 +- examples/ebft/qwen35-9b-ebft-structured.yaml | 2 +- examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml | 2 +- examples/falcon-h1/falcon-h1-1b-qlora.yaml | 2 +- examples/falcon-h1/falcon-h1-34b-qlora.yaml | 2 +- examples/falcon-h1/falcon-h1-3b-qlora.yaml | 2 +- examples/falcon-h1/falcon-h1-500m-qlora.yaml | 2 +- examples/falcon-h1/falcon-h1-7b-qlora.yaml 
| 2 +- examples/gemma2/qlora.yml | 2 +- examples/gemma2/reward-model.yaml | 2 +- examples/gemma3/gemma-3-1b-qlora.yml | 2 +- examples/gemma3/gemma-3-270m-qlora.yml | 2 +- examples/gemma3/gemma-3-4b-qlora.yml | 3 +-- examples/gemma3/gemma-3-4b-vision-qlora.yml | 3 +-- examples/gemma4/26b-a4b-moe-qlora.yaml | 2 +- examples/gemma4/31b-qlora-flex.yaml | 2 +- examples/gemma4/31b-qlora.yaml | 2 +- examples/gemma4/e2b-vision-lora.yaml | 2 +- examples/glm4/qlora-32b.yaml | 2 +- examples/glm45/glm-45-air-qlora.yaml | 2 +- examples/glm46v/glm-4-6v-flash-ddp.yaml | 2 +- examples/glm46v/glm-4-6v-flash-qlora.yaml | 2 +- examples/glm47-flash/lora.yaml | 2 +- examples/glm47-flash/lora_fsdp.yaml | 2 +- examples/glm47-flash/qlora.yaml | 2 +- examples/glm47-flash/qlora_fsdp.yaml | 2 +- examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml | 2 +- examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml | 2 +- examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml | 2 +- examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml | 2 +- examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml | 2 +- examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml | 2 +- examples/granite4/granite-4.0-tiny-fft.yaml | 2 +- examples/hunyuan/hunyuan-v1-dense-qlora.yaml | 2 +- examples/internvl3_5/internvl3_5-8b-qlora.yml | 3 +-- examples/jamba/qlora.yaml | 2 +- examples/jamba/qlora_deepspeed.yaml | 2 +- examples/jamba/qlora_fsdp_large.yaml | 2 +- examples/kimi-linear/kimi-48b-lora.yaml | 2 +- examples/llama-2/fft_optimized.yml | 2 +- examples/llama-2/gptq-lora.yml | 2 -- examples/llama-2/lisa.yml | 2 +- examples/llama-2/loftq.yml | 2 +- examples/llama-2/lora.yml | 2 +- examples/llama-2/qlora-fsdp.yml | 2 +- examples/llama-2/qlora.yml | 2 +- examples/llama-2/relora.yml | 2 +- examples/llama-3-vision/lora-11b.yaml | 2 +- examples/llama-3/3b-fp8-fsdp2.yaml | 2 +- examples/llama-3/3b-qat-fsdp2.yaml | 2 +- examples/llama-3/3b-qat-mxfp4.yaml | 2 +- examples/llama-3/3b-qat-nvfp4.yaml | 2 +- examples/llama-3/diffusion/pretrain-1b.yaml | 2 +- examples/llama-3/diffusion/sft-1b.yaml | 2 +- examples/llama-3/fft-8b-liger-fsdp.yaml | 2 +- examples/llama-3/fft-8b.yaml | 2 +- examples/llama-3/instruct-dpo-lora-8b.yml | 2 +- examples/llama-3/instruct-lora-8b.yml | 2 +- examples/llama-3/lora-1b-deduplicate-dpo.yml | 2 +- examples/llama-3/lora-1b-deduplicate-sft.yml | 2 +- examples/llama-3/lora-1b-kernels.yml | 2 +- examples/llama-3/lora-1b-ray.yml | 2 +- examples/llama-3/lora-1b-sample-packing-sequentially.yml | 2 +- examples/llama-3/lora-1b.yml | 2 +- examples/llama-3/lora-8b.yml | 2 +- examples/llama-3/opentelemetry-qlora.yml | 1 - examples/llama-3/qlora-1b-gdpo.yaml | 2 +- examples/llama-3/qlora-1b-kto.yaml | 2 +- examples/llama-3/qlora-1b.yml | 2 +- examples/llama-3/qlora-fsdp-405b.yaml | 2 +- examples/llama-3/qlora-fsdp-70b.yaml | 2 +- examples/llama-3/qlora.yml | 2 +- examples/llama-3/sparse-finetuning.yaml | 3 +-- examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml | 2 +- examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml | 2 +- examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml | 2 +- examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml | 2 +- examples/llama-4/scout-qlora-flexattn-fsdp2.yaml | 2 +- examples/llama-4/scout-qlora-single-h100-flex.yaml | 2 +- examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml | 2 +- examples/llava/lora-7b.yaml | 3 +-- examples/magistral/magistral-small-fsdp-qlora.yaml | 2 +- examples/magistral/magistral-small-qlora.yaml | 2 +- examples/magistral/think/magistral-small-think-qlora.yaml | 2 +- 
examples/magistral/vision/magistral-small-vision-24B-qlora.yml | 2 +- examples/mamba/config.yml | 1 - examples/mimo/mimo-7b-qlora.yaml | 2 +- examples/ministral/ministral-small-qlora.yaml | 2 +- examples/ministral3/ministral3-3b-qlora.yaml | 2 +- examples/ministral3/think/ministral3-3b-think-qlora.yaml | 2 +- examples/ministral3/vision/ministral3-3b-vision-qlora.yml | 2 +- examples/mistral-small/mistral-small-3.1-24B-lora.yml | 2 +- examples/mistral/bigstral/bigstral-ds-zero3.yaml | 2 +- examples/mistral/config.yml | 2 +- examples/mistral/dpo/mistral-dpo-qlora.yml | 1 - examples/mistral/lora.yml | 2 +- examples/mistral/mistral-qlora-fsdp.yml | 2 +- examples/mistral/mixtral/mixtral-8x22b-qlora-fsdp.yml | 2 +- examples/mistral/mixtral/mixtral-qlora-fsdp.yml | 2 +- examples/mistral/mixtral/mixtral.yml | 2 +- examples/mistral/mixtral/mixtral_22.yml | 2 +- examples/mistral/mps/lora-mps.yml | 3 +-- examples/mistral/orpo/mistral-qlora-orpo.yml | 2 +- examples/mistral/qlora.yml | 2 +- examples/mistral4/fft-text.yml | 2 +- examples/mistral4/fft-vision.yml | 2 +- examples/mistral4/qlora-text.yml | 2 +- examples/mistral4/qlora-vision.yml | 2 +- examples/nemotron-h/120b-a12b-qlora.yaml | 2 +- examples/nemotron-h/nano-30b-a3b-qlora.yaml | 2 +- examples/nemotron/nemotron-mini-4b-qlora.yaml | 2 +- examples/olmo3/olmo3-7b-qlora.yaml | 2 +- examples/orpheus/finetune.yml | 2 +- examples/phi/phi-ft.yml | 2 +- examples/phi/phi-qlora.yml | 2 +- examples/phi/phi2-ft.yml | 2 +- examples/phi/phi3-ft-fsdp.yml | 2 +- examples/phi/phi3-ft.yml | 2 +- examples/pixtral/lora-12b.yml | 2 +- examples/plano/plano-4b-qlora.yaml | 2 +- examples/qat_nvfp4/Gemma3-12B_baseline.yml | 2 +- examples/qat_nvfp4/Gemma3-12B_qat.yml | 2 +- examples/qat_nvfp4/Math-Gemma3-12B_baseline.yml | 2 +- examples/qat_nvfp4/Math-Gemma3-12B_qat.yml | 2 +- examples/qat_nvfp4/Math-Gemma3-27B_baseline.yml | 2 +- examples/qat_nvfp4/Math-Gemma3-27B_qat.yml | 2 +- examples/qat_nvfp4/Math-Qwen2.5-72B_baseline.yml | 2 +- examples/qat_nvfp4/Math-Qwen2.5-72B_qat.yml | 2 +- examples/qat_nvfp4/Qwen2.5-72B_baseline.yml | 2 +- examples/qat_nvfp4/Qwen2.5-72B_qat.yml | 2 +- examples/qwen2-vl/lora-7b.yaml | 3 +-- examples/qwen2/adamw-pretrain-fsdp2.yaml | 2 +- examples/qwen2/dpo.yaml | 2 +- examples/qwen2/muon-pretrain-fsdp2.yaml | 2 +- examples/qwen2/prm.yaml | 2 +- examples/qwen2/qlora-fsdp.yaml | 2 +- examples/qwen2/reward-model.yaml | 2 +- examples/qwen2_5-vl/lora-7b.yaml | 3 +-- examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml | 2 +- examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml | 2 +- examples/qwen3.5/122b-a10b-moe-qlora.yaml | 2 +- examples/qwen3.5/27b-fft.yaml | 2 +- examples/qwen3.5/27b-qlora-fsdp.yaml | 2 +- examples/qwen3.5/27b-qlora.yaml | 2 +- examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml | 2 +- examples/qwen3.5/35b-a3b-moe-qlora.yaml | 2 +- examples/qwen3.5/35b-a3b-moe-vision-lora.yaml | 2 +- examples/qwen3.5/9b-fft-vision.yaml | 2 +- examples/qwen3.5/9b-lora-vision.yaml | 2 +- examples/qwen3/32b-qlora.yaml | 2 +- examples/qwen3/8b-qat-fsdp2.yml | 2 +- examples/qwen3/qlora-fsdp.yaml | 2 +- examples/seed-oss/seed-oss-36b-qlora.yaml | 2 +- examples/smolvlm2/smolvlm2-2B-lora.yaml | 3 +-- examples/streaming/pretrain.yaml | 2 +- examples/streaming/sft.yaml | 2 +- examples/swanlab/dpo-swanlab-completions.yml | 2 +- examples/swanlab/dpo-swanlab-full-featured.yml | 2 +- examples/swanlab/lora-swanlab-profiling.yml | 2 +- examples/trinity/trinity-nano-preview-qlora.yaml | 2 +- examples/voxtral/voxtral-mini-audio-qlora.yml | 2 +- 
examples/voxtral/voxtral-mini-qlora.yml | 2 +- 222 files changed, 209 insertions(+), 243 deletions(-) diff --git a/examples/LiquidAI/lfm2-350m-fft.yaml b/examples/LiquidAI/lfm2-350m-fft.yaml index 145b56dd1..cd5942206 100644 --- a/examples/LiquidAI/lfm2-350m-fft.yaml +++ b/examples/LiquidAI/lfm2-350m-fft.yaml @@ -39,7 +39,7 @@ tf32: true gradient_checkpointing: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 2 diff --git a/examples/LiquidAI/lfm2-8b-a1b-lora.yaml b/examples/LiquidAI/lfm2-8b-a1b-lora.yaml index 73cbfcce7..4932ea06e 100644 --- a/examples/LiquidAI/lfm2-8b-a1b-lora.yaml +++ b/examples/LiquidAI/lfm2-8b-a1b-lora.yaml @@ -48,7 +48,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 2 diff --git a/examples/LiquidAI/lfm2-vl-lora.yaml b/examples/LiquidAI/lfm2-vl-lora.yaml index 313da8274..9a125da5e 100644 --- a/examples/LiquidAI/lfm2-vl-lora.yaml +++ b/examples/LiquidAI/lfm2-vl-lora.yaml @@ -50,8 +50,7 @@ tf32: true gradient_checkpointing: true logging_steps: 1 -flash_attention: true -eager_attention: +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/alst/llama3-8b-deepspeed-alst.yaml b/examples/alst/llama3-8b-deepspeed-alst.yaml index dea23c5ee..e844c6823 100644 --- a/examples/alst/llama3-8b-deepspeed-alst.yaml +++ b/examples/alst/llama3-8b-deepspeed-alst.yaml @@ -39,7 +39,7 @@ activation_offloading: legacy resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_steps: 100 saves_per_epoch: 1 diff --git a/examples/alst/llama3-8b-fsdp2-alst.yaml b/examples/alst/llama3-8b-fsdp2-alst.yaml index c8a978264..a7da92637 100644 --- a/examples/alst/llama3-8b-fsdp2-alst.yaml +++ b/examples/alst/llama3-8b-fsdp2-alst.yaml @@ -39,7 +39,7 @@ activation_offloading: legacy resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_steps: 100 saves_per_epoch: 1 diff --git a/examples/apertus/apertus-8b-qlora.yaml b/examples/apertus/apertus-8b-qlora.yaml index 521b282da..f43901363 100644 --- a/examples/apertus/apertus-8b-qlora.yaml +++ b/examples/apertus/apertus-8b-qlora.yaml @@ -55,7 +55,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/arcee/afm-4.5b-qlora.yaml b/examples/arcee/afm-4.5b-qlora.yaml index 2cb42cacd..8e70847ad 100644 --- a/examples/arcee/afm-4.5b-qlora.yaml +++ b/examples/arcee/afm-4.5b-qlora.yaml @@ -55,7 +55,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/archived/cerebras/btlm-ft.yml b/examples/archived/cerebras/btlm-ft.yml index c3495d287..5a5f8dc12 100644 --- a/examples/archived/cerebras/btlm-ft.yml +++ b/examples/archived/cerebras/btlm-ft.yml @@ -59,8 +59,7 @@ gradient_checkpointing: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true -sdp_attention: +attn_implementation: flash_attention_2 flash_optimum: gptq_groupsize: diff --git a/examples/archived/cerebras/qlora.yml b/examples/archived/cerebras/qlora.yml index 4598a8338..22f52e682 100644 --- 
a/examples/archived/cerebras/qlora.yml +++ b/examples/archived/cerebras/qlora.yml @@ -39,8 +39,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -xformers_attention: true -flash_attention: +attn_implementation: xformers gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/code-llama/13b/lora.yml b/examples/archived/code-llama/13b/lora.yml index ace94b619..43f623357 100644 --- a/examples/archived/code-llama/13b/lora.yml +++ b/examples/archived/code-llama/13b/lora.yml @@ -45,7 +45,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/code-llama/13b/qlora.yml b/examples/archived/code-llama/13b/qlora.yml index f4ed17af5..086f5e3d8 100644 --- a/examples/archived/code-llama/13b/qlora.yml +++ b/examples/archived/code-llama/13b/qlora.yml @@ -46,7 +46,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/code-llama/34b/lora.yml b/examples/archived/code-llama/34b/lora.yml index 0a1d71467..19aa898be 100644 --- a/examples/archived/code-llama/34b/lora.yml +++ b/examples/archived/code-llama/34b/lora.yml @@ -45,7 +45,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/code-llama/34b/qlora.yml b/examples/archived/code-llama/34b/qlora.yml index ec17bf200..2ec78f0d8 100644 --- a/examples/archived/code-llama/34b/qlora.yml +++ b/examples/archived/code-llama/34b/qlora.yml @@ -46,7 +46,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/code-llama/7b/lora.yml b/examples/archived/code-llama/7b/lora.yml index 174c17d2c..30bc63355 100644 --- a/examples/archived/code-llama/7b/lora.yml +++ b/examples/archived/code-llama/7b/lora.yml @@ -45,7 +45,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/code-llama/7b/qlora.yml b/examples/archived/code-llama/7b/qlora.yml index 08e67d8c2..0c3b38519 100644 --- a/examples/archived/code-llama/7b/qlora.yml +++ b/examples/archived/code-llama/7b/qlora.yml @@ -46,7 +46,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/dbrx/16bit-lora.yaml b/examples/archived/dbrx/16bit-lora.yaml index 05946dfe9..eca58f94c 100644 --- a/examples/archived/dbrx/16bit-lora.yaml +++ b/examples/archived/dbrx/16bit-lora.yaml @@ -52,7 +52,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/archived/dbrx/8bit-lora.yaml b/examples/archived/dbrx/8bit-lora.yaml index f159bf7fa..59f5241b4 100644 --- a/examples/archived/dbrx/8bit-lora.yaml +++ b/examples/archived/dbrx/8bit-lora.yaml @@ -55,7 +55,7 @@ 
gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/archived/dbrx/fft-ds-zero3.yaml b/examples/archived/dbrx/fft-ds-zero3.yaml index 13cd0d997..2cb3e6da1 100644 --- a/examples/archived/dbrx/fft-ds-zero3.yaml +++ b/examples/archived/dbrx/fft-ds-zero3.yaml @@ -39,7 +39,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml index 3223ec19a..b125e9e3f 100644 --- a/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml +++ b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml @@ -45,7 +45,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/archived/falcon/config-7b-lora.yml b/examples/archived/falcon/config-7b-lora.yml index f4fedbede..71dd572b3 100644 --- a/examples/archived/falcon/config-7b-lora.yml +++ b/examples/archived/falcon/config-7b-lora.yml @@ -43,8 +43,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -xformers_attention: true -flash_attention: +attn_implementation: xformers gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/falcon/config-7b-qlora.yml b/examples/archived/falcon/config-7b-qlora.yml index a44cc40a6..edd6550a7 100644 --- a/examples/archived/falcon/config-7b-qlora.yml +++ b/examples/archived/falcon/config-7b-qlora.yml @@ -73,8 +73,7 @@ early_stopping_patience: 3 resume_from_checkpoint: auto_resume_from_checkpoints: true logging_steps: 1 -xformers_attention: true -flash_attention: +attn_implementation: xformers gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/falcon/config-7b.yml b/examples/archived/falcon/config-7b.yml index 5481fb236..6da39d7ab 100644 --- a/examples/archived/falcon/config-7b.yml +++ b/examples/archived/falcon/config-7b.yml @@ -40,8 +40,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -xformers_attention: true -flash_attention: +attn_implementation: xformers gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/gemma/qlora.yml b/examples/archived/gemma/qlora.yml index 80829b3c9..5b5ec4a9f 100644 --- a/examples/archived/gemma/qlora.yml +++ b/examples/archived/gemma/qlora.yml @@ -47,7 +47,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/gptj/qlora.yml b/examples/archived/gptj/qlora.yml index 6348566c2..7e10adeaa 100644 --- a/examples/archived/gptj/qlora.yml +++ b/examples/archived/gptj/qlora.yml @@ -36,8 +36,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -xformers_attention: true -flash_attention: +attn_implementation: xformers gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/jeopardy-bot/config.yml b/examples/archived/jeopardy-bot/config.yml index ab1d19784..90ca3b4bc 100644 --- a/examples/archived/jeopardy-bot/config.yml +++ b/examples/archived/jeopardy-bot/config.yml @@ 
-37,8 +37,7 @@ bf16: auto tf32: true resume_from_checkpoint: logging_steps: 5 -xformers_attention: true -flash_attention: +attn_implementation: xformers gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/mpt-7b/config.yml b/examples/archived/mpt-7b/config.yml index 1fff51b6e..588981bf7 100644 --- a/examples/archived/mpt-7b/config.yml +++ b/examples/archived/mpt-7b/config.yml @@ -39,7 +39,6 @@ bf16: auto tf32: true resume_from_checkpoint: logging_steps: 5 -flash_attention: gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/openllama-3b/config.yml b/examples/archived/openllama-3b/config.yml index 63056ed6d..14104ff4b 100644 --- a/examples/archived/openllama-3b/config.yml +++ b/examples/archived/openllama-3b/config.yml @@ -39,7 +39,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/openllama-3b/lora.yml b/examples/archived/openllama-3b/lora.yml index b70821ce2..30d3888f1 100644 --- a/examples/archived/openllama-3b/lora.yml +++ b/examples/archived/openllama-3b/lora.yml @@ -47,7 +47,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/openllama-3b/qlora.yml b/examples/archived/openllama-3b/qlora.yml index a34f2964b..fc9d1d703 100644 --- a/examples/archived/openllama-3b/qlora.yml +++ b/examples/archived/openllama-3b/qlora.yml @@ -40,7 +40,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/qwen/lora.yml b/examples/archived/qwen/lora.yml index 29de25611..362a848a8 100644 --- a/examples/archived/qwen/lora.yml +++ b/examples/archived/qwen/lora.yml @@ -47,7 +47,6 @@ tf32: false gradient_checkpointing: false resume_from_checkpoint: logging_steps: 1 -flash_attention: warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/qwen/qlora.yml b/examples/archived/qwen/qlora.yml index d46669444..bce3012e7 100644 --- a/examples/archived/qwen/qlora.yml +++ b/examples/archived/qwen/qlora.yml @@ -47,7 +47,6 @@ tf32: false gradient_checkpointing: false resume_from_checkpoint: logging_steps: 1 -flash_attention: warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/qwen/qwen2-moe-lora.yaml b/examples/archived/qwen/qwen2-moe-lora.yaml index 1d5e1b524..97c0d51a6 100644 --- a/examples/archived/qwen/qwen2-moe-lora.yaml +++ b/examples/archived/qwen/qwen2-moe-lora.yaml @@ -43,7 +43,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/qwen/qwen2-moe-qlora.yaml b/examples/archived/qwen/qwen2-moe-qlora.yaml index 08731441b..a16089eed 100644 --- a/examples/archived/qwen/qwen2-moe-qlora.yaml +++ b/examples/archived/qwen/qwen2-moe-qlora.yaml @@ -46,7 +46,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/redpajama/config-3b.yml b/examples/archived/redpajama/config-3b.yml 
index c5b229c3d..676f31476 100644 --- a/examples/archived/redpajama/config-3b.yml +++ b/examples/archived/redpajama/config-3b.yml @@ -40,7 +40,6 @@ bf16: auto tf32: true resume_from_checkpoint: logging_steps: 5 -flash_attention: gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/replit-3b/config-lora.yml b/examples/archived/replit-3b/config-lora.yml index d8561762c..b0a0c9089 100644 --- a/examples/archived/replit-3b/config-lora.yml +++ b/examples/archived/replit-3b/config-lora.yml @@ -38,7 +38,6 @@ tf32: true gradient_checkpointing: resume_from_checkpoint: logging_steps: 1 -flash_attention: gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/stablelm-2/1.6b/fft.yml b/examples/archived/stablelm-2/1.6b/fft.yml index 585888f43..05f59544c 100644 --- a/examples/archived/stablelm-2/1.6b/fft.yml +++ b/examples/archived/stablelm-2/1.6b/fft.yml @@ -44,7 +44,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 flash_attn_cross_entropy: false flash_attn_rms_norm: true flash_attn_fuse_mlp: true diff --git a/examples/archived/stablelm-2/1.6b/lora.yml b/examples/archived/stablelm-2/1.6b/lora.yml index 6d358bdd8..1edb56e0c 100644 --- a/examples/archived/stablelm-2/1.6b/lora.yml +++ b/examples/archived/stablelm-2/1.6b/lora.yml @@ -47,7 +47,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 flash_attn_cross_entropy: false flash_attn_rms_norm: true diff --git a/examples/archived/starcoder2/qlora.yml b/examples/archived/starcoder2/qlora.yml index fecf98d23..0fd0f453c 100644 --- a/examples/archived/starcoder2/qlora.yml +++ b/examples/archived/starcoder2/qlora.yml @@ -46,7 +46,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/tiny-llama/lora-mps.yml b/examples/archived/tiny-llama/lora-mps.yml index 125090a78..bf3292c35 100644 --- a/examples/archived/tiny-llama/lora-mps.yml +++ b/examples/archived/tiny-llama/lora-mps.yml @@ -47,7 +47,6 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: false warmup_ratio: 0.1 evals_per_epoch: 0 diff --git a/examples/archived/tiny-llama/lora.yml b/examples/archived/tiny-llama/lora.yml index 817481e18..a12d63746 100644 --- a/examples/archived/tiny-llama/lora.yml +++ b/examples/archived/tiny-llama/lora.yml @@ -45,7 +45,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/tiny-llama/pretrain.yml b/examples/archived/tiny-llama/pretrain.yml index f15c6ce19..4d1686138 100644 --- a/examples/archived/tiny-llama/pretrain.yml +++ b/examples/archived/tiny-llama/pretrain.yml @@ -36,7 +36,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/archived/tiny-llama/qlora.yml b/examples/archived/tiny-llama/qlora.yml index d3ff59cb8..b1adcb2e6 100644 --- a/examples/archived/tiny-llama/qlora.yml +++ b/examples/archived/tiny-llama/qlora.yml @@ -47,7 +47,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: 
logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml b/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml index fc09a1e7b..d548032b9 100644 --- a/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml +++ b/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml @@ -71,8 +71,7 @@ early_stopping_patience: 3 resume_from_checkpoint: auto_resume_from_checkpoints: true logging_steps: 1 -xformers_attention: true -flash_attention: +attn_implementation: xformers gptq_groupsize: gptq_model_v1: warmup_ratio: 0.1 diff --git a/examples/archived/yi-34B-chat/qlora.yml b/examples/archived/yi-34B-chat/qlora.yml index ba8d12fc8..5d3d54dc6 100644 --- a/examples/archived/yi-34B-chat/qlora.yml +++ b/examples/archived/yi-34B-chat/qlora.yml @@ -10,7 +10,7 @@ load_in_4bit: true sequence_len: 1024 bf16: auto tf32: false -flash_attention: true +attn_implementation: flash_attention_2 special_tokens: bos_token: "<|startoftext|>" eos_token: "<|endoftext|>" diff --git a/examples/cohere/command-r-7b-qlora.yml b/examples/cohere/command-r-7b-qlora.yml index b4741636b..c4d03b0ec 100644 --- a/examples/cohere/command-r-7b-qlora.yml +++ b/examples/cohere/command-r-7b-qlora.yml @@ -48,7 +48,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml b/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml index 97d1bb6b3..c36b0e74a 100644 --- a/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml +++ b/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml @@ -45,7 +45,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml b/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml index b80cc5bc0..2b2aafd75 100644 --- a/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml +++ b/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml @@ -45,7 +45,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/deepseek-v2/fft-fsdp-16b.yaml b/examples/deepseek-v2/fft-fsdp-16b.yaml index 6e936da16..2eac9aea3 100644 --- a/examples/deepseek-v2/fft-fsdp-16b.yaml +++ b/examples/deepseek-v2/fft-fsdp-16b.yaml @@ -35,7 +35,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 2 diff --git a/examples/deepseek-v2/qlora-fsdp-2_5.yaml b/examples/deepseek-v2/qlora-fsdp-2_5.yaml index aab5034a0..0e23a0266 100644 --- a/examples/deepseek-v2/qlora-fsdp-2_5.yaml +++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml @@ -59,7 +59,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 2 diff --git a/examples/devstral/devstral-small-qlora.yml b/examples/devstral/devstral-small-qlora.yml index ca8e8e043..4a359a2e6 100644 --- a/examples/devstral/devstral-small-qlora.yml +++ b/examples/devstral/devstral-small-qlora.yml @@ -51,7 +51,7 @@ tf32: false 
gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 scaling_softmax: true loss_watchdog_threshold: 5.0 diff --git a/examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml b/examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml index f10dc9bd2..a99a6bef8 100644 --- a/examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml +++ b/examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml @@ -29,7 +29,7 @@ output_dir: ./outputs/ndp-out/ sequence_len: 2048 sample_packing: true -flash_attention: true +attn_implementation: flash_attention_2 gradient_accumulation_steps: 1 micro_batch_size: 1 diff --git a/examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml b/examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml index 584a33f44..a12b524ed 100644 --- a/examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml +++ b/examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml @@ -26,7 +26,7 @@ output_dir: ./outputs/ndp-out/ sequence_len: 8192 sample_packing: true -flash_attention: true +attn_implementation: flash_attention_2 gradient_accumulation_steps: 1 micro_batch_size: 1 # must be 1 when using context parallel diff --git a/examples/eaft/eaft-example.yml b/examples/eaft/eaft-example.yml index fed4179d2..b4b13a14c 100644 --- a/examples/eaft/eaft-example.yml +++ b/examples/eaft/eaft-example.yml @@ -65,8 +65,7 @@ early_stopping_patience: resume_from_checkpoint: local_rank: logging_steps: 1 -xformers_attention: -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 weight_decay: 0.0 diff --git a/examples/ebft/llama-1b-ebft-opencode-novllm.yaml b/examples/ebft/llama-1b-ebft-opencode-novllm.yaml index 0891033f0..7d7edad33 100644 --- a/examples/ebft/llama-1b-ebft-opencode-novllm.yaml +++ b/examples/ebft/llama-1b-ebft-opencode-novllm.yaml @@ -46,7 +46,7 @@ lora_dropout: 0.05 lora_target_linear: true bf16: auto -flash_attention: true +attn_implementation: flash_attention_2 gradient_checkpointing: true special_tokens: diff --git a/examples/ebft/llama-1b-ebft-opencode.yaml b/examples/ebft/llama-1b-ebft-opencode.yaml index d0d1069d8..c77c36677 100644 --- a/examples/ebft/llama-1b-ebft-opencode.yaml +++ b/examples/ebft/llama-1b-ebft-opencode.yaml @@ -66,7 +66,7 @@ lora_target_linear: true # --- Hardware --- bf16: auto -flash_attention: true +attn_implementation: flash_attention_2 gradient_checkpointing: true special_tokens: diff --git a/examples/ebft/llama-1b-ebft-strided-structured.yaml b/examples/ebft/llama-1b-ebft-strided-structured.yaml index 8ba63b64b..02e89dea0 100644 --- a/examples/ebft/llama-1b-ebft-strided-structured.yaml +++ b/examples/ebft/llama-1b-ebft-strided-structured.yaml @@ -47,8 +47,7 @@ lora_dropout: 0.05 lora_target_linear: true bf16: auto -flash_attention: false # strided EBFT overrides to flex_attention (or eager fallback) at runtime -flex_attention: true # fused flex_attention kernel compiles itself; don't set torch_compile: true +attn_implementation: flex_attention # (full-model compile conflicts with gradient checkpointing + flex_attention) gradient_checkpointing: true gradient_checkpointing_kwargs: diff --git a/examples/ebft/llama-1b-ebft-strided.yaml b/examples/ebft/llama-1b-ebft-strided.yaml index c9519f160..e3cfe8040 100644 --- a/examples/ebft/llama-1b-ebft-strided.yaml +++ b/examples/ebft/llama-1b-ebft-strided.yaml @@ -46,7 +46,6 @@ lora_dropout: 0.05 lora_target_linear: true bf16: auto -flash_attention: false # strided EBFT overrides to flex_attention (or eager fallback) at runtime 
gradient_checkpointing: true special_tokens: diff --git a/examples/ebft/llama-3b-ebft-strided-fft.yaml b/examples/ebft/llama-3b-ebft-strided-fft.yaml index 5695efa40..e39d3bcfa 100644 --- a/examples/ebft/llama-3b-ebft-strided-fft.yaml +++ b/examples/ebft/llama-3b-ebft-strided-fft.yaml @@ -48,7 +48,6 @@ lora_target_linear: true bf16: auto torch_dtype: bfloat16 -flash_attention: false gradient_checkpointing: true torch_compile: true gradient_checkpointing_kwargs: diff --git a/examples/ebft/llama-8b-ebft-strided-fft.yaml b/examples/ebft/llama-8b-ebft-strided-fft.yaml index 8cf962849..caed98085 100644 --- a/examples/ebft/llama-8b-ebft-strided-fft.yaml +++ b/examples/ebft/llama-8b-ebft-strided-fft.yaml @@ -41,7 +41,6 @@ warmup_steps: 10 weight_decay: 0.01 bf16: auto -flash_attention: false # strided EBFT uses flex_attention at runtime gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false diff --git a/examples/ebft/qwen35-4b-ebft-structured-async.yaml b/examples/ebft/qwen35-4b-ebft-structured-async.yaml index 759a31730..daa77d6f6 100644 --- a/examples/ebft/qwen35-4b-ebft-structured-async.yaml +++ b/examples/ebft/qwen35-4b-ebft-structured-async.yaml @@ -72,7 +72,7 @@ lora_dropout: 0.0 lora_target_modules: ".*\\.layers\\.(3|7|11|15|19|23|27|31)\\.self_attn\\.(q|k|v|o)_proj|.*\\.mlp\\.(gate|up|down)_proj" bf16: auto -flash_attention: true +attn_implementation: flash_attention_2 gradient_checkpointing: true special_tokens: diff --git a/examples/ebft/qwen35-4b-ebft-structured.yaml b/examples/ebft/qwen35-4b-ebft-structured.yaml index 9108e87e9..d1b2a72f2 100644 --- a/examples/ebft/qwen35-4b-ebft-structured.yaml +++ b/examples/ebft/qwen35-4b-ebft-structured.yaml @@ -63,7 +63,7 @@ lora_dropout: 0.0 lora_target_modules: ".*\\.layers\\.(3|7|11|15|19|23|27|31)\\.self_attn\\.(q|k|v|o)_proj|.*\\.mlp\\.(gate|up|down)_proj" bf16: auto -flash_attention: true +attn_implementation: flash_attention_2 gradient_checkpointing: true special_tokens: diff --git a/examples/ebft/qwen35-9b-ebft-structured.yaml b/examples/ebft/qwen35-9b-ebft-structured.yaml index e79fb5fbf..ad3b8538e 100644 --- a/examples/ebft/qwen35-9b-ebft-structured.yaml +++ b/examples/ebft/qwen35-9b-ebft-structured.yaml @@ -68,7 +68,7 @@ lora_dropout: 0.0 lora_target_modules: ".*\\.layers\\.(3|7|11|15|19|23|27|31)\\.self_attn\\.(q|k|v|o)_proj|.*\\.mlp\\.(gate|up|down)_proj" bf16: auto -flash_attention: true +attn_implementation: flash_attention_2 gradient_checkpointing: true special_tokens: diff --git a/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml b/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml index 2473179f0..f59f0df5c 100644 --- a/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml +++ b/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml @@ -62,7 +62,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/falcon-h1/falcon-h1-1b-qlora.yaml b/examples/falcon-h1/falcon-h1-1b-qlora.yaml index bfb7836ef..8c3eb080d 100644 --- a/examples/falcon-h1/falcon-h1-1b-qlora.yaml +++ b/examples/falcon-h1/falcon-h1-1b-qlora.yaml @@ -61,7 +61,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/falcon-h1/falcon-h1-34b-qlora.yaml b/examples/falcon-h1/falcon-h1-34b-qlora.yaml index 80a9d45b5..28e7de956 100644 --- 
a/examples/falcon-h1/falcon-h1-34b-qlora.yaml +++ b/examples/falcon-h1/falcon-h1-34b-qlora.yaml @@ -62,7 +62,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/falcon-h1/falcon-h1-3b-qlora.yaml b/examples/falcon-h1/falcon-h1-3b-qlora.yaml index 02be8ac5d..71b38e2f7 100644 --- a/examples/falcon-h1/falcon-h1-3b-qlora.yaml +++ b/examples/falcon-h1/falcon-h1-3b-qlora.yaml @@ -62,7 +62,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/falcon-h1/falcon-h1-500m-qlora.yaml b/examples/falcon-h1/falcon-h1-500m-qlora.yaml index b112d5d85..91602ae71 100644 --- a/examples/falcon-h1/falcon-h1-500m-qlora.yaml +++ b/examples/falcon-h1/falcon-h1-500m-qlora.yaml @@ -62,7 +62,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/falcon-h1/falcon-h1-7b-qlora.yaml b/examples/falcon-h1/falcon-h1-7b-qlora.yaml index c5505873d..cc7e8f6cd 100644 --- a/examples/falcon-h1/falcon-h1-7b-qlora.yaml +++ b/examples/falcon-h1/falcon-h1-7b-qlora.yaml @@ -62,7 +62,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/gemma2/qlora.yml b/examples/gemma2/qlora.yml index 8a295a1f8..b2fca74da 100644 --- a/examples/gemma2/qlora.yml +++ b/examples/gemma2/qlora.yml @@ -53,7 +53,7 @@ tf32: true gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/gemma2/reward-model.yaml b/examples/gemma2/reward-model.yaml index 67b1228b2..f48bff626 100644 --- a/examples/gemma2/reward-model.yaml +++ b/examples/gemma2/reward-model.yaml @@ -43,7 +43,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/gemma3/gemma-3-1b-qlora.yml b/examples/gemma3/gemma-3-1b-qlora.yml index 4bcbf09f4..95b99a0da 100644 --- a/examples/gemma3/gemma-3-1b-qlora.yml +++ b/examples/gemma3/gemma-3-1b-qlora.yml @@ -62,7 +62,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/gemma3/gemma-3-270m-qlora.yml b/examples/gemma3/gemma-3-270m-qlora.yml index 1f247ab05..800a88a1b 100644 --- a/examples/gemma3/gemma-3-270m-qlora.yml +++ b/examples/gemma3/gemma-3-270m-qlora.yml @@ -62,7 +62,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/gemma3/gemma-3-4b-qlora.yml b/examples/gemma3/gemma-3-4b-qlora.yml index 5d939da19..e7c43ddef 100644 --- a/examples/gemma3/gemma-3-4b-qlora.yml +++ b/examples/gemma3/gemma-3-4b-qlora.yml @@ -58,8 +58,7 @@ gradient_checkpointing: true gradient_checkpointing_kwargs: 
use_reentrant: false logging_steps: 1 -flash_attention: true -eager_attention: +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/gemma3/gemma-3-4b-vision-qlora.yml b/examples/gemma3/gemma-3-4b-vision-qlora.yml index a12e84bee..790d9543a 100644 --- a/examples/gemma3/gemma-3-4b-vision-qlora.yml +++ b/examples/gemma3/gemma-3-4b-vision-qlora.yml @@ -55,8 +55,7 @@ gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false logging_steps: 1 -flash_attention: true -eager_attention: +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/gemma4/26b-a4b-moe-qlora.yaml b/examples/gemma4/26b-a4b-moe-qlora.yaml index e7bdb6f46..cdc70ef4a 100644 --- a/examples/gemma4/26b-a4b-moe-qlora.yaml +++ b/examples/gemma4/26b-a4b-moe-qlora.yaml @@ -84,7 +84,7 @@ activation_offloading: true logging_steps: 1 # FA2 not supported -sdp_attention: true +attn_implementation: sdpa warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/gemma4/31b-qlora-flex.yaml b/examples/gemma4/31b-qlora-flex.yaml index 8456c9c13..87221c515 100644 --- a/examples/gemma4/31b-qlora-flex.yaml +++ b/examples/gemma4/31b-qlora-flex.yaml @@ -62,7 +62,7 @@ activation_offloading: true logging_steps: 1 # FA not supported -flex_attention: true +attn_implementation: flex_attention warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/gemma4/31b-qlora.yaml b/examples/gemma4/31b-qlora.yaml index 42086a43c..4a633436e 100644 --- a/examples/gemma4/31b-qlora.yaml +++ b/examples/gemma4/31b-qlora.yaml @@ -60,7 +60,7 @@ activation_offloading: true logging_steps: 1 # FA not supported -sdp_attention: true +attn_implementation: sdpa warmup_ratio: 0.1 evals_per_epoch: 4 diff --git a/examples/gemma4/e2b-vision-lora.yaml b/examples/gemma4/e2b-vision-lora.yaml index c779aaea5..ae90bc1cb 100644 --- a/examples/gemma4/e2b-vision-lora.yaml +++ b/examples/gemma4/e2b-vision-lora.yaml @@ -50,7 +50,7 @@ gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false logging_steps: 1 -sdp_attention: true +attn_implementation: sdpa warmup_ratio: 0.1 weight_decay: 0.0 diff --git a/examples/glm4/qlora-32b.yaml b/examples/glm4/qlora-32b.yaml index 832abde05..151820924 100644 --- a/examples/glm4/qlora-32b.yaml +++ b/examples/glm4/qlora-32b.yaml @@ -50,7 +50,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 diff --git a/examples/glm45/glm-45-air-qlora.yaml b/examples/glm45/glm-45-air-qlora.yaml index accb8898f..5723d3c45 100644 --- a/examples/glm45/glm-45-air-qlora.yaml +++ b/examples/glm45/glm-45-air-qlora.yaml @@ -55,7 +55,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/glm46v/glm-4-6v-flash-ddp.yaml b/examples/glm46v/glm-4-6v-flash-ddp.yaml index c67ac5e28..274f041a3 100644 --- a/examples/glm46v/glm-4-6v-flash-ddp.yaml +++ b/examples/glm46v/glm-4-6v-flash-ddp.yaml @@ -45,7 +45,7 @@ gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false logging_steps: 1 -sdp_attention: true +attn_implementation: sdpa warmup_ratio: 0.1 evals_per_epoch: 0 diff --git a/examples/glm46v/glm-4-6v-flash-qlora.yaml b/examples/glm46v/glm-4-6v-flash-qlora.yaml index 287944ae8..9fe8d6e43 100644 --- 
a/examples/glm46v/glm-4-6v-flash-qlora.yaml +++ b/examples/glm46v/glm-4-6v-flash-qlora.yaml @@ -42,7 +42,7 @@ tf32: false gradient_checkpointing: true logging_steps: 1 -sdp_attention: true +attn_implementation: sdpa warmup_ratio: 0.1 evals_per_epoch: 0 diff --git a/examples/glm47-flash/lora.yaml b/examples/glm47-flash/lora.yaml index 2586babb7..5f3de36e9 100644 --- a/examples/glm47-flash/lora.yaml +++ b/examples/glm47-flash/lora.yaml @@ -58,7 +58,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/glm47-flash/lora_fsdp.yaml b/examples/glm47-flash/lora_fsdp.yaml index bee20bf02..cf1d2de55 100644 --- a/examples/glm47-flash/lora_fsdp.yaml +++ b/examples/glm47-flash/lora_fsdp.yaml @@ -57,7 +57,7 @@ tf32: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/glm47-flash/qlora.yaml b/examples/glm47-flash/qlora.yaml index 834c46af8..a05bf54d2 100644 --- a/examples/glm47-flash/qlora.yaml +++ b/examples/glm47-flash/qlora.yaml @@ -58,7 +58,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/glm47-flash/qlora_fsdp.yaml b/examples/glm47-flash/qlora_fsdp.yaml index 0bb87813f..9ad5a6212 100644 --- a/examples/glm47-flash/qlora_fsdp.yaml +++ b/examples/glm47-flash/qlora_fsdp.yaml @@ -57,7 +57,7 @@ tf32: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml b/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml index b7082f986..512784e50 100644 --- a/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml +++ b/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml @@ -47,7 +47,7 @@ learning_rate: 2e-5 bf16: true tf32: true -flash_attention: true +attn_implementation: flash_attention_2 attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true diff --git a/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml b/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml index b718ff2eb..e36cd5192 100644 --- a/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml +++ b/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml @@ -43,7 +43,7 @@ learning_rate: 2e-5 bf16: true tf32: true -flash_attention: true +attn_implementation: flash_attention_2 attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true diff --git a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml index af1c93bc0..cd85460d8 100644 --- a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml +++ b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml @@ -44,7 +44,7 @@ learning_rate: 2e-5 bf16: true tf32: true -flash_attention: true +attn_implementation: flash_attention_2 attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true diff --git a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml index 894ba99b8..2ebfd1a80 100644 --- 
a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml +++ b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml @@ -43,7 +43,7 @@ learning_rate: 2e-5 bf16: true tf32: true -flash_attention: true +attn_implementation: flash_attention_2 attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true diff --git a/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml b/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml index 7c4f97846..dd632e4a0 100644 --- a/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml +++ b/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml @@ -56,7 +56,7 @@ learning_rate: 2e-4 bf16: true tf32: true -flash_attention: true +attn_implementation: flash_attention_2 attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true diff --git a/examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml b/examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml index cbb9efc8e..d57f9501d 100644 --- a/examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml +++ b/examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml @@ -56,7 +56,7 @@ learning_rate: 2e-4 bf16: true tf32: true -flash_attention: true +attn_implementation: flash_attention_2 attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true diff --git a/examples/granite4/granite-4.0-tiny-fft.yaml b/examples/granite4/granite-4.0-tiny-fft.yaml index 7ff8207ae..fd7d2a312 100644 --- a/examples/granite4/granite-4.0-tiny-fft.yaml +++ b/examples/granite4/granite-4.0-tiny-fft.yaml @@ -36,7 +36,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/hunyuan/hunyuan-v1-dense-qlora.yaml b/examples/hunyuan/hunyuan-v1-dense-qlora.yaml index a94345a61..1ae6b000d 100644 --- a/examples/hunyuan/hunyuan-v1-dense-qlora.yaml +++ b/examples/hunyuan/hunyuan-v1-dense-qlora.yaml @@ -55,7 +55,7 @@ tf32: false gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/internvl3_5/internvl3_5-8b-qlora.yml b/examples/internvl3_5/internvl3_5-8b-qlora.yml index 9a72d078a..2d924c6f1 100644 --- a/examples/internvl3_5/internvl3_5-8b-qlora.yml +++ b/examples/internvl3_5/internvl3_5-8b-qlora.yml @@ -50,8 +50,7 @@ tf32: true gradient_checkpointing: true logging_steps: 1 -flash_attention: true -eager_attention: +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/jamba/qlora.yaml b/examples/jamba/qlora.yaml index 538ed3a10..f625fb6f5 100644 --- a/examples/jamba/qlora.yaml +++ b/examples/jamba/qlora.yaml @@ -47,7 +47,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 evals_per_epoch: diff --git a/examples/jamba/qlora_deepspeed.yaml b/examples/jamba/qlora_deepspeed.yaml index b288635e7..8ec74f905 100644 --- a/examples/jamba/qlora_deepspeed.yaml +++ b/examples/jamba/qlora_deepspeed.yaml @@ -46,7 +46,7 @@ gradient_checkpointing_kwargs: use_reentrant: false resume_from_checkpoint: logging_steps: 1 -flash_attention: true +attn_implementation: flash_attention_2 warmup_ratio: 0.1 
 evals_per_epoch:
diff --git a/examples/jamba/qlora_fsdp_large.yaml b/examples/jamba/qlora_fsdp_large.yaml
index 4db889fbc..76cc0ef18 100644
--- a/examples/jamba/qlora_fsdp_large.yaml
+++ b/examples/jamba/qlora_fsdp_large.yaml
@@ -44,7 +44,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: true
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/kimi-linear/kimi-48b-lora.yaml b/examples/kimi-linear/kimi-48b-lora.yaml
index 8e855dd72..befa29891 100644
--- a/examples/kimi-linear/kimi-48b-lora.yaml
+++ b/examples/kimi-linear/kimi-48b-lora.yaml
@@ -65,7 +65,7 @@ early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/llama-2/fft_optimized.yml b/examples/llama-2/fft_optimized.yml
index ea119348e..7af25dd17 100644
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -42,7 +42,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 flash_attn_fuse_mlp: true
diff --git a/examples/llama-2/gptq-lora.yml b/examples/llama-2/gptq-lora.yml
index de1caaa05..c4073b80a 100644
--- a/examples/llama-2/gptq-lora.yml
+++ b/examples/llama-2/gptq-lora.yml
@@ -53,8 +53,6 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention:
-sdp_attention:
 flash_optimum:
 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-2/lisa.yml b/examples/llama-2/lisa.yml
index d21c01a49..40ba6d0d0 100644
--- a/examples/llama-2/lisa.yml
+++ b/examples/llama-2/lisa.yml
@@ -46,7 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 flash_attn_fuse_mlp: true
diff --git a/examples/llama-2/loftq.yml b/examples/llama-2/loftq.yml
index 619e5bcce..f1562ec29 100644
--- a/examples/llama-2/loftq.yml
+++ b/examples/llama-2/loftq.yml
@@ -45,7 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-2/lora.yml b/examples/llama-2/lora.yml
index 0a677f11a..8c2242b71 100644
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -45,7 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-2/qlora-fsdp.yml b/examples/llama-2/qlora-fsdp.yml
index 1e7064de8..102eb7af7 100644
--- a/examples/llama-2/qlora-fsdp.yml
+++ b/examples/llama-2/qlora-fsdp.yml
@@ -48,7 +48,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-2/qlora.yml b/examples/llama-2/qlora.yml
index 327d88c15..87e710792 100644
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -46,7 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-2/relora.yml b/examples/llama-2/relora.yml
index fabdf0e0f..8e3df58bf 100644
--- a/examples/llama-2/relora.yml
+++ b/examples/llama-2/relora.yml
@@ -51,7 +51,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3-vision/lora-11b.yaml b/examples/llama-3-vision/lora-11b.yaml
index adbb61643..4e5eb4c4e 100644
--- a/examples/llama-3-vision/lora-11b.yaml
+++ b/examples/llama-3-vision/lora-11b.yaml
@@ -50,7 +50,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1
 # flash_attention: true # use for text-only mode
-sdp_attention: true
+attn_implementation: sdpa

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/llama-3/3b-fp8-fsdp2.yaml b/examples/llama-3/3b-fp8-fsdp2.yaml
index 57b308abd..cfc15870f 100644
--- a/examples/llama-3/3b-fp8-fsdp2.yaml
+++ b/examples/llama-3/3b-fp8-fsdp2.yaml
@@ -25,7 +25,7 @@ sample_packing: true
 pad_to_sequence_len: true
 sequence_len: 512

-flex_attention: true
+attn_implementation: flex_attention
 flex_attn_compile_kwargs:
   dynamic: false
   mode: max-autotune-no-cudagraphs
diff --git a/examples/llama-3/3b-qat-fsdp2.yaml b/examples/llama-3/3b-qat-fsdp2.yaml
index 0c5a87891..99c975351 100644
--- a/examples/llama-3/3b-qat-fsdp2.yaml
+++ b/examples/llama-3/3b-qat-fsdp2.yaml
@@ -26,7 +26,7 @@ dataset_prepared_path: ./outputs/qat_out/dataset_prepared
 sample_packing: false
 sequence_len: 8192

-flash_attention: true
+attn_implementation: flash_attention_2

 qat:
   activation_dtype: int8
diff --git a/examples/llama-3/3b-qat-mxfp4.yaml b/examples/llama-3/3b-qat-mxfp4.yaml
index 7ae941e9e..4e9f64685 100644
--- a/examples/llama-3/3b-qat-mxfp4.yaml
+++ b/examples/llama-3/3b-qat-mxfp4.yaml
@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out/
 dataset_prepared_path: ./outputs/dataset_prepared
 sequence_len: 2048

-flash_attention: true
+attn_implementation: flash_attention_2

 qat:
   activation_dtype: mxfp4
diff --git a/examples/llama-3/3b-qat-nvfp4.yaml b/examples/llama-3/3b-qat-nvfp4.yaml
index 1ec809bbe..77cf2b19b 100644
--- a/examples/llama-3/3b-qat-nvfp4.yaml
+++ b/examples/llama-3/3b-qat-nvfp4.yaml
@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out/
 dataset_prepared_path: ./outputs/dataset_prepared
 sequence_len: 8192

-flash_attention: true
+attn_implementation: flash_attention_2

 qat:
   activation_dtype: nvfp4
diff --git a/examples/llama-3/diffusion/pretrain-1b.yaml b/examples/llama-3/diffusion/pretrain-1b.yaml
index 8d05e4c60..1b488db7a 100644
--- a/examples/llama-3/diffusion/pretrain-1b.yaml
+++ b/examples/llama-3/diffusion/pretrain-1b.yaml
@@ -35,7 +35,7 @@ warmup_ratio: 0.1
 optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 3e-4
-sdp_attention: true
+attn_implementation: sdpa

 bf16: auto
 tf32: true
diff --git a/examples/llama-3/diffusion/sft-1b.yaml b/examples/llama-3/diffusion/sft-1b.yaml
index f3b29a809..b6de76af3 100644
--- a/examples/llama-3/diffusion/sft-1b.yaml
+++ b/examples/llama-3/diffusion/sft-1b.yaml
@@ -41,7 +41,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:

-sdp_attention: true
+attn_implementation: sdpa

 logging_steps: 1
 save_strategy: best
diff --git a/examples/llama-3/fft-8b-liger-fsdp.yaml b/examples/llama-3/fft-8b-liger-fsdp.yaml
index a655b97a9..b96bc920e 100644
--- a/examples/llama-3/fft-8b-liger-fsdp.yaml
+++ b/examples/llama-3/fft-8b-liger-fsdp.yaml
@@ -49,7 +49,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 2
diff --git a/examples/llama-3/fft-8b.yaml b/examples/llama-3/fft-8b.yaml
index c72ec6662..3e2809196 100644
--- a/examples/llama-3/fft-8b.yaml
+++ b/examples/llama-3/fft-8b.yaml
@@ -34,7 +34,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 2
diff --git a/examples/llama-3/instruct-dpo-lora-8b.yml b/examples/llama-3/instruct-dpo-lora-8b.yml
index cf823353b..b49ace2ed 100644
--- a/examples/llama-3/instruct-dpo-lora-8b.yml
+++ b/examples/llama-3/instruct-dpo-lora-8b.yml
@@ -65,7 +65,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/instruct-lora-8b.yml b/examples/llama-3/instruct-lora-8b.yml
index 401df1d72..1c61ce9e4 100644
--- a/examples/llama-3/instruct-lora-8b.yml
+++ b/examples/llama-3/instruct-lora-8b.yml
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/lora-1b-deduplicate-dpo.yml b/examples/llama-3/lora-1b-deduplicate-dpo.yml
index 2897636f4..2be72c4d0 100644
--- a/examples/llama-3/lora-1b-deduplicate-dpo.yml
+++ b/examples/llama-3/lora-1b-deduplicate-dpo.yml
@@ -77,7 +77,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/lora-1b-deduplicate-sft.yml b/examples/llama-3/lora-1b-deduplicate-sft.yml
index c5190d892..ad21cb266 100644
--- a/examples/llama-3/lora-1b-deduplicate-sft.yml
+++ b/examples/llama-3/lora-1b-deduplicate-sft.yml
@@ -53,7 +53,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/lora-1b-kernels.yml b/examples/llama-3/lora-1b-kernels.yml
index 0bcf46b17..b0914f87a 100644
--- a/examples/llama-3/lora-1b-kernels.yml
+++ b/examples/llama-3/lora-1b-kernels.yml
@@ -54,7 +54,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/llama-3/lora-1b-ray.yml b/examples/llama-3/lora-1b-ray.yml
index 46c83348e..a3aa1cf5e 100644
--- a/examples/llama-3/lora-1b-ray.yml
+++ b/examples/llama-3/lora-1b-ray.yml
@@ -48,7 +48,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/llama-3/lora-1b-sample-packing-sequentially.yml b/examples/llama-3/lora-1b-sample-packing-sequentially.yml
index dba78597b..f6c24bc74 100644
--- a/examples/llama-3/lora-1b-sample-packing-sequentially.yml
+++ b/examples/llama-3/lora-1b-sample-packing-sequentially.yml
@@ -55,7 +55,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/lora-1b.yml b/examples/llama-3/lora-1b.yml
index 2ae2f0056..d01c618bc 100644
--- a/examples/llama-3/lora-1b.yml
+++ b/examples/llama-3/lora-1b.yml
@@ -49,7 +49,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/llama-3/lora-8b.yml b/examples/llama-3/lora-8b.yml
index d72b6527d..90084ec95 100644
--- a/examples/llama-3/lora-8b.yml
+++ b/examples/llama-3/lora-8b.yml
@@ -49,7 +49,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/opentelemetry-qlora.yml b/examples/llama-3/opentelemetry-qlora.yml
index d8ce7b1ec..0c9995dae 100644
--- a/examples/llama-3/opentelemetry-qlora.yml
+++ b/examples/llama-3/opentelemetry-qlora.yml
@@ -39,7 +39,6 @@ tf32: false
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: false

 warmup_ratio: 0.1
 evals_per_epoch: 2
diff --git a/examples/llama-3/qlora-1b-gdpo.yaml b/examples/llama-3/qlora-1b-gdpo.yaml
index d806fcf26..f754a6887 100644
--- a/examples/llama-3/qlora-1b-gdpo.yaml
+++ b/examples/llama-3/qlora-1b-gdpo.yaml
@@ -56,7 +56,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false

-flash_attention: true
+attn_implementation: flash_attention_2
 logging_steps: 1
 save_steps: 50
 save_safetensors: true
diff --git a/examples/llama-3/qlora-1b-kto.yaml b/examples/llama-3/qlora-1b-kto.yaml
index a6a84e7b1..18c240d97 100644
--- a/examples/llama-3/qlora-1b-kto.yaml
+++ b/examples/llama-3/qlora-1b-kto.yaml
@@ -53,7 +53,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/qlora-1b.yml b/examples/llama-3/qlora-1b.yml
index 1e4f97438..d1e5e18ae 100644
--- a/examples/llama-3/qlora-1b.yml
+++ b/examples/llama-3/qlora-1b.yml
@@ -51,7 +51,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/llama-3/qlora-fsdp-405b.yaml b/examples/llama-3/qlora-fsdp-405b.yaml
index 5c236f2cf..b801af845 100644
--- a/examples/llama-3/qlora-fsdp-405b.yaml
+++ b/examples/llama-3/qlora-fsdp-405b.yaml
@@ -38,7 +38,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: true
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/qlora-fsdp-70b.yaml b/examples/llama-3/qlora-fsdp-70b.yaml
index c052bc19d..5ce774e18 100644
--- a/examples/llama-3/qlora-fsdp-70b.yaml
+++ b/examples/llama-3/qlora-fsdp-70b.yaml
@@ -48,7 +48,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/qlora.yml b/examples/llama-3/qlora.yml
index a8f47a0e2..fad507cd9 100644
--- a/examples/llama-3/qlora.yml
+++ b/examples/llama-3/qlora.yml
@@ -46,7 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/llama-3/sparse-finetuning.yaml b/examples/llama-3/sparse-finetuning.yaml
index 348756b70..0ce4aa03d 100644
--- a/examples/llama-3/sparse-finetuning.yaml
+++ b/examples/llama-3/sparse-finetuning.yaml
@@ -44,8 +44,7 @@ gradient_checkpointing_kwargs:
 early_stopping_patience:
 resume_from_checkpoint:
 logging_steps: 1
-xformers_attention:
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 2
diff --git a/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml b/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml
index b20f79758..2c701a2aa 100644
--- a/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml
+++ b/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml
@@ -60,7 +60,7 @@ bf16: true
 tf32: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 gradient_checkpointing: offload
 gradient_checkpointing_kwargs:
diff --git a/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml b/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml
index 40449009c..8197d1629 100644
--- a/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml
+++ b/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml
@@ -67,7 +67,7 @@ bf16: true
 tf32: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml b/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml
index abdc51378..2dcff36cd 100644
--- a/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml
+++ b/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml
@@ -70,7 +70,7 @@ bf16: true
 tf32: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 gradient_checkpointing: offload
 gradient_checkpointing_kwargs:
diff --git a/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml b/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml
index 4136dc14a..de7ae5f50 100644
--- a/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml
+++ b/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml
@@ -62,7 +62,7 @@ bf16: true
 tf32: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml b/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml
index 02c04c691..c5343fa2e 100644
--- a/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml
+++ b/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml
@@ -59,7 +59,7 @@ bf16: true
 tf32: true
 logging_steps: 1

-flex_attention: true
+attn_implementation: flex_attention
 flex_attn_compile_kwargs:
   dynamic: false
   mode: max-autotune-no-cudagraphs
diff --git a/examples/llama-4/scout-qlora-single-h100-flex.yaml b/examples/llama-4/scout-qlora-single-h100-flex.yaml
index 33a691189..00491c3b1 100644
--- a/examples/llama-4/scout-qlora-single-h100-flex.yaml
+++ b/examples/llama-4/scout-qlora-single-h100-flex.yaml
@@ -64,7 +64,7 @@ bf16: true
 tf32: true
 torch_compile: true

-flex_attention: true
+attn_implementation: flex_attention
 flex_attn_compile_kwargs:
   dynamic: false
   mode: max-autotune-no-cudagraphs
diff --git a/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml b/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml
index 5972c2ae3..9b3e089b5 100644
--- a/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml
+++ b/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml
@@ -61,7 +61,7 @@ bf16: true
 tf32: true
 logging_steps: 1

-flex_attention: true
+attn_implementation: flex_attention
 flex_attn_compile_kwargs:
   dynamic: false
   mode: max-autotune-no-cudagraphs
diff --git a/examples/llava/lora-7b.yaml b/examples/llava/lora-7b.yaml
index 77ef7474d..56b48fda9 100644
--- a/examples/llava/lora-7b.yaml
+++ b/examples/llava/lora-7b.yaml
@@ -45,8 +45,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
-eager_attention:
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/magistral/magistral-small-fsdp-qlora.yaml b/examples/magistral/magistral-small-fsdp-qlora.yaml
index d46c49fe0..f31ca7326 100644
--- a/examples/magistral/magistral-small-fsdp-qlora.yaml
+++ b/examples/magistral/magistral-small-fsdp-qlora.yaml
@@ -59,7 +59,7 @@ tf32: false
 gradient_checkpointing:
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/magistral/magistral-small-qlora.yaml b/examples/magistral/magistral-small-qlora.yaml
index 188924d39..90f6b6f91 100644
--- a/examples/magistral/magistral-small-qlora.yaml
+++ b/examples/magistral/magistral-small-qlora.yaml
@@ -58,7 +58,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/magistral/think/magistral-small-think-qlora.yaml b/examples/magistral/think/magistral-small-think-qlora.yaml
index b715b3156..85abe18da 100644
--- a/examples/magistral/think/magistral-small-think-qlora.yaml
+++ b/examples/magistral/think/magistral-small-think-qlora.yaml
@@ -58,7 +58,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/magistral/vision/magistral-small-vision-24B-qlora.yml b/examples/magistral/vision/magistral-small-vision-24B-qlora.yml
index 397db383e..abd244647 100644
--- a/examples/magistral/vision/magistral-small-vision-24B-qlora.yml
+++ b/examples/magistral/vision/magistral-small-vision-24B-qlora.yml
@@ -53,7 +53,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/mamba/config.yml b/examples/mamba/config.yml
index 5f36595a3..0c39768d8 100644
--- a/examples/mamba/config.yml
+++ b/examples/mamba/config.yml
@@ -39,7 +39,6 @@ tf32: true
 gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/mimo/mimo-7b-qlora.yaml b/examples/mimo/mimo-7b-qlora.yaml
index 689213bcd..7ced584e1 100644
--- a/examples/mimo/mimo-7b-qlora.yaml
+++ b/examples/mimo/mimo-7b-qlora.yaml
@@ -58,7 +58,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/ministral/ministral-small-qlora.yaml b/examples/ministral/ministral-small-qlora.yaml
index 0d5300ef6..4c3bdfe94 100644
--- a/examples/ministral/ministral-small-qlora.yaml
+++ b/examples/ministral/ministral-small-qlora.yaml
@@ -58,7 +58,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/ministral3/ministral3-3b-qlora.yaml b/examples/ministral3/ministral3-3b-qlora.yaml
index b369c9d41..49eec882f 100644
--- a/examples/ministral3/ministral3-3b-qlora.yaml
+++ b/examples/ministral3/ministral3-3b-qlora.yaml
@@ -58,7 +58,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
 scaling_softmax: true

 warmup_ratio: 0.1
diff --git a/examples/ministral3/think/ministral3-3b-think-qlora.yaml b/examples/ministral3/think/ministral3-3b-think-qlora.yaml
index 987c0bd54..508575cac 100644
--- a/examples/ministral3/think/ministral3-3b-think-qlora.yaml
+++ b/examples/ministral3/think/ministral3-3b-think-qlora.yaml
@@ -58,7 +58,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/ministral3/vision/ministral3-3b-vision-qlora.yml b/examples/ministral3/vision/ministral3-3b-vision-qlora.yml
index 0a0fdce4a..f1430ba53 100644
--- a/examples/ministral3/vision/ministral3-3b-vision-qlora.yml
+++ b/examples/ministral3/vision/ministral3-3b-vision-qlora.yml
@@ -53,7 +53,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/mistral-small/mistral-small-3.1-24B-lora.yml b/examples/mistral-small/mistral-small-3.1-24B-lora.yml
index d45d13ac6..4d3f78a13 100644
--- a/examples/mistral-small/mistral-small-3.1-24B-lora.yml
+++ b/examples/mistral-small/mistral-small-3.1-24B-lora.yml
@@ -51,7 +51,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/mistral/bigstral/bigstral-ds-zero3.yaml b/examples/mistral/bigstral/bigstral-ds-zero3.yaml
index a8dc36216..4648ae4b4 100644
--- a/examples/mistral/bigstral/bigstral-ds-zero3.yaml
+++ b/examples/mistral/bigstral/bigstral-ds-zero3.yaml
@@ -42,7 +42,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 save_total_limit: 1
 save_steps:
diff --git a/examples/mistral/config.yml b/examples/mistral/config.yml
index e74162537..aa1066733 100644
--- a/examples/mistral/config.yml
+++ b/examples/mistral/config.yml
@@ -36,7 +36,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/mistral/dpo/mistral-dpo-qlora.yml b/examples/mistral/dpo/mistral-dpo-qlora.yml
index 8fea14a0f..604eada74 100644
--- a/examples/mistral/dpo/mistral-dpo-qlora.yml
+++ b/examples/mistral/dpo/mistral-dpo-qlora.yml
@@ -71,7 +71,6 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: false

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/mistral/lora.yml b/examples/mistral/lora.yml
index 757287f19..b157fcc21 100644
--- a/examples/mistral/lora.yml
+++ b/examples/mistral/lora.yml
@@ -54,7 +54,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/mistral/mistral-qlora-fsdp.yml b/examples/mistral/mistral-qlora-fsdp.yml
index 8e1f03d24..27d8be3cd 100644
--- a/examples/mistral/mistral-qlora-fsdp.yml
+++ b/examples/mistral/mistral-qlora-fsdp.yml
@@ -51,7 +51,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/mistral/mixtral/mixtral-8x22b-qlora-fsdp.yml b/examples/mistral/mixtral/mixtral-8x22b-qlora-fsdp.yml
index dc7bd9c37..1b66de8f0 100644
--- a/examples/mistral/mixtral/mixtral-8x22b-qlora-fsdp.yml
+++ b/examples/mistral/mixtral/mixtral-8x22b-qlora-fsdp.yml
@@ -49,7 +49,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/mistral/mixtral/mixtral-qlora-fsdp.yml b/examples/mistral/mixtral/mixtral-qlora-fsdp.yml
index 5151e1292..bd7c8620e 100644
--- a/examples/mistral/mixtral/mixtral-qlora-fsdp.yml
+++ b/examples/mistral/mixtral/mixtral-qlora-fsdp.yml
@@ -51,7 +51,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/mistral/mixtral/mixtral.yml b/examples/mistral/mixtral/mixtral.yml
index d1981a699..b493ed317 100644
--- a/examples/mistral/mixtral/mixtral.yml
+++ b/examples/mistral/mixtral/mixtral.yml
@@ -69,7 +69,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/mistral/mixtral/mixtral_22.yml b/examples/mistral/mixtral/mixtral_22.yml
index 0b606b7d7..3b87af04e 100644
--- a/examples/mistral/mixtral/mixtral_22.yml
+++ b/examples/mistral/mixtral/mixtral_22.yml
@@ -40,7 +40,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 save_total_limit: 1
 save_steps:
diff --git a/examples/mistral/mps/lora-mps.yml b/examples/mistral/mps/lora-mps.yml
index 07ce191dc..1b8021085 100644
--- a/examples/mistral/mps/lora-mps.yml
+++ b/examples/mistral/mps/lora-mps.yml
@@ -53,8 +53,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: false
-sdp_attention: true
+attn_implementation: sdpa

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/mistral/orpo/mistral-qlora-orpo.yml b/examples/mistral/orpo/mistral-qlora-orpo.yml
index 850d286f3..d1c0065e5 100644
--- a/examples/mistral/orpo/mistral-qlora-orpo.yml
+++ b/examples/mistral/orpo/mistral-qlora-orpo.yml
@@ -59,7 +59,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/mistral/qlora.yml b/examples/mistral/qlora.yml
index 2a7495e95..4fa82d11e 100644
--- a/examples/mistral/qlora.yml
+++ b/examples/mistral/qlora.yml
@@ -54,7 +54,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
diff --git a/examples/mistral4/fft-text.yml b/examples/mistral4/fft-text.yml
index 3acb5b2ed..2cdab6a42 100644
--- a/examples/mistral4/fft-text.yml
+++ b/examples/mistral4/fft-text.yml
@@ -40,7 +40,7 @@ bf16: true
 tf32: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/mistral4/fft-vision.yml b/examples/mistral4/fft-vision.yml
index baff37fe4..22262c55a 100644
--- a/examples/mistral4/fft-vision.yml
+++ b/examples/mistral4/fft-vision.yml
@@ -39,7 +39,7 @@ bf16: true
 tf32: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/mistral4/qlora-text.yml b/examples/mistral4/qlora-text.yml
index ae0cdcead..887ce6da0 100644
--- a/examples/mistral4/qlora-text.yml
+++ b/examples/mistral4/qlora-text.yml
@@ -50,7 +50,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/mistral4/qlora-vision.yml b/examples/mistral4/qlora-vision.yml
index a80d166dd..d01f8e85b 100644
--- a/examples/mistral4/qlora-vision.yml
+++ b/examples/mistral4/qlora-vision.yml
@@ -55,7 +55,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/nemotron-h/120b-a12b-qlora.yaml b/examples/nemotron-h/120b-a12b-qlora.yaml
index 03e6d3b5e..1174cec21 100644
--- a/examples/nemotron-h/120b-a12b-qlora.yaml
+++ b/examples/nemotron-h/120b-a12b-qlora.yaml
@@ -72,7 +72,7 @@ gradient_checkpointing_kwargs:

 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 2
diff --git a/examples/nemotron-h/nano-30b-a3b-qlora.yaml b/examples/nemotron-h/nano-30b-a3b-qlora.yaml
index 3994ab08e..206bd5df8 100644
--- a/examples/nemotron-h/nano-30b-a3b-qlora.yaml
+++ b/examples/nemotron-h/nano-30b-a3b-qlora.yaml
@@ -73,7 +73,7 @@ gradient_checkpointing_kwargs:

 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/nemotron/nemotron-mini-4b-qlora.yaml b/examples/nemotron/nemotron-mini-4b-qlora.yaml
index e796c149c..3f3772071 100644
--- a/examples/nemotron/nemotron-mini-4b-qlora.yaml
+++ b/examples/nemotron/nemotron-mini-4b-qlora.yaml
@@ -48,7 +48,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/olmo3/olmo3-7b-qlora.yaml b/examples/olmo3/olmo3-7b-qlora.yaml
index de2bf1d3d..b494699e0 100644
--- a/examples/olmo3/olmo3-7b-qlora.yaml
+++ b/examples/olmo3/olmo3-7b-qlora.yaml
@@ -55,7 +55,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/orpheus/finetune.yml b/examples/orpheus/finetune.yml
index f4bc8054e..86a488c84 100644
--- a/examples/orpheus/finetune.yml
+++ b/examples/orpheus/finetune.yml
@@ -41,7 +41,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 5
diff --git a/examples/phi/phi-ft.yml b/examples/phi/phi-ft.yml
index 717a45929..c16b15d8a 100644
--- a/examples/phi/phi-ft.yml
+++ b/examples/phi/phi-ft.yml
@@ -48,7 +48,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: True
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/phi/phi-qlora.yml b/examples/phi/phi-qlora.yml
index 0fe1abea5..ac4970355 100644
--- a/examples/phi/phi-qlora.yml
+++ b/examples/phi/phi-qlora.yml
@@ -51,7 +51,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: True
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/phi/phi2-ft.yml b/examples/phi/phi2-ft.yml
index e470c0d24..5702cc9b8 100644
--- a/examples/phi/phi2-ft.yml
+++ b/examples/phi/phi2-ft.yml
@@ -48,7 +48,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: True
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/phi/phi3-ft-fsdp.yml b/examples/phi/phi3-ft-fsdp.yml
index 1793737b5..49d3e44cb 100644
--- a/examples/phi/phi3-ft-fsdp.yml
+++ b/examples/phi/phi3-ft-fsdp.yml
@@ -49,7 +49,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/phi/phi3-ft.yml b/examples/phi/phi3-ft.yml
index 0b204963c..d36317f7b 100644
--- a/examples/phi/phi3-ft.yml
+++ b/examples/phi/phi3-ft.yml
@@ -44,7 +44,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: True
 early_stopping_patience: 3
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 eval_steps: 1000
 save_steps: 5000
diff --git a/examples/pixtral/lora-12b.yml b/examples/pixtral/lora-12b.yml
index 0e6489914..2e36688a1 100644
--- a/examples/pixtral/lora-12b.yml
+++ b/examples/pixtral/lora-12b.yml
@@ -45,7 +45,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/plano/plano-4b-qlora.yaml b/examples/plano/plano-4b-qlora.yaml
index 106e44205..30e0c36ff 100644
--- a/examples/plano/plano-4b-qlora.yaml
+++ b/examples/plano/plano-4b-qlora.yaml
@@ -56,7 +56,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/qat_nvfp4/Gemma3-12B_baseline.yml b/examples/qat_nvfp4/Gemma3-12B_baseline.yml
index be4e86635..e1c7e998a 100644
--- a/examples/qat_nvfp4/Gemma3-12B_baseline.yml
+++ b/examples/qat_nvfp4/Gemma3-12B_baseline.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/out_gemma/

 sequence_len: 8096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 wandb_entity:
 wandb_watch:
diff --git a/examples/qat_nvfp4/Gemma3-12B_qat.yml b/examples/qat_nvfp4/Gemma3-12B_qat.yml
index 7fa81163f..061fd6061 100644
--- a/examples/qat_nvfp4/Gemma3-12B_qat.yml
+++ b/examples/qat_nvfp4/Gemma3-12B_qat.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out_gemma/

 sequence_len: 8096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 qat:
   activation_dtype: nvfp4
diff --git a/examples/qat_nvfp4/Math-Gemma3-12B_baseline.yml b/examples/qat_nvfp4/Math-Gemma3-12B_baseline.yml
index 9f209515b..f11f604b4 100644
--- a/examples/qat_nvfp4/Math-Gemma3-12B_baseline.yml
+++ b/examples/qat_nvfp4/Math-Gemma3-12B_baseline.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/out_math_gemma/

 sequence_len: 4096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 wandb_entity:
 wandb_watch:
diff --git a/examples/qat_nvfp4/Math-Gemma3-12B_qat.yml b/examples/qat_nvfp4/Math-Gemma3-12B_qat.yml
index ef7e754be..f9c71321e 100644
--- a/examples/qat_nvfp4/Math-Gemma3-12B_qat.yml
+++ b/examples/qat_nvfp4/Math-Gemma3-12B_qat.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out_math_gemma/

 sequence_len: 4096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 qat:
   activation_dtype: nvfp4
diff --git a/examples/qat_nvfp4/Math-Gemma3-27B_baseline.yml b/examples/qat_nvfp4/Math-Gemma3-27B_baseline.yml
index 3a262d342..de8bc1807 100644
--- a/examples/qat_nvfp4/Math-Gemma3-27B_baseline.yml
+++ b/examples/qat_nvfp4/Math-Gemma3-27B_baseline.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/out_math_gemma27/

 sequence_len: 4096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 wandb_entity:
 wandb_watch:
diff --git a/examples/qat_nvfp4/Math-Gemma3-27B_qat.yml b/examples/qat_nvfp4/Math-Gemma3-27B_qat.yml
index 87016ae9c..c77060ee2 100644
--- a/examples/qat_nvfp4/Math-Gemma3-27B_qat.yml
+++ b/examples/qat_nvfp4/Math-Gemma3-27B_qat.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out_math_gemma27/

 sequence_len: 4096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 qat:
   activation_dtype: nvfp4
diff --git a/examples/qat_nvfp4/Math-Qwen2.5-72B_baseline.yml b/examples/qat_nvfp4/Math-Qwen2.5-72B_baseline.yml
index efec25c54..487fc8e4e 100644
--- a/examples/qat_nvfp4/Math-Qwen2.5-72B_baseline.yml
+++ b/examples/qat_nvfp4/Math-Qwen2.5-72B_baseline.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/out_math_72b/

 sequence_len: 4096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 wandb_entity:
 wandb_watch:
diff --git a/examples/qat_nvfp4/Math-Qwen2.5-72B_qat.yml b/examples/qat_nvfp4/Math-Qwen2.5-72B_qat.yml
index 427d7af52..12812d859 100644
--- a/examples/qat_nvfp4/Math-Qwen2.5-72B_qat.yml
+++ b/examples/qat_nvfp4/Math-Qwen2.5-72B_qat.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out_math_72b/

 sequence_len: 4096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 qat:
   activation_dtype: nvfp4
diff --git a/examples/qat_nvfp4/Qwen2.5-72B_baseline.yml b/examples/qat_nvfp4/Qwen2.5-72B_baseline.yml
index e1eaba61f..c52fd6b0a 100644
--- a/examples/qat_nvfp4/Qwen2.5-72B_baseline.yml
+++ b/examples/qat_nvfp4/Qwen2.5-72B_baseline.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/out_qwen72b/

 sequence_len: 8096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 wandb_entity:
 wandb_watch:
diff --git a/examples/qat_nvfp4/Qwen2.5-72B_qat.yml b/examples/qat_nvfp4/Qwen2.5-72B_qat.yml
index dad7e5422..cc67107c0 100644
--- a/examples/qat_nvfp4/Qwen2.5-72B_qat.yml
+++ b/examples/qat_nvfp4/Qwen2.5-72B_qat.yml
@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out_qwen72b/

 sequence_len: 8096
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 qat:
   activation_dtype: nvfp4
diff --git a/examples/qwen2-vl/lora-7b.yaml b/examples/qwen2-vl/lora-7b.yaml
index 285a35cbb..d9bc4826b 100644
--- a/examples/qwen2-vl/lora-7b.yaml
+++ b/examples/qwen2-vl/lora-7b.yaml
@@ -46,8 +46,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
-eager_attention:
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/qwen2/adamw-pretrain-fsdp2.yaml b/examples/qwen2/adamw-pretrain-fsdp2.yaml
index 43fb17aab..4129338db 100644
--- a/examples/qwen2/adamw-pretrain-fsdp2.yaml
+++ b/examples/qwen2/adamw-pretrain-fsdp2.yaml
@@ -49,7 +49,7 @@ tf32: false
 gradient_checkpointing: false
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_steps: 10
 evals_per_epoch: 0
diff --git a/examples/qwen2/dpo.yaml b/examples/qwen2/dpo.yaml
index 3e87766d6..6096053fd 100644
--- a/examples/qwen2/dpo.yaml
+++ b/examples/qwen2/dpo.yaml
@@ -48,7 +48,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen2/muon-pretrain-fsdp2.yaml b/examples/qwen2/muon-pretrain-fsdp2.yaml
index 35c0b71f4..40dcff7be 100644
--- a/examples/qwen2/muon-pretrain-fsdp2.yaml
+++ b/examples/qwen2/muon-pretrain-fsdp2.yaml
@@ -49,7 +49,7 @@ tf32: false
 gradient_checkpointing: false
 logging_steps: 1

-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_steps: 10
 evals_per_epoch: 0
diff --git a/examples/qwen2/prm.yaml b/examples/qwen2/prm.yaml
index a709a598d..1b3579fd4 100644
--- a/examples/qwen2/prm.yaml
+++ b/examples/qwen2/prm.yaml
@@ -47,7 +47,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch:
diff --git a/examples/qwen2/qlora-fsdp.yaml b/examples/qwen2/qlora-fsdp.yaml
index 337619b61..7bb035c3a 100644
--- a/examples/qwen2/qlora-fsdp.yaml
+++ b/examples/qwen2/qlora-fsdp.yaml
@@ -47,7 +47,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen2/reward-model.yaml b/examples/qwen2/reward-model.yaml
index 08b8b4552..b7039cba0 100644
--- a/examples/qwen2/reward-model.yaml
+++ b/examples/qwen2/reward-model.yaml
@@ -42,7 +42,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch:
diff --git a/examples/qwen2_5-vl/lora-7b.yaml b/examples/qwen2_5-vl/lora-7b.yaml
index 7d499d841..e78aac78b 100644
--- a/examples/qwen2_5-vl/lora-7b.yaml
+++ b/examples/qwen2_5-vl/lora-7b.yaml
@@ -46,8 +46,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
-eager_attention:
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml b/examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml
index f63b1d1ce..e8e7e08c7 100644
--- a/examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml
+++ b/examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml
@@ -68,7 +68,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml b/examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml
index f66bcd370..47842c561 100644
--- a/examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml
+++ b/examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml
@@ -65,7 +65,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen3.5/122b-a10b-moe-qlora.yaml b/examples/qwen3.5/122b-a10b-moe-qlora.yaml
index 4447cf73c..f2675c7d7 100644
--- a/examples/qwen3.5/122b-a10b-moe-qlora.yaml
+++ b/examples/qwen3.5/122b-a10b-moe-qlora.yaml
@@ -65,7 +65,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen3.5/27b-fft.yaml b/examples/qwen3.5/27b-fft.yaml
index 9f875ec26..ab206b772 100644
--- a/examples/qwen3.5/27b-fft.yaml
+++ b/examples/qwen3.5/27b-fft.yaml
@@ -50,7 +50,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen3.5/27b-qlora-fsdp.yaml b/examples/qwen3.5/27b-qlora-fsdp.yaml
index 79b87a32f..7a5423c77 100644
--- a/examples/qwen3.5/27b-qlora-fsdp.yaml
+++ b/examples/qwen3.5/27b-qlora-fsdp.yaml
@@ -61,7 +61,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen3.5/27b-qlora.yaml b/examples/qwen3.5/27b-qlora.yaml
index 18c0af95b..2401a4865 100644
--- a/examples/qwen3.5/27b-qlora.yaml
+++ b/examples/qwen3.5/27b-qlora.yaml
@@ -61,7 +61,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml b/examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml
index ad17366cb..2fb7f15f8 100644
--- a/examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml
+++ b/examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml
@@ -65,7 +65,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen3.5/35b-a3b-moe-qlora.yaml b/examples/qwen3.5/35b-a3b-moe-qlora.yaml
index 22468a178..a6afc1aa2 100644
--- a/examples/qwen3.5/35b-a3b-moe-qlora.yaml
+++ b/examples/qwen3.5/35b-a3b-moe-qlora.yaml
@@ -75,7 +75,7 @@ gradient_checkpointing: true
 activation_offloading: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen3.5/35b-a3b-moe-vision-lora.yaml b/examples/qwen3.5/35b-a3b-moe-vision-lora.yaml
index a7c85f785..7cfad3290 100644
--- a/examples/qwen3.5/35b-a3b-moe-vision-lora.yaml
+++ b/examples/qwen3.5/35b-a3b-moe-vision-lora.yaml
@@ -50,7 +50,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 weight_decay: 0.0
diff --git a/examples/qwen3.5/9b-fft-vision.yaml b/examples/qwen3.5/9b-fft-vision.yaml
index b6aeb859d..e8427b884 100644
--- a/examples/qwen3.5/9b-fft-vision.yaml
+++ b/examples/qwen3.5/9b-fft-vision.yaml
@@ -40,7 +40,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/qwen3.5/9b-lora-vision.yaml b/examples/qwen3.5/9b-lora-vision.yaml
index 1c3717724..9c2b9397e 100644
--- a/examples/qwen3.5/9b-lora-vision.yaml
+++ b/examples/qwen3.5/9b-lora-vision.yaml
@@ -58,7 +58,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/qwen3/32b-qlora.yaml b/examples/qwen3/32b-qlora.yaml
index f4a4f2816..dd5dd696e 100644
--- a/examples/qwen3/32b-qlora.yaml
+++ b/examples/qwen3/32b-qlora.yaml
@@ -60,7 +60,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/qwen3/8b-qat-fsdp2.yml b/examples/qwen3/8b-qat-fsdp2.yml
index cfbe5a4b7..3c9607a9a 100644
--- a/examples/qwen3/8b-qat-fsdp2.yml
+++ b/examples/qwen3/8b-qat-fsdp2.yml
@@ -23,7 +23,7 @@ output_dir: ./outputs/qat_out/

 sequence_len: 2048
 sample_packing: true

-flex_attention: true
+attn_implementation: flex_attention
 flex_attn_compile_kwargs:

diff --git a/examples/qwen3/qlora-fsdp.yaml b/examples/qwen3/qlora-fsdp.yaml
index e4d584dc7..a3852d457 100644
--- a/examples/qwen3/qlora-fsdp.yaml
+++ b/examples/qwen3/qlora-fsdp.yaml
@@ -46,7 +46,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
diff --git a/examples/seed-oss/seed-oss-36b-qlora.yaml b/examples/seed-oss/seed-oss-36b-qlora.yaml
index 00e7cf3eb..a8423f851 100644
--- a/examples/seed-oss/seed-oss-36b-qlora.yaml
+++ b/examples/seed-oss/seed-oss-36b-qlora.yaml
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/smolvlm2/smolvlm2-2B-lora.yaml b/examples/smolvlm2/smolvlm2-2B-lora.yaml
index 1aeff408d..4cd8d5b0d 100644
--- a/examples/smolvlm2/smolvlm2-2B-lora.yaml
+++ b/examples/smolvlm2/smolvlm2-2B-lora.yaml
@@ -45,8 +45,7 @@ tf32: true
 gradient_checkpointing: true
 logging_steps: 1

-flash_attention: true
-eager_attention:
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/streaming/pretrain.yaml b/examples/streaming/pretrain.yaml
index bc8edefd6..a0d8b17c0 100644
--- a/examples/streaming/pretrain.yaml
+++ b/examples/streaming/pretrain.yaml
@@ -20,7 +20,7 @@ output_dir: ./outputs/smollm2-135m-pretrain-streaming
 sequence_len: 1024
 sample_packing: true
 pretrain_multipack_attn: true # Prevent cross-attention between packed sequences
-flash_attention: true
+attn_implementation: flash_attention_2

 # Batch size settings
 gradient_accumulation_steps: 8
diff --git a/examples/streaming/sft.yaml b/examples/streaming/sft.yaml
index 47b9f493f..4a43c34eb 100644
--- a/examples/streaming/sft.yaml
+++ b/examples/streaming/sft.yaml
@@ -18,7 +18,7 @@ output_dir: ./outputs/smollm2-135m-sft-streaming
 # Sequence and packing settings
 sequence_len: 1024
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 # Batch size settings
 gradient_accumulation_steps: 4
diff --git a/examples/swanlab/dpo-swanlab-completions.yml b/examples/swanlab/dpo-swanlab-completions.yml
index 5615ca638..fb21dbbba 100644
--- a/examples/swanlab/dpo-swanlab-completions.yml
+++ b/examples/swanlab/dpo-swanlab-completions.yml
@@ -78,7 +78,7 @@ tf32: false

 # Performance
 gradient_checkpointing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 # Checkpointing and Logging
 logging_steps: 1
diff --git a/examples/swanlab/dpo-swanlab-full-featured.yml b/examples/swanlab/dpo-swanlab-full-featured.yml
index c25178c63..ac52e6a85 100644
--- a/examples/swanlab/dpo-swanlab-full-featured.yml
+++ b/examples/swanlab/dpo-swanlab-full-featured.yml
@@ -102,7 +102,7 @@ bf16: auto
 tf32: false
 gradient_checkpointing: true

-flash_attention: true
+attn_implementation: flash_attention_2

 # ============================================================================
 # Checkpointing and Logging
diff --git a/examples/swanlab/lora-swanlab-profiling.yml b/examples/swanlab/lora-swanlab-profiling.yml
index 1255105a6..3dff6e315 100644
--- a/examples/swanlab/lora-swanlab-profiling.yml
+++ b/examples/swanlab/lora-swanlab-profiling.yml
@@ -59,7 +59,7 @@ tf32: false

 # Performance
 gradient_checkpointing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 # Checkpointing and Logging
 logging_steps: 1
diff --git a/examples/trinity/trinity-nano-preview-qlora.yaml b/examples/trinity/trinity-nano-preview-qlora.yaml
index d8bf9f073..52c0c0c60 100644
--- a/examples/trinity/trinity-nano-preview-qlora.yaml
+++ b/examples/trinity/trinity-nano-preview-qlora.yaml
@@ -58,7 +58,7 @@ gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
 # flash_attention: true # Not supported
-sdp_attention: true
+attn_implementation: sdpa

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/voxtral/voxtral-mini-audio-qlora.yml b/examples/voxtral/voxtral-mini-audio-qlora.yml
index 59150c4ca..cfa351ccd 100644
--- a/examples/voxtral/voxtral-mini-audio-qlora.yml
+++ b/examples/voxtral/voxtral-mini-audio-qlora.yml
@@ -70,7 +70,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
diff --git a/examples/voxtral/voxtral-mini-qlora.yml b/examples/voxtral/voxtral-mini-qlora.yml
index bdbc5f867..61e8933d0 100644
--- a/examples/voxtral/voxtral-mini-qlora.yml
+++ b/examples/voxtral/voxtral-mini-qlora.yml
@@ -64,7 +64,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: