migrate example configs to canonical attn_implementation

2026-04-23 22:15:07 +00:00
parent 2d64d009d8
commit 39226623d2
222 changed files with 209 additions and 243 deletions
--- a/examples/ebft/llama-1b-ebft-opencode-novllm.yaml
+++ b/examples/ebft/llama-1b-ebft-opencode-novllm.yaml
@@ -46,7 +46,7 @@ lora_dropout: 0.05
 lora_target_linear: true

 bf16: auto
-flash_attention: true
+attn_implementation: flash_attention_2
 gradient_checkpointing: true

 special_tokens:
--- a/examples/ebft/llama-1b-ebft-opencode.yaml
+++ b/examples/ebft/llama-1b-ebft-opencode.yaml
@@ -66,7 +66,7 @@ lora_target_linear: true

 # --- Hardware ---
 bf16: auto
-flash_attention: true
+attn_implementation: flash_attention_2
 gradient_checkpointing: true

 special_tokens:
--- a/examples/ebft/llama-1b-ebft-strided-structured.yaml
+++ b/examples/ebft/llama-1b-ebft-strided-structured.yaml
@@ -47,8 +47,7 @@ lora_dropout: 0.05
 lora_target_linear: true

 bf16: auto
-flash_attention: false  # strided EBFT overrides to flex_attention (or eager fallback) at runtime
-flex_attention: true    # fused flex_attention kernel compiles itself; don't set torch_compile: true
+attn_implementation: flex_attention
                        # (full-model compile conflicts with gradient checkpointing + flex_attention)
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
--- a/examples/ebft/llama-1b-ebft-strided.yaml
+++ b/examples/ebft/llama-1b-ebft-strided.yaml
@@ -46,7 +46,6 @@ lora_dropout: 0.05
 lora_target_linear: true

 bf16: auto
-flash_attention: false  # strided EBFT overrides to flex_attention (or eager fallback) at runtime
 gradient_checkpointing: true

 special_tokens:
--- a/examples/ebft/llama-3b-ebft-strided-fft.yaml
+++ b/examples/ebft/llama-3b-ebft-strided-fft.yaml
@@ -48,7 +48,6 @@ lora_target_linear: true

 bf16: auto
 torch_dtype: bfloat16
-flash_attention: false
 gradient_checkpointing: true
 torch_compile: true
 gradient_checkpointing_kwargs:
--- a/examples/ebft/llama-8b-ebft-strided-fft.yaml
+++ b/examples/ebft/llama-8b-ebft-strided-fft.yaml
@@ -41,7 +41,6 @@ warmup_steps: 10
 weight_decay: 0.01

 bf16: auto
-flash_attention: false  # strided EBFT uses flex_attention at runtime
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
--- a/examples/ebft/qwen35-4b-ebft-structured-async.yaml
+++ b/examples/ebft/qwen35-4b-ebft-structured-async.yaml
@@ -72,7 +72,7 @@ lora_dropout: 0.0
 lora_target_modules: ".*\\.layers\\.(3|7|11|15|19|23|27|31)\\.self_attn\\.(q|k|v|o)_proj|.*\\.mlp\\.(gate|up|down)_proj"

 bf16: auto
-flash_attention: true
+attn_implementation: flash_attention_2
 gradient_checkpointing: true

 special_tokens:
--- a/examples/ebft/qwen35-4b-ebft-structured.yaml
+++ b/examples/ebft/qwen35-4b-ebft-structured.yaml
@@ -63,7 +63,7 @@ lora_dropout: 0.0
 lora_target_modules: ".*\\.layers\\.(3|7|11|15|19|23|27|31)\\.self_attn\\.(q|k|v|o)_proj|.*\\.mlp\\.(gate|up|down)_proj"

 bf16: auto
-flash_attention: true
+attn_implementation: flash_attention_2
 gradient_checkpointing: true

 special_tokens:
--- a/examples/ebft/qwen35-9b-ebft-structured.yaml
+++ b/examples/ebft/qwen35-9b-ebft-structured.yaml
@@ -68,7 +68,7 @@ lora_dropout: 0.0
 lora_target_modules: ".*\\.layers\\.(3|7|11|15|19|23|27|31)\\.self_attn\\.(q|k|v|o)_proj|.*\\.mlp\\.(gate|up|down)_proj"

 bf16: auto
-flash_attention: true
+attn_implementation: flash_attention_2
 gradient_checkpointing: true

 special_tokens: