migrate example configs to canonical attn_implementation

2026-04-23 22:15:07 +00:00
parent 2d64d009d8
commit 39226623d2
222 changed files with 209 additions and 243 deletions
--- a/examples/mistral/bigstral/bigstral-ds-zero3.yaml
+++ b/examples/mistral/bigstral/bigstral-ds-zero3.yaml
@@ -42,7 +42,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 save_total_limit: 1
 save_steps:
--- a/examples/mistral/config.yml
+++ b/examples/mistral/config.yml
@@ -36,7 +36,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/mistral/dpo/mistral-dpo-qlora.yml
+++ b/examples/mistral/dpo/mistral-dpo-qlora.yml
@@ -71,7 +71,6 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: false

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/mistral/lora.yml
+++ b/examples/mistral/lora.yml
@@ -54,7 +54,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/mistral-qlora-fsdp.yml
+++ b/examples/mistral/mistral-qlora-fsdp.yml
@@ -51,7 +51,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/mixtral/mixtral-8x22b-qlora-fsdp.yml
+++ b/examples/mistral/mixtral/mixtral-8x22b-qlora-fsdp.yml
@@ -49,7 +49,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/mixtral/mixtral-qlora-fsdp.yml
+++ b/examples/mistral/mixtral/mixtral-qlora-fsdp.yml
@@ -51,7 +51,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/mixtral/mixtral.yml
+++ b/examples/mistral/mixtral/mixtral.yml
@@ -69,7 +69,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/mixtral/mixtral_22.yml
+++ b/examples/mistral/mixtral/mixtral_22.yml
@@ -40,7 +40,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 save_total_limit: 1
 save_steps:
--- a/examples/mistral/mps/lora-mps.yml
+++ b/examples/mistral/mps/lora-mps.yml
@@ -53,8 +53,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: false
-sdp_attention: true
+attn_implementation: sdpa

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/orpo/mistral-qlora-orpo.yml
+++ b/examples/mistral/orpo/mistral-qlora-orpo.yml
@@ -59,7 +59,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/qlora.yml
+++ b/examples/mistral/qlora.yml
@@ -54,7 +54,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3