bump peft to 3.5.1

2025-05-06 11:38:14 -04:00
122 changed files with 311 additions and 1064 deletions
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -32,8 +32,6 @@ tokenizer_legacy:
 resize_token_embeddings_to_32x:
 # Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.
 shrink_embeddings:
-# Optional[bool] Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs
-embeddings_skip_upcast:
 # Whether to load the model with randomly initialized weights. Useful for
 # pre-training a model from scratch or debugging purposes.
 random_init_weights:
@@ -75,12 +73,11 @@ load_in_8bit: true
 load_in_4bit:

 # Use CUDA bf16
-bf16: true # bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection. require >=ampere
+bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
 # Use CUDA fp16
 fp16: true
 # Use CUDA tf32
 tf32: true # require >=ampere
-# Note: if bf16 is set to 'auto', and fp16 is set to true, we will prefer the explict fp16 setting

 # No AMP (automatic mixed precision)
 bfloat16: true # require >=ampere
@@ -187,8 +184,8 @@ datasets:
    # adding a system turn with empty content.
    drop_system_message:

-    # Optional[bool]. (for Qwen3 template only) Whether to split the assistant content based on a reasoning trace inside delimited tags
-    # See example at `docs/dataset-formats/conversation.qmd`
+    # Optional[bool]. Whether to split each assistant turn into a reasoning trace and the final content, based on delimiting tags (e.g. `<think>...</think>`).
+    # Defaults to false.
    split_thinking:

    # IMPORTANT: The following fields determine which parts of the conversation to train on.
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -196,34 +196,6 @@ datasets:
 It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
 :::

-8. (For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
-
-```yaml
-datasets:
-  - path: ...
-    type: chat_template
-    chat_template: qwen3
-    split_thinking: true
-```
-
-For example, a content can look like:
-
-```json
-{
-  "content": "<think>Some thinking outputs</think>Output after thinking."
-}
-```
-
-After split, it will look like:
-
-```json
-{
-  "reasoning_content": "Some thinking outputs",
-  "content": "Output after thinking..."
-}
-```
-
-
 ## sharegpt

 ::: {.callout-important}
--- a/examples/cerebras/btlm-ft.yml
+++ b/examples/cerebras/btlm-ft.yml
@@ -59,7 +59,9 @@ gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1

-attention: flash
+flash_attention: true
+sdp_attention:
+flash_optimum:

 gptq_groupsize:
 gptq_model_v1:
--- a/examples/cerebras/qlora.yml
+++ b/examples/cerebras/qlora.yml
@@ -39,7 +39,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
--- a/examples/code-llama/13b/lora.yml
+++ b/examples/code-llama/13b/lora.yml
@@ -45,8 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/13b/qlora.yml
+++ b/examples/code-llama/13b/qlora.yml
@@ -46,8 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/34b/lora.yml
+++ b/examples/code-llama/34b/lora.yml
@@ -45,8 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/34b/qlora.yml
+++ b/examples/code-llama/34b/qlora.yml
@@ -46,8 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/7b/lora.yml
+++ b/examples/code-llama/7b/lora.yml
@@ -45,8 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/7b/qlora.yml
+++ b/examples/code-llama/7b/qlora.yml
@@ -46,8 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/cohere/command-r-7b-qlora.yml
+++ b/examples/cohere/command-r-7b-qlora.yml
@@ -49,8 +49,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -112,7 +112,9 @@
    "early_stopping_patience:\n",
    "resume_from_checkpoint:\n",
    "logging_steps: 1\n",
-    "attention: sdpa\n",
+    "xformers_attention:\n",
+    "flash_attention: false\n",
+    "sdp_attention: true\n",
    "\n",
    "warmup_steps: 1\n",
    "max_steps: 25\n",
--- a/examples/dbrx/16bit-lora.yaml
+++ b/examples/dbrx/16bit-lora.yaml
@@ -52,8 +52,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
--- a/examples/dbrx/8bit-lora.yaml
+++ b/examples/dbrx/8bit-lora.yaml
@@ -55,8 +55,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
--- a/examples/dbrx/fft-ds-zero3.yaml
+++ b/examples/dbrx/fft-ds-zero3.yaml
@@ -39,8 +39,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
--- a/examples/deepseek-v2/fft-fsdp-16b.yaml
+++ b/examples/deepseek-v2/fft-fsdp-16b.yaml
@@ -35,8 +35,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 2
--- a/examples/deepseek-v2/qlora-fsdp-2_5.yaml
+++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
@@ -59,8 +59,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 2
--- a/examples/falcon/config-7b-lora.yml
+++ b/examples/falcon/config-7b-lora.yml
@@ -43,7 +43,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 40
--- a/examples/falcon/config-7b-qlora.yml
+++ b/examples/falcon/config-7b-qlora.yml
@@ -73,7 +73,8 @@ early_stopping_patience: 3
 resume_from_checkpoint:
 auto_resume_from_checkpoints: true
 logging_steps: 1
-attention: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
--- a/examples/falcon/config-7b.yml
+++ b/examples/falcon/config-7b.yml
@@ -40,7 +40,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 40
--- a/examples/gemma/qlora.yml
+++ b/examples/gemma/qlora.yml
@@ -47,8 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/gemma2/qlora.yml
+++ b/examples/gemma2/qlora.yml
@@ -53,8 +53,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/gemma2/reward-model.yaml
+++ b/examples/gemma2/reward-model.yaml
@@ -43,8 +43,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/gemma3/gemma-3-1b-qlora.yml
+++ b/examples/gemma3/gemma-3-1b-qlora.yml
@@ -57,8 +57,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/gemma3/gemma-3-4b-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-qlora.yml
@@ -51,7 +51,8 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 logging_steps: 1
-attention: flash
+flash_attention: true
+eager_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/gemma3/gemma-3-4b-vision-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-vision-qlora.yml
@@ -53,7 +53,8 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 logging_steps: 1
-attention: flash
+flash_attention: true
+eager_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/gptj/qlora.yml
+++ b/examples/gptj/qlora.yml
@@ -36,7 +36,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
--- a/examples/jamba/qlora.yaml
+++ b/examples/jamba/qlora.yaml
@@ -47,8 +47,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
--- a/examples/jamba/qlora_deepspeed.yaml
+++ b/examples/jamba/qlora_deepspeed.yaml
@@ -46,8 +46,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
--- a/examples/jamba/qlora_fsdp_large.yaml
+++ b/examples/jamba/qlora_fsdp_large.yaml
@@ -45,8 +45,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: true
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 1
--- a/examples/jeopardy-bot/config.yml
+++ b/examples/jeopardy-bot/config.yml
@@ -37,7 +37,8 @@ bf16: auto
 tf32: true
 resume_from_checkpoint:
 logging_steps: 5
-attention: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -42,8 +42,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
--- a/examples/llama-2/gptq-lora.yml
+++ b/examples/llama-2/gptq-lora.yml
@@ -53,7 +53,9 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
+flash_attention:
+sdp_attention:
+flash_optimum:
 warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
--- a/examples/llama-2/lisa.yml
+++ b/examples/llama-2/lisa.yml
@@ -46,8 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
--- a/examples/llama-2/loftq.yml
+++ b/examples/llama-2/loftq.yml
@@ -45,8 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -45,8 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-2/qlora-fsdp.yml
+++ b/examples/llama-2/qlora-fsdp.yml
@@ -48,8 +48,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -46,8 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-2/relora.yml
+++ b/examples/llama-2/relora.yml
@@ -48,8 +48,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3-vision/lora-11b.yaml
+++ b/examples/llama-3-vision/lora-11b.yaml
@@ -50,7 +50,8 @@ tf32: true

 gradient_checkpointing: true
 logging_steps: 1
-attention: flash
+flash_attention: true
+eager_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/llama-3/fft-8b-liger-fsdp.yaml
+++ b/examples/llama-3/fft-8b-liger-fsdp.yaml
@@ -49,8 +49,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 2
--- a/examples/llama-3/fft-8b.yaml
+++ b/examples/llama-3/fft-8b.yaml
@@ -34,8 +34,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 2
--- a/examples/llama-3/instruct-dpo-lora-8b.yml
+++ b/examples/llama-3/instruct-dpo-lora-8b.yml
@@ -61,8 +61,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/instruct-lora-8b.yml
+++ b/examples/llama-3/instruct-lora-8b.yml
@@ -56,8 +56,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b-deduplicate-dpo.yml
+++ b/examples/llama-3/lora-1b-deduplicate-dpo.yml
@@ -77,8 +77,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b-deduplicate-sft.yml
+++ b/examples/llama-3/lora-1b-deduplicate-sft.yml
@@ -53,8 +53,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b-kernels.yml
+++ b/examples/llama-3/lora-1b-kernels.yml
@@ -54,8 +54,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/lora-1b-ray.yml
+++ b/examples/llama-3/lora-1b-ray.yml
@@ -48,8 +48,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/lora-1b-sample-packing-sequentially.yml
+++ b/examples/llama-3/lora-1b-sample-packing-sequentially.yml
@@ -55,8 +55,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b.yml
+++ b/examples/llama-3/lora-1b.yml
@@ -48,8 +48,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/lora-8b.yml
+++ b/examples/llama-3/lora-8b.yml
@@ -49,8 +49,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/qlora-1b-kto.yaml
+++ b/examples/llama-3/qlora-1b-kto.yaml
@@ -53,8 +53,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 20
 evals_per_epoch: 4
--- a/examples/llama-3/qlora-1b.yml
+++ b/examples/llama-3/qlora-1b.yml
@@ -51,8 +51,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/qlora-fsdp-405b.yaml
+++ b/examples/llama-3/qlora-fsdp-405b.yaml
@@ -39,8 +39,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: true
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/qlora-fsdp-70b.yaml
+++ b/examples/llama-3/qlora-fsdp-70b.yaml
@@ -48,8 +48,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/qlora.yml
+++ b/examples/llama-3/qlora.yml
@@ -46,8 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llava/lora-7b.yaml
+++ b/examples/llava/lora-7b.yaml
@@ -46,7 +46,8 @@ tf32: true

 gradient_checkpointing: true
 logging_steps: 1
-attention: flash
+flash_attention: true
+eager_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/mamba/config.yml
+++ b/examples/mamba/config.yml
@@ -39,7 +39,7 @@ tf32: true
 gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: eager
+flash_attention:

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/mistral/bigstral-ds-zero3.yaml
+++ b/examples/mistral/bigstral-ds-zero3.yaml
@@ -42,8 +42,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 save_total_limit: 1
 save_steps:
--- a/examples/mistral/config.yml
+++ b/examples/mistral/config.yml
@@ -36,8 +36,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/mistral/lora-mps.yml
+++ b/examples/mistral/lora-mps.yml
@@ -53,7 +53,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: sdpa
+flash_attention: false
+sdp_attention: true

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/lora.yml
+++ b/examples/mistral/lora.yml
@@ -54,8 +54,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/mistral-dpo-qlora.yml
+++ b/examples/mistral/mistral-dpo-qlora.yml
@@ -71,7 +71,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: eager
+flash_attention: false

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/mistral/mistral-qlora-fsdp.yml
+++ b/examples/mistral/mistral-qlora-fsdp.yml
@@ -51,8 +51,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/mistral-qlora-orpo.yml
+++ b/examples/mistral/mistral-qlora-orpo.yml
@@ -59,8 +59,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/mistral-small-3.1-24B-lora.yml
+++ b/examples/mistral/mistral-small-3.1-24B-lora.yml
@@ -48,7 +48,9 @@ tf32: true

 gradient_checkpointing: true
 logging_steps: 1
-attention: eager  # PixtralVisionModel does not support Flash Attention 2.0 yet.
+flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet.
+eager_attention:
+
 warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
--- a/examples/mistral/mixtral-8x22b-qlora-fsdp.yml
+++ b/examples/mistral/mixtral-8x22b-qlora-fsdp.yml
@@ -49,8 +49,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/mixtral-qlora-fsdp.yml
+++ b/examples/mistral/mixtral-qlora-fsdp.yml
@@ -51,8 +51,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/mixtral.yml
+++ b/examples/mistral/mixtral.yml
@@ -69,8 +69,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mistral/mixtral_22.yml
+++ b/examples/mistral/mixtral_22.yml
@@ -40,8 +40,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 save_total_limit: 1
 save_steps:
--- a/examples/mistral/qlora.yml
+++ b/examples/mistral/qlora.yml
@@ -54,8 +54,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/mpt-7b/config.yml
+++ b/examples/mpt-7b/config.yml
@@ -39,7 +39,7 @@ bf16: auto
 tf32: true
 resume_from_checkpoint:
 logging_steps: 5
-attention: eager
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
--- a/examples/openllama-3b/config.yml
+++ b/examples/openllama-3b/config.yml
@@ -39,8 +39,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
--- a/examples/openllama-3b/lora.yml
+++ b/examples/openllama-3b/lora.yml
@@ -47,8 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
--- a/examples/openllama-3b/qlora.yml
+++ b/examples/openllama-3b/qlora.yml
@@ -40,8 +40,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
--- a/examples/phi/phi-ft.yml
+++ b/examples/phi/phi-ft.yml
@@ -48,8 +48,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: True
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 4
--- a/examples/phi/phi-qlora.yml
+++ b/examples/phi/phi-qlora.yml
@@ -51,8 +51,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: True
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 4
--- a/examples/phi/phi2-ft.yml
+++ b/examples/phi/phi2-ft.yml
@@ -48,8 +48,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: True
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 4
--- a/examples/phi/phi3-ft-fsdp.yml
+++ b/examples/phi/phi3-ft-fsdp.yml
@@ -49,8 +49,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 4
--- a/examples/phi/phi3-ft.yml
+++ b/examples/phi/phi3-ft.yml
@@ -44,8 +44,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: True
 early_stopping_patience: 3
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 eval_steps: 1000
 save_steps: 5000
--- a/examples/pixtral/lora-12b.yml
+++ b/examples/pixtral/lora-12b.yml
@@ -46,7 +46,8 @@ tf32: true

 gradient_checkpointing: true
 logging_steps: 1
-attention: eager  # PixtralVisionModel does not support Flash Attention 2.0 yet
+flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet
+eager_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/qwen/lora.yml
+++ b/examples/qwen/lora.yml
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: eager
+flash_attention:

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/qwen/qlora.yml
+++ b/examples/qwen/qlora.yml
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
+flash_attention:

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/qwen/qwen2-moe-lora.yaml
+++ b/examples/qwen/qwen2-moe-lora.yaml
@@ -43,8 +43,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/qwen/qwen2-moe-qlora.yaml
+++ b/examples/qwen/qwen2-moe-qlora.yaml
@@ -46,8 +46,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/qwen2-vl/lora-7b.yaml
+++ b/examples/qwen2-vl/lora-7b.yaml
@@ -46,7 +46,8 @@ tf32: true

 gradient_checkpointing: true
 logging_steps: 1
-attention: flash
+flash_attention: true
+eager_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/qwen2/dpo.yaml
+++ b/examples/qwen2/dpo.yaml
@@ -49,8 +49,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/qwen2/prm.yaml
+++ b/examples/qwen2/prm.yaml
@@ -47,8 +47,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/qwen2/qlora-fsdp.yaml
+++ b/examples/qwen2/qlora-fsdp.yaml
@@ -47,8 +47,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/qwen2/reward-model.yaml
+++ b/examples/qwen2/reward-model.yaml
@@ -43,8 +43,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/redpajama/config-3b.yml
+++ b/examples/redpajama/config-3b.yml
@@ -40,7 +40,7 @@ bf16: auto
 tf32: true
 resume_from_checkpoint:
 logging_steps: 5
-attention: flash
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
--- a/examples/replit-3b/config-lora.yml
+++ b/examples/replit-3b/config-lora.yml
@@ -38,7 +38,7 @@ tf32: true
 gradient_checkpointing:
 resume_from_checkpoint:
 logging_steps: 1
-attention: eager
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
--- a/examples/stablelm-2/1.6b/fft.yml
+++ b/examples/stablelm-2/1.6b/fft.yml
@@ -44,8 +44,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
--- a/examples/stablelm-2/1.6b/lora.yml
+++ b/examples/stablelm-2/1.6b/lora.yml
@@ -47,8 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true

--- a/examples/starcoder2/qlora.yml
+++ b/examples/starcoder2/qlora.yml
@@ -46,8 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 20
 evals_per_epoch: 4
--- a/examples/tiny-llama/lora-mps.yml
+++ b/examples/tiny-llama/lora-mps.yml
@@ -47,7 +47,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: eager
+flash_attention: false

 warmup_steps: 10
 evals_per_epoch: 0
--- a/examples/tiny-llama/lora.yml
+++ b/examples/tiny-llama/lora.yml
@@ -45,8 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/tiny-llama/pretrain.yml
+++ b/examples/tiny-llama/pretrain.yml
@@ -36,8 +36,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
--- a/examples/tiny-llama/qlora.yml
+++ b/examples/tiny-llama/qlora.yml
@@ -47,8 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/xgen-7b/xgen-7b-8k-qlora.yml
+++ b/examples/xgen-7b/xgen-7b-8k-qlora.yml
@@ -71,7 +71,8 @@ early_stopping_patience: 3
 resume_from_checkpoint:
 auto_resume_from_checkpoints: true
 logging_steps: 1
-attention: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
--- a/Show More
+++ b/Show More