Compare commits

...

13 Commits

Author SHA1 Message Date
eadd15c960 note MAX_JOBS for flash-attn compile speed 2026-05-13 04:45:21 +00:00
396ce4a9dd add miaai environment setup guide 2026-05-13 04:16:03 +00:00
Wing Lian
b7ec06b8a1 Add optional Axolotl MoRA/ReMoRA integration (#3647) [skip ci]
* Add optional Axolotl MoRA/ReMoRA integration

Co-authored-by: Axolotl Swarm <no-reply@axolotl.ai>

* Isolate MoRA adapter behavior in plugin

Co-authored-by: Axolotl Swarm <no-reply@axolotl.ai>

* Constrain MoRA variants to supported enum values

* Keep MoRA validation out of core config

---------

Co-authored-by: Swarm <swarm@localhost>
Co-authored-by: Axolotl Swarm <no-reply@axolotl.ai>
2026-05-12 07:19:55 -04:00
Wing Lian
e2f01de0e8 Fix Axolotl ReLoRA optimizer reset scope (#3646)
* Fix Axolotl ReLoRA optimizer reset scope
* fix: make relora reset method honor relora_prune_ratio

When relora_prune_method='reset' and relora_prune_ratio is explicitly
set, the ratio was silently ignored and replaced with the hardcoded
_FULL_RESET_RATIO (0.999). Fix by moving the default-ratio logic to
ReLoRACallback.on_step_begin: None maps to _FULL_RESET_RATIO for reset
and 0.9 for other methods. reset_optimizer now uses the same random
pruning path for both 'random' and 'reset'.

Also consolidate three-layer default mismatch: schema default for
relora_prune_method is now 'magnitude' (single canonical source);
dataclass defaults for both fields changed to None to eliminate the
conflicting fallback layer.

Tests updated: removed the test case that verified the old broken
behavior (reset ignoring ratio), added two cases proving reset honors
the passed ratio. E2E reset fixture now uses ratio=0.5 to make it
unambiguous that the ratio is honored.
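
A minimal sketch of the default-ratio mapping described above; `_FULL_RESET_RATIO` and the field names come from this message, while the helper itself is hypothetical:

```python
# Sketch of the on_step_begin default-ratio logic; the real code lives in
# ReLoRACallback. An explicit ratio always wins (the old behavior silently
# replaced it with _FULL_RESET_RATIO for 'reset').
_FULL_RESET_RATIO = 0.999

def resolve_prune_ratio(prune_method: str, prune_ratio: float | None) -> float:
    if prune_ratio is not None:
        return prune_ratio
    return _FULL_RESET_RATIO if prune_method == "reset" else 0.9
```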

* Fix ReLoRA uint8 pruning regression

---------

Signed-off-by: Wing Lian <wing@axolotl.ai>
Co-authored-by: Axolotl Swarm <no-reply@axolotl.ai>
2026-05-09 17:52:35 -04:00
thad0ctor
5352d41d32 feat: systemic multimodal assistant-only loss masking + `cfg.role_boundaries` (#3625)
* feat: systemic multimodal assistant-only loss masking + cfg.role_boundaries

Fixes silent ignoring of `cfg.train_on_inputs` / `cfg.roles_to_train` /
`cfg.train_on_eos` in the multimodal training path. Before this branch,
only Gemma 3n honored these knobs; every other VLM trained on the full
sequence regardless of config. Also adds `cfg.role_boundaries` YAML
override so users can declare per-role markers without subclassing.

What changed
------------
- `ProcessingStrategy` gains a declarative boundary scanner. Each
  strategy declares per-role start/end markers via
  `_build_role_boundaries`; the shared scanner honors
  `train_on_inputs` / `roles_to_train` / `train_on_eos` (incl. "last").
- New per-template strategies: Gemma 4, Llama 3.2 Vision, Llama 4,
  Pixtral, Mistral V7 Tekken.
- Refactored: Gemma 3 (previously no role masking), Gemma 3n
  (previously ad-hoc scanner, now shared).
- Strategies whose boundary tokens couldn't be verified offline
  (Voxtral, SmolVLM2, Mistral3, InternVL, GLM4V, llava/lfm2vl
  fallback) retain legacy behavior and emit a one-shot warning. Users
  can enable masking on them via `cfg.role_boundaries`.
- Pixtral / Mistral V7 Tekken correctly handle the shared `[/INST]`
  token between user-end and assistant-start via `include_end=False`
  + scanner rewind.

See `docs/multimodal_assistant_mask.md` for the full audit table,
root-cause analysis, and design rationale.
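
A sketch of the declarative shape the scanner consumes, assuming a plain dataclass; the field names mirror the YAML keys documented in `docs/multimodal_assistant_mask.md`, but the class itself is illustrative:

```python
from dataclasses import dataclass

@dataclass
class RoleBoundary:
    """Illustrative per-role boundary declaration (not axolotl's real type)."""
    role: str                  # "user", "assistant", ...
    start: str                 # literal start marker, e.g. "<|turn>model"
    end: str | None = None     # literal end marker, "eos_token", or None
    include_start: bool = False
    include_end: bool = True   # False leaves a shared token (e.g. "[/INST]")
                               # available for re-matching as the next start

# Pixtral-style declaration: user and assistant share "[/INST]".
PIXTRAL_BOUNDARIES = [
    RoleBoundary("user", start="[INST]", end="[/INST]", include_end=False),
    RoleBoundary("assistant", start="[/INST]", end="eos_token"),
]
```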

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* docs+types: address CodeRabbit nitpicks on PR #7

- builders/causal.py: add inline NOTE that multi-dataset configs reuse
  the first dataset's masking knobs (roles_to_train / train_on_eos) for
  all datasets — heterogeneous per-dataset overrides are not supported
  in the MM path today.
- processing_strategies.py: annotate inner scanner helpers
  _match_prefix and _find_end with explicit types (Tensor, int,
  list[int] → bool / tuple[int, bool]) for readability.
- docs/multimodal_assistant_mask.md: renumber the "Commits on this
  branch" list to 1-7 consecutive (previously skipped 3).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* fix(mm-mask): address two CodeRabbit findings on PR #7

1. Schema rejected `train_on_eos: "none"` despite the scanner honoring it.
   `_VALID_TRAIN_ON_EOS` accepts "none" and the design doc lists it, but
   `SFTDataset.train_on_eos` was `Literal["all", "turn", "last"]`, so YAML
   users hit a pydantic ValidationError at config load. Added "none" to
   the Literal and updated the description.

2. `cfg.role_boundaries: []` had split-personality semantics: the strategy
   ctor treated it as "replace built-ins with empty" while the collator
   plumbing treated it as "unset", and both the design doc and the
   MultiModalConfig schema help text promised wholesale replacement for
   any set value. Aligned on opt-in semantics across all four surfaces —
   a non-empty list replaces built-ins wholesale; unset or `[]` falls back
   to built-ins. Rationale: honoring `[]` literally yields all-masked
   labels and zero gradient, which is almost always a typo or leftover
   rather than a deliberate user action. Users who want to disable role
   masking should unset the field or use `train_on_inputs: true`.

   Also sharpened the fallback one-shot warning for strategies without
   built-in boundaries: names the consequence ("only pad and media tokens
   are masked, every other token contributes to loss") and points users
   at `cfg.role_boundaries` + docs/multimodal_assistant_mask.md instead
   of "see axolotl/processing_strategies.py for how to declare
   boundaries."

Files:
- src/axolotl/utils/schemas/datasets.py: Literal adds "none"
- src/axolotl/processing_strategies.py: ctor truthiness check on
  role_boundaries_override; sharpened fallback warning
- src/axolotl/utils/schemas/multimodal.py: role_boundaries description
  now calls out opt-in + empty-list fallback semantics
- docs/multimodal_assistant_mask.md: same clarification in the Semantics
  block; updated the fallback-path detection paragraph to quote the new
  warning text
- tests/test_processing_strategies.py: +2 regressions
  (test_sft_dataset_schema_accepts_all_supported_train_on_eos_values,
  test_empty_role_boundaries_override_falls_back_to_builtin); 63/63 pass

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* doc cleanup

* fix(mm-mask): CodeRabbit findings + lint fix on PR #3625

Pre-commit failure: trailing newline missing on
docs/multimodal_assistant_mask.md (end-of-file-fixer hook).

Six CodeRabbit findings addressed:

1. Scanner: non-trainable role's end marker ignored ``include_end``.
   Under ``train_on_eos="all"``, the shared ``[/INST]`` token (user-end
   with ``include_end=False``, intentionally re-matched as assistant-start)
   leaked into loss via the user branch on Pixtral / Mistral V7 Tekken.
   Fix: gate the non-trainable branch on ``best_match.include_end`` to
   mirror the trainable branch.

2. Gemma3 ``boi_token`` lookup used ``tokenizer.special_tokens_map.get("boi_token")``,
   which never fires on real checkpoints (``special_tokens_map`` only
   holds HF's standard slots — bos/eos/pad/unk/...). Swap to direct
   attribute read ``getattr(tokenizer, "boi_token", None)``, matching
   what ``transformers.models.gemma3.processing_gemma3`` itself does.
   Updated the ``_gemma_tokenizer`` test fixture to mirror real-model
   shape so the test exercises the production code path.
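
In miniature (a sketch; the helper name is illustrative):

```python
def resolve_boi_token(tokenizer):
    # special_tokens_map only carries HF's standard slots (bos/eos/pad/unk/...),
    # so .get("boi_token") never fires on real checkpoints. A direct attribute
    # read matches what processing_gemma3 does upstream.
    return getattr(tokenizer, "boi_token", None)
```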

3. GLM dispatcher only registered ``Glm46VProcessor`` (GLM-4.6V /
   GLM-4.7V). Real ``Glm4vProcessor`` (GLM-4V / GLM-4.1V) users fell
   through to the base fallback. Both processors ship identical
   media-token markers, so register both under the shared
   ``Glm4vProcessingStrategy`` with independent try/except import blocks.
   Updated class docstring. +2 dispatcher regressions.
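
Roughly, with independent try/except blocks so a missing class never drops the other (registry shape and import paths assumed):

```python
PROCESSOR_TO_STRATEGY: dict = {}

class Glm4vProcessingStrategy:  # stand-in for the real shared strategy
    ...

try:
    from transformers import Glm4vProcessor  # GLM-4V / GLM-4.1V
    PROCESSOR_TO_STRATEGY[Glm4vProcessor] = Glm4vProcessingStrategy
except ImportError:
    pass

try:
    from transformers import Glm46VProcessor  # GLM-4.6V / GLM-4.7V
    PROCESSOR_TO_STRATEGY[Glm46VProcessor] = Glm4vProcessingStrategy
except ImportError:
    pass
```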

4. Gemma3 ``process_labels`` hardcoded 262144 for the soft image token.
   Resolve dynamically via ``tokenizer.convert_tokens_to_ids("<image_soft_token>")``
   with unk-id guard; fall back to 262144 only if the string isn't in
   vocab. Mirrors ``Gemma4ProcessingStrategy.process_labels`` pattern.
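
A sketch of the guarded lookup (helper name illustrative):

```python
def resolve_soft_image_token_id(tokenizer) -> int:
    # Dynamic lookup with unk-id guard; 262144 remains only as a fallback
    # when "<image_soft_token>" is not in vocab.
    token_id = tokenizer.convert_tokens_to_ids("<image_soft_token>")
    if token_id is None or token_id == tokenizer.unk_token_id:
        return 262144
    return token_id
```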

5. ``build_collator`` was called twice per ``build()`` (eval + train
   passes), producing two identical ``MM collator: ...`` INFO banners on
   startup. Gate the log on ``is_eval=False`` so only the training pass
   emits it.

6. Removed unused ``_mistral_common_stub`` pytest fixture (13 refs → 0,
   always returned ``None``; the dispatcher already handles missing
   ``mistral_common`` via lazy import + ``try/except``). Added
   ``test_scanner_train_on_eos_all_with_non_trainable_include_end_false``
   — a focused scanner-level lock-in for finding #1, independent of any
   specific VLM strategy.

Test count: 63 → 68 passing. Local ``pre-commit run --all-files`` green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* chore(mm-mask): hoist .tolist() out of scanner; shorten comments/docstrings

- Scanner perf: convert labels[i] to a Python list once per row so
  _match_prefix / _find_end operate on list slices instead of
  re-materializing Tensor slices via .tolist() on every probe. Cuts
  O(n*boundaries) CPython↔C boundary crossings per batch (see the
  sketch after this list).
- Markdown lint (MD001, MD040): promote two h3 section headings to h2
  under the h1; add `text` language to the verify-at-runtime fenced block.
- Shorten verbose comments/docstrings added in recent commits to
  bare-minimum "why" notes matching the repo's existing style.
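
A sketch of the hoist from the first bullet, assuming a simple probe callback:

```python
import torch

def count_probe_hits(labels: torch.Tensor, probe) -> int:
    """Illustrates the hoist: one .tolist() per row, list slices thereafter."""
    hits = 0
    for i in range(labels.shape[0]):
        row: list[int] = labels[i].tolist()  # single CPython<->C crossing
        for j in range(len(row)):
            if probe(row[j:]):  # plain list slice per probe, no Tensor ops
                hits += 1
    return hits
```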

68/68 tests, 8/8 pre-commit hooks still pass.
2026-05-05 11:25:39 -04:00
VED
c15f6cffe2 fix: FSDP FULL_STATE_DICT oom from memory leak (#3635)
* memory clean up for fsdp full state dict

* Update src/axolotl/monkeypatch/accelerate/fsdp2.py

Co-authored-by: Wing Lian <wing.lian@gmail.com>

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
2026-05-05 11:22:35 -04:00
Wing Lian
e4032fc90f Refactor separate attention flags with attn_implementation and capability/concerns feature flags (#3602)
* upgrade to torchao 0.17.0

* chore: lint

* refactor attention handling

* replace legacy attention boolean flags with capability properties

Replace checks with capability-based properties derived from attn_implementation

This separates three concerns that were conflated under flash_attention:
1. Backend selection -> attn_implementation enum
2. Packing capability -> attn_supports_packing property
3. Flash-attn library dependency -> attn_uses_flash_lib property

* compute attn capability flags in normalizer instead of properties

* make attn_implementation the single source of truth

* move attention-dependent validators to mode=after

* migrate remaining consumers to canonical attn_implementation

* expand attention tests + rewrite docs

* migrate example configs to canonical attn_implementation

* update doc snippets + reject gemma4-hybrid with non-FA2 backend

* remove dead gemma4 branch in _set_attention_config

* fix duplicate attn_implementation in gpt-oss yamls and flaky caplog tests

* drop "Phase 2" naming from attn-implementation tests

* regroup attn_implementation tests by feature concern

* clean up verbose comments and remove MD

Signed-off-by: Wing Lian <wing@axolotl.ai>
Co-authored-by: Axolotl Swarm <no-reply@axolotl.ai>

* fix(collator): pass return_dict=True at apply_chat_template top level for transformers 5.x

In transformers 5.x, ProcessorMixin.apply_chat_template gained its own
`return_dict` parameter (defaulting to False).  When return_dict=False
and tokenize=True the method returns out["input_ids"] directly — a 2-D
tensor — rather than the full BatchFeature dict.

The old code placed `return_dict=True` inside processor_kwargs.  In
transformers 5.x those kwargs are forwarded to the underlying processor
call self(...) where _merge_kwargs silently ignores any key not present
in MllamaProcessorKwargs (emitting a warning).  The outer return_dict
therefore stayed False, apply_chat_template returned the raw input_ids
tensor, and the subsequent `batch["input_ids"]` attempted to index a
2-D tensor with the 9-character string "input_ids", producing:

  IndexError: too many indices for tensor of dimension 2

The fix is to pass return_dict=True as a top-level keyword argument to
apply_chat_template (where it is actually consumed) and remove it from
processor_kwargs (where it was silently dropped).  No version guard is
needed: transformers is pinned to ==5.5.4 in pyproject.toml.
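
A sketch of the corrected call shape, folding in the dict(...) wrap from the follow-up commit below; the locals stand in for the collator's real state:

```python
def process_rows(processor, conversations, processor_kwargs: dict):
    processor_kwargs.pop("return_dict", None)  # was silently dropped here
    batch = processor.apply_chat_template(
        conversations,
        tokenize=True,
        return_dict=True,   # consumed by apply_chat_template itself in 5.x
        **processor_kwargs,
    )
    return dict(batch)      # dict shape; batch["input_ids"] stays indexable
```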

Adds a unit-level regression test (tests/test_mm_chat_collator.py) that
mocks the processor to return a raw tensor when apply_chat_template is
called without top-level return_dict=True, verifying the four invariants:
process_rows returns a dict, input_ids is 2-D, labels is 2-D, and
apply_chat_template receives return_dict=True as a top-level kwarg.

Fixes: tests/e2e/test_llama_vision.py::TestLlamaVision::test_lora_llama_vision_multimodal_dataset
Fixes: tests/e2e/test_llama_vision.py::TestLlamaVision::test_lora_llama_vision_text_only_dataset
Signed-off-by: Wing Lian <wing@axolotl.ai>
Co-authored-by: Axolotl Swarm <no-reply@axolotl.ai>

* fix(collator): process_rows returns dict (BatchFeature) shape

Two related changes for the multimodal chat collator under transformers 5.x:

1. Wrap apply_chat_template result in dict(...) so process_rows returns
   a plain dict rather than a BatchFeature instance. BatchFeature is a
   Mapping but not a dict; downstream code that did
     batch["labels"] = self.processing_strategy.process_labels(batch["input_ids"])
   would index on a tensor when the result wasn't dict-shaped, raising
     IndexError: too many indices for tensor of dimension 2

2. Soften the regression test's contract from `dict` to `Mapping` so it
   exercises the actual semantic guarantee (key/value access) rather
   than the implementation detail (dict vs BatchFeature). Test guards
   against the original transformers 5.x breakage where apply_chat_template's
   return_dict default went from True to False.

Includes regression test under tests/test_mm_chat_collator.py.

Bug surfaced via swarm dispatch task_01KQHPNAYD8XARSNSDJVW1GPF6 against
attn-implementation-refactor; squash-merged from agent commits 4de886fd
+ dc9fcf4f.

Signed-off-by: Wing Lian <wing@axolotl.ai>

---------

Signed-off-by: Wing Lian <wing@axolotl.ai>
Co-authored-by: Axolotl Swarm <no-reply@axolotl.ai>
2026-05-05 10:15:18 -04:00
Younes B
6136ae627b Fix: add bitnet config (#3636)
* add bitnet config

* chore: lint

---------

Co-authored-by: Wing Lian <wing@axolotl.ai>
2026-04-30 12:30:56 -04:00
Younes B
e662972a29 Feat: Add bitnet integration (#3634)
* add bitnet

* switch to uv

* chore: lint

---------

Co-authored-by: Wing Lian <wing@axolotl.ai>
2026-04-30 11:25:02 -04:00
NanoCode012
ebbd7fa847 feat: Add Mistral Medium 3.5 (#3633)
* fix: clarify incompat

* fix: transformers api change upstream

* fix: add pre prop

* feat: add examples

* chore: cleanup

* chore: update readme
2026-04-29 22:46:51 +07:00
Wing Lian
ac77da96da use smaller pretrained models for ci (#3620) [skip ci]
* use smaller pretrained models for ci

* more steps for loss check

* fix tests

* more train steps

* fix losses
2026-04-27 13:22:56 -04:00
NanoCode012
798c8fba89 chore: update docker docs (#3623)
2026-04-24 16:03:21 +07:00
NanoCode012
17fc747f99 fix: docker build failing (#3622)
* fix: uv leftover docs

* fix: docker build failing

* chore: doc

* fix: remove old pytorch build

* fix: stop recommend flash-attn optional, let transformers pull

* fix: remove ring flash attention from image

* fix: quotes [skip ci]

* chore: naming [skip ci]
2026-04-24 14:23:09 +07:00
337 changed files with 6122 additions and 1323 deletions

View File

@@ -31,10 +31,11 @@ PRs are **greatly welcome**!
Please run the commands below to set up the env
```bash
# Install axolotl + dev and test dependencies from lockfile
# Install axolotl + dev and test dependencies
export UV_TORCH_BACKEND=cu128 # or cu130
uv sync --extra flash-attn --extra deepspeed --group dev --group test
uv venv --no-project --relocatable
source .venv/bin/activate
uv pip install --no-build-isolation -e '.[deepspeed]' --group dev --group test
pre-commit install
# test

View File

@@ -30,14 +30,6 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: "128"
cuda_version: 12.8.1
cudnn_version: ""
python_version: "3.11"
pytorch: 2.9.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-base"
platforms: "linux/amd64,linux/arm64"
- cuda: "128"
cuda_version: 12.8.1
cudnn_version: ""
@@ -168,14 +160,6 @@ jobs:
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-uv-base"
platforms: "linux/amd64,linux/arm64"
- cuda: "128"
cuda_version: 12.8.1
cudnn_version: ""
python_version: "3.11"
pytorch: 2.9.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-uv-base"
platforms: "linux/amd64,linux/arm64"
- cuda: "128"
cuda_version: 12.8.1
cudnn_version: ""

View File

@@ -18,12 +18,6 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: 128
cuda_version: 12.8.1
python_version: "3.11"
pytorch: 2.9.0
axolotl_extras:
platforms: "linux/amd64,linux/arm64"
- cuda: 128
cuda_version: 12.8.1
python_version: "3.11"
@@ -180,12 +174,6 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: 128
cuda_version: 12.8.1
python_version: "3.11"
pytorch: 2.9.0
axolotl_extras:
platforms: "linux/amd64,linux/arm64"
- cuda: 128
cuda_version: 12.8.1
python_version: "3.11"

View File

@@ -72,7 +72,7 @@ jobs:
exclude:
- python_version: "3.14"
pytorch_version: "2.9.1"
timeout-minutes: 20
timeout-minutes: 25
steps:
- name: cleanup node

View File

@@ -29,6 +29,9 @@
## 🎉 Latest Updates
- 2026/04:
- New model support has been added in Axolotl for [Mistral Medium 3.5](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/mistral-medium-3_5) and [Gemma 4](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/gemma4).
- Axolotl is now [uv-first](https://github.com/axolotl-ai-cloud/axolotl/pull/3545) and has [SonicMoE fused LoRA](https://github.com/axolotl-ai-cloud/axolotl/pull/3519) support.
- 2026/03:
- New model support has been added in Axolotl for [Mistral Small 4](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/mistral4), [Qwen3.5, Qwen3.5 MoE](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen3.5), [GLM-4.7-Flash](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm47-flash), [GLM-4.6V](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm46v), and [GLM-4.5-Air](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm45).
- [MoE expert quantization](https://docs.axolotl.ai/docs/expert_quantization.html) support (via `quantize_moe_experts: true`) greatly reduces VRAM when training MoE models (FSDP2 compat).

SETUP_MIAAI.md Normal file
View File

@@ -0,0 +1,83 @@
# Axolotl Setup — miaai (RTX 5080, CUDA 13.2)
## System Info
- GPU: NVIDIA RTX 5080 (16GB VRAM)
- Driver: 580.126.09 — max CUDA 13.0 (nvcc from conda resolves to 13.2)
- OS: Ubuntu (Python 3.13 system — do NOT use system Python for ML)
- Axolotl branch: `activeblue/main`
## One-time Setup
### 1. Install Miniconda
```bash
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
bash miniconda.sh -b -p /opt/miniconda3
/opt/miniconda3/bin/conda init bash
source ~/.bashrc
```
### 2. Create Python 3.11 environment
```bash
conda create -n axolotl python=3.11 -y
conda activate axolotl
```
### 3. Clone and sync repo with upstream
```bash
git clone https://git.activeblue.net/tocmo0nlord/axolotl.git
cd axolotl
git remote add upstream https://github.com/axolotl-ai-cloud/axolotl.git
git fetch upstream
git rebase upstream/main # keeps activeblue patches on top
git push origin activeblue/main --force-with-lease
```
### 4. Install CUDA toolkit (needed to compile flash-attn)
```bash
conda install -y -c "nvidia/label/cuda-12.8.0" cuda-toolkit
export CUDA_HOME=$CONDA_PREFIX
export PATH=$CUDA_HOME/bin:$PATH
```
### 5. Install PyTorch — use cu132 (matches nvcc from conda)
> NOTE: torchaudio has no cu132 wheel — skip it, not needed for LLM training
```bash
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu132
python -c "import torch; print('CUDA:', torch.version.cuda); print('GPU:', torch.cuda.get_device_name(0))"
```
### 6. Install Axolotl
```bash
pip install -e "."
```
> **flash-attn compiles CUDA kernels from source — takes 15–25 min on 10 cores of an i7-14700K.**
> Always set `MAX_JOBS` to the number of available CPU cores to parallelize and speed up compilation:
```bash
MAX_JOBS=10 pip install flash-attn --no-build-isolation
```
## Every Session (after first-time setup)
```bash
export PATH="/opt/miniconda3/bin:$PATH"
conda activate axolotl
export CUDA_HOME=$CONDA_PREFIX
export PATH=$CUDA_HOME/bin:$PATH
cd /home/tocmo0nlord/axolotl
```
## Run Training
```bash
axolotl train human_chat_qlora.yml
```
## Common Pitfalls Encountered
| Problem | Cause | Fix |
|---|---|---|
| `externally-managed-environment` | System Python 3.13 blocks pip | Use conda env, never system pip |
| `No module named torch` (flash-attn) | pip builds in isolated env | Use `--no-build-isolation` |
| `CUDA_HOME not set` | CUDA toolkit not installed | `conda install cuda-toolkit` from nvidia channel |
| `CUDA version mismatch 13.2 vs 12.8` | Conda nvcc is 13.2, torch was cu128 | Reinstall torch with `--index-url .../cu132` |
| `torchaudio` not found for cu132 | No cu132 wheel exists | Skip torchaudio — not needed |
| `src refspec main does not match` | Fork default branch is `activeblue/main` | `git push origin activeblue/main` |
| flash-attn compile is slow | Single-threaded by default | Set `MAX_JOBS=<cpu_count>` before pip install |

View File

@@ -1 +1 @@
0.16.0.dev0
0.16.2.dev0

View File

@@ -311,6 +311,7 @@ website:
- docs/dataset_loading.qmd
- docs/qat.qmd
- docs/quantize.qmd
- docs/1_58bit_finetuning.qmd
- docs/optimizations.qmd
- section: "Core Concepts"

View File

@@ -24,9 +24,9 @@ WORKDIR /workspace/axolotl
# If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
RUN pip uninstall -y causal_conv1d
RUN if [ "$TARGETARCH" = "arm64" ]; then \
BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \
BASE_EXTRAS="optimizers,ray"; \
else \
BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \
BASE_EXTRAS="deepspeed,optimizers,ray"; \
fi && \
if [ "$AXOLOTL_EXTRAS" != "" ]; then \
pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \

View File

@@ -58,19 +58,3 @@ RUN git lfs install --skip-repo && \
# The base image ships with `pydantic==1.8.2` which is not working
pip3 install -U --no-cache-dir pydantic==1.10.10 && \
pip3 cache purge
# Map Python version (e.g., 3.12 -> cp312)
RUN PYTHON_CP="cp$(echo $PYTHON_VERSION | tr -d '.')" && \
# Map PyTorch version (e.g., 2.9.1 -> torch2.9, 2.10.0 -> torch2.10)
TORCH_TAG="torch$(echo $PYTORCH_VERSION | grep -oP '^\d+\.\d+')" && \
# Map architecture
case "$TARGETARCH" in \
amd64) ARCH_TAG="x86_64" ;; \
arm64) ARCH_TAG="aarch64" ;; \
*) echo "Unsupported architecture: $TARGETARCH"; exit 1 ;; \
esac && \
WHL_VERSION="v0.7.16" && \
WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-linux_${ARCH_TAG}.whl" && \
wget -nv "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}" && \
pip3 install --no-cache-dir "${WHL_FILE}" && \
rm "${WHL_FILE}"

View File

@@ -24,9 +24,9 @@ RUN git fetch origin +$GITHUB_REF && \
# If AXOLOTL_EXTRAS is set, append it in brackets
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
pip install --no-build-isolation -e .[deepspeed,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
else \
pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
pip install --no-build-isolation -e .[deepspeed,mamba-ssm] $AXOLOTL_ARGS; \
fi
# So we can test the Docker image

View File

@@ -24,9 +24,9 @@ WORKDIR /workspace/axolotl
# If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
RUN uv pip uninstall causal_conv1d
RUN if [ "$TARGETARCH" = "arm64" ]; then \
BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \
BASE_EXTRAS="optimizers,ray"; \
else \
BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \
BASE_EXTRAS="deepspeed,optimizers,ray"; \
fi && \
if [ "$AXOLOTL_EXTRAS" != "" ]; then \
uv pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \

View File

@@ -38,20 +38,3 @@ RUN uv pip install packaging setuptools wheel psutil \
RUN if [ "$TARGETARCH" = "amd64" ]; then \
MAMBA_SKIP_CUDA_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE uv pip install --no-build-isolation mamba_ssm causal_conv1d; \
fi
# Map Python version (e.g., 3.12 -> cp312)
RUN PYTHON_CP="cp$(echo $PYTHON_VERSION | tr -d '.')" && \
# Map PyTorch version (e.g., 2.9.1 -> torch2.9, 2.10.0 -> torch2.10)
TORCH_TAG="torch$(echo $PYTORCH_VERSION | grep -oP '^\d+\.\d+')" && \
LINUX_TAG="manylinux_" && \
# Map architecture
case "$TARGETARCH" in \
amd64) ARCH_TAG="2_24_x86_64.manylinux_2_28_x86_64" ;; \
arm64) ARCH_TAG="2_34_aarch64" ;; \
*) echo "Unsupported architecture: $TARGETARCH"; exit 1 ;; \
esac && \
WHL_VERSION="v0.7.16" && \
WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-${LINUX_TAG}${ARCH_TAG}.whl" && \
wget -nv "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}" && \
uv pip install --no-cache-dir "${WHL_FILE}" && \
rm "${WHL_FILE}"

View File

@@ -0,0 +1,70 @@
---
title: "1.58-bit Finetuning"
back-to-top-navigation: true
toc: true
toc-expand: 2
toc-depth: 4
---
## Overview
1.58-bit finetuning lets you finetune BitNet models whose prequantized weights are provided. In theory any LLM could be fine-tuned in the 1.58-bit format, but the performance degradation would be dramatic.
Axolotl supports 1.58-bit finetuning via the [`onebitllms`](https://github.com/tiiuae/onebitllms) library, which replaces standard linear layers with BitNet-compatible counterparts ready to use for training.
::: {.callout-note}
LoRA is not supported for BitNet models
:::
## Installation
Install the `onebitllms` package before using this feature:
```bash
uv pip install onebitllms
```
Or from source:
```bash
uv pip install git+https://github.com/tiiuae/onebitllms
```
## Supported models
For now, only the `Falcon-E` series of models is supported. Make sure to use the `-prequantized` versions:
```bash
tiiuae/Falcon-E-3B-Base-prequantized
tiiuae/Falcon-E-1B-Base-prequantized
```
In theory any other model would 'work', but the performance degradation would be huge; this remains an area of exploration.
## Configuration
To enable 1.58-bit finetuning, set the following in your configuration file:
```yaml
base_model: tiiuae/Falcon-E-3B-Base-prequantized # A BitNet-compatible model
use_onebitllms: true
```
::: {.callout-note}
For BitNet models, we recommend a learning rate roughly 10x higher than for standard models.
:::
## Considerations after training
Once your model has been trained with 1.58-bit finetuning, you can convert it to ternary format using the `onebitllms` CLI:
```bash
onebitllms quantize_to_1bit INPUT_PATH OUTPUT_PATH
```
After that, you can run the trained model with supported packages such as `llama.cpp` or Apple's MLX.
## Example Configuration
You can find example configurations in `examples/falcon-e`, which contains one configuration for SFT and one for DPO.

View File

@@ -121,11 +121,11 @@ Older models that use `_prepare_4d_causal_attention_mask` (Llama, Mistral, Qwen2
| Backend | Config | head_dim limit | torch_compile | Notes |
|---------|--------|---------------|---------------|-------|
| FA2 | `flash_attention: true` | 256 | ✅ | Fastest when supported |
| FA4 | auto with `flash_attention: true` | 256 (SM90+) | ✅ | Auto-detected on H100+ |
| SDPA | `sdp_attention: true` | None | ✅ | Universal fallback |
| flex | `flex_attention: true` | None | ⚠️ Triton OOM for large head_dim | Good for variable head dims |
| eager | neither set | None | ✅ | Slowest, always works |
| FA2 | `attn_implementation: flash_attention_2` | 256 | ✅ | Fastest when supported |
| FA4 | auto with `attn_implementation: flash_attention_2` | 256 (SM90+) | ✅ | Auto-detected on H100+ |
| SDPA | `attn_implementation: sdpa` | None | ✅ | Universal fallback |
| flex | `attn_implementation: flex_attention` | None | ⚠️ Triton OOM for large head_dim | Good for variable head dims |
| eager | `attn_implementation: eager` | None | ✅ | Slowest, always works |
**Check model support**: Look at `_supports_flash_attn_2`, `_supports_flex_attn`, `_supports_sdpa` attributes on the model class.

View File

@@ -83,7 +83,7 @@ Watch for: loss never decreasing (check `train_on_inputs`, dataset, LR), loss go
| Issue | Fix |
|-------|-----|
| OOM during training | Reduce `micro_batch_size`, enable `gradient_checkpointing`, reduce `sequence_len` |
| `sample_packing` + SDPA + bf16 = 0.0 loss | Use `flash_attention: true` or disable `sample_packing` |
| `sample_packing` + SDPA + bf16 = 0.0 loss | Use `attn_implementation: flash_attention_2` or disable `sample_packing` |
| Missing chat template error | Set `chat_template: chatml` explicitly |
| Label masking wrong | Run `axolotl preprocess config.yaml --debug` and inspect labels |
| Loss NaN | Use `bf16: auto`, lower LR, check data for empty samples |

View File

@@ -3,28 +3,71 @@ title: Attention
description: Supported attention modules in Axolotl
---
## SDP Attention
This is the default built-in attention in PyTorch.
Axolotl routes attention via a single config field:
```yaml
sdp_attention: true
attn_implementation: <backend>
```
For more details: [PyTorch docs](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
`attn_implementation` is passed through to `transformers` verbatim (via
`model.config._attn_implementation`). Accepted values are the HF-native
backends, axolotl-registered backends, or a hub-kernel path.
## Flash Attention
## Backends
Axolotl supports Flash Attention 2, 3, and 4. The best available version is used automatically
based on your installed packages and GPU.
| `attn_implementation` | Description |
|---|---|
| `eager` | Plain PyTorch attention. No packing support. |
| `sdpa` | PyTorch `scaled_dot_product_attention`. No packing support. |
| `flash_attention_2` | Dao-AILab Flash Attention 2. |
| `flash_attention_3` | Dao-AILab Flash Attention 3 (Hopper+). |
| `flex_attention` | Torch Flex Attention (requires torch ≥ 2.6). |
| `xformers` | xFormers memory-efficient attention. |
| `sage` | SageAttention (QK int8 / PV fp16). |
| `s2` | Shifted-Sparse Attention (LLaMA only, FA2 under the hood). |
| `fp8` | torchao FP8 low-precision attention (requires SM90+, torch ≥ 2.11). Loaded as SDPA and patched post-load. |
| `kernels-community/flash-attn3` | HF hub FA3 kernel. |
| `kernels-community/sage-attention` | HF hub SageAttention kernel. |
| Other `<org>/<name>` path | Any hub-kernel path supported by `transformers`. |
Short-form aliases (`flash`, `fa2`, `flex`, `sdp`, etc.) are **not accepted** —
set the canonical name above.
### Capability flags
Axolotl derives three boolean capability flags from `attn_implementation` and
exposes them on the validated config:
- `cfg.attn_supports_packing` — backend supports varlen sample packing via
`position_ids`. Gates multipack patches and `sample_packing_drop_attention_mask`.
- `cfg.attn_uses_flash_lib` — backend needs the `flash_attn` (Dao-AILab)
monkeypatches (FA4 auto, LLaMA flash hijack, ring-FA).
- `cfg.attn_needs_dtype_cast` — backend requires fp16/bf16 embeddings
(everything except `eager` and `sdpa`).
These are **computed** — they cannot be overridden from YAML.
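A rough sketch of the derivation, under assumed per-backend capabilities (the normalizer in axolotl is the source of truth):

```python
def derive_attn_flags(attn_implementation: str) -> dict[str, bool]:
    # s2 is loaded as FA2 under the hood, so it needs the flash_attn library
    uses_flash_lib = attn_implementation in (
        "flash_attention_2", "flash_attention_3", "s2",
    )
    return {
        # assumed: packing needs a varlen-capable backend; s2 has no packing
        "attn_supports_packing": attn_implementation in (
            "flash_attention_2", "flash_attention_3", "flex_attention",
        ),
        "attn_uses_flash_lib": uses_flash_lib,
        # everything except eager and sdpa requires fp16/bf16 embeddings
        "attn_needs_dtype_cast": attn_implementation not in ("eager", "sdpa"),
    }
```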
## Per-backend notes
### SDPA
Default PyTorch attention. See
[PyTorch docs](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html).
```yaml
flash_attention: true
attn_implementation: sdpa
```
For more details: [Flash Attention](https://github.com/Dao-AILab/flash-attention/)
### Flash Attention
### Flash Attention 2
Axolotl supports FA2, FA3, and FA4. The best available version is used
automatically based on your installed packages and GPU.
```yaml
attn_implementation: flash_attention_2 # or flash_attention_3
```
#### Flash Attention 2
Requirements: Ampere, Ada, or Hopper GPUs (Turing or lower not supported)
@@ -39,23 +82,25 @@ Alternatively, try reinstall or downgrade a version.
:::
### Flash Attention 3
#### Flash Attention 3
Requirements: Hopper only and CUDA 12.8 (recommended)
```bash
git clone https://github.com/Dao-AILab/flash-attention.git
cd flash-attention/hopper
python setup.py install
```
### Flash Attention 4
#### Flash Attention 4
Requirements: Hopper or Blackwell GPUs
Requirements: Hopper or Blackwell GPUs. Auto-applied when `attn_uses_flash_lib`
is true and FA4 is importable.
FA4 is still a pre-release on PyPI, so `--pre` is required:
```bash
pip install flash-attn-4
pip install --pre flash-attn-4
```
Or from source:
@@ -63,7 +108,6 @@ Or from source:
```bash
git clone https://github.com/Dao-AILab/flash-attention.git
cd flash-attention/flash_attn/cute
pip install -e .
# FA2's flash_attn package includes a cute/ stub that shadows FA4.
@@ -86,93 +130,113 @@ and falls back to FA2/3.
:::
For more details: [flash-attention/flash_attn/cute](https://github.com/Dao-AILab/flash-attention/tree/main/flash_attn/cute)
### AMD
Requirements: ROCm 6.0 and above.
Requirements: ROCm 6.0 and above. See
[Flash Attention AMD docs](https://github.com/Dao-AILab/flash-attention/tree/main?tab=readme-ov-file#amd-rocm-support).
See [Flash Attention AMD docs](https://github.com/Dao-AILab/flash-attention/tree/main?tab=readme-ov-file#amd-rocm-support).
## Flex Attention
A flexible PyTorch API for attention used in combination with `torch.compile`.
### Flex Attention
```yaml
flex_attention: true
# recommended
torch_compile: true
attn_implementation: flex_attention
torch_compile: true # recommended
```
::: {.callout-note}
Requires torch ≥ 2.6. See [PyTorch docs](https://pytorch.org/blog/flexattention/).
We recommend using the latest stable version of PyTorch for best performance.
### SageAttention
:::
For more details: [PyTorch docs](https://pytorch.org/blog/flexattention/)
## SageAttention
Attention kernels with QK Int8 and PV FP16 accumulator.
Requirements: Ampere, Ada, or Hopper GPUs.
```yaml
sage_attention: true
attn_implementation: sage
```
Requirements: Ampere, Ada, or Hopper GPUs
```bash
pip install sageattention==2.2.0 --no-build-isolation
```
::: {.callout-warning}
Only LoRA/QLoRA recommended at the moment. We found loss drop to 0 for full finetuning. See [GitHub Issue](https://github.com/thu-ml/SageAttention/issues/198).
Only LoRA/QLoRA recommended. Full finetuning has been observed to drop loss to 0. See
[GitHub Issue](https://github.com/thu-ml/SageAttention/issues/198).
:::
For more details: [Sage Attention](https://github.com/thu-ml/SageAttention)
For more details: [Sage Attention](https://github.com/thu-ml/SageAttention).
::: {.callout-note}
We do not support SageAttention 3 at the moment. If you are interested in adding this or improving the SageAttention implementation, please open an Issue.
:::
## xFormers
### xFormers
```yaml
xformers_attention: true
attn_implementation: xformers
```
::: {.callout-tip}
We recommend using with Turing GPUs or below (such as on Colab).
Recommended for Turing GPUs or below (e.g. Colab T4).
:::
For more details: [xFormers](https://github.com/facebookresearch/xformers)
## Shifted Sparse Attention
### Shifted Sparse Attention
::: {.callout-warning}
We plan to deprecate this! If you use this feature, we recommend switching to methods above.
Planned for deprecation. Prefer one of the backends above.
:::
Requirements: LLaMA model architecture
Requirements: LLaMA model architecture. Loaded as FA2 under the hood and
patched to implement shifted-sparse attention. Does not support sample packing.
```yaml
flash_attention: true
s2_attention: true
attn_implementation: s2
```
::: {.callout-tip}
### FP8
No sample packing support!
torchao low-precision attention. Loaded as SDPA and patched post-load.
Requirements: SM90+ (Hopper/Blackwell), PyTorch ≥ 2.11, torchao ≥ 0.17,
flash-attn with FA3. KV caching must be disabled.
```yaml
attn_implementation: fp8
```
### Hub kernels
```yaml
attn_implementation: kernels-community/flash-attn3
```
Passed through to `transformers`; axolotl does not install the kernel itself.
For recognized hub paths the capability flags are set automatically; for
arbitrary paths axolotl uses conservative defaults (`attn_supports_packing=False`,
`attn_uses_flash_lib=False`).
## Migrating from legacy boolean flags
The following legacy config fields are **deprecated** and will be removed in a
future release. Each emits a `DeprecationWarning` when set and is stripped from
the validated config.
| Legacy | Canonical |
|---|---|
| `flash_attention: true` | `attn_implementation: flash_attention_2` |
| `sdp_attention: true` | `attn_implementation: sdpa` |
| `xformers_attention: true` | `attn_implementation: xformers` |
| `flex_attention: true` | `attn_implementation: flex_attention` |
| `sage_attention: true` | `attn_implementation: sage` |
| `s2_attention: true` | `attn_implementation: s2` |
| `eager_attention: true` | `attn_implementation: eager` |
Combining `attn_implementation` with a legacy flag (e.g. `attn_implementation:
flash_attention_2` **and** `flash_attention: true`) raises a validation error — pick one.
::: {.callout-note}
Existing example configs under `examples/` still use the legacy flags. They
continue to work with a deprecation warning; they will be migrated in a
follow-up pass.
:::

View File

@@ -77,8 +77,9 @@ Make sure you have an [editable install](https://setuptools.pypa.io/en/latest/us
```bash
export UV_TORCH_BACKEND=cu128 # or cu130
uv sync --extra flash-attn --extra deepspeed --group dev --group test
uv venv --no-project --relocatable
source .venv/bin/activate
uv pip install --no-build-isolation -e '.[deepspeed]' --group dev --group test
```
#### Remote Hosts
@@ -218,8 +219,9 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --
You will now be in the container. Next, install Axolotl with dev dependencies:
```bash
uv sync --extra flash-attn --extra deepspeed --group dev --group test
uv venv --no-project --relocatable
source .venv/bin/activate
uv pip install --no-build-isolation -e '.[deepspeed]' --group dev --group test
```
### Attach To Container

View File

@@ -10,13 +10,16 @@ This section describes the different Docker images that are released by AxolotlA
[Docker Hub](https://hub.docker.com/u/axolotlai).
::: {.callout-important}
For Blackwell GPUs, please use the tags with PyTorch 2.9.1 and CUDA 12.8.
:::
### Switch to the `-uv` images
::: {.callout-tip}
Each image below is available in a **uv variant** that uses [uv](https://docs.astral.sh/uv/) with
a relocatable venv (`/workspace/axolotl-venv`) instead of Miniconda + pip. Append `-uv` to the image name
(e.g. `axolotlai/axolotl-base-uv`). Tags follow the same format. We recommend the uv images for new deployments.
Each image below ships a **uv variant** that uses [uv](https://docs.astral.sh/uv/) with a relocatable venv
(`/workspace/axolotl-venv`) instead of Miniconda + pip. Append `-uv` to the image name
(e.g. `axolotlai/axolotl-uv`, `axolotlai/axolotl-base-uv`, `axolotlai/axolotl-cloud-uv`). Tags follow the
same format as their non-uv counterparts.
**We recommend switching to the `-uv` images early.** In the near future we will publish the uv-based
build to the non-uv tags as well. The non-uv names will continue to work, but they will start serving
the uv image.
:::
## Base
@@ -85,7 +88,7 @@ Tags examples:
- `main-py3.12-cu130-2.10.0`
- `main-latest`
- `main-20260315-py3.11-cu128-2.9.1`
- `0.12.0`
- `0.16.1`
## Cloud

View File

@@ -129,7 +129,7 @@ gradient_accumulation_steps: 4
max_steps: 20
learning_rate: 5.0e-6
bf16: auto
flash_attention: true
attn_implementation: flash_attention_2
gradient_checkpointing: true
output_dir: ./outputs/ebft-quickstart
```
@@ -304,7 +304,7 @@ lora_alpha: 32
lora_target_linear: true
bf16: auto
flex_attention: true
attn_implementation: flex_attention
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: true # Required with flex_attention

View File

@@ -57,7 +57,7 @@ description: Frequently asked questions
**Q: vLLM is not working with Axolotl**
> A: We currently recommend torch 2.6.0 for use with `vllm`. Please ensure you use the right version. For Docker, please use the `main-py3.11-cu124-2.6.0` tag.
> A: We currently recommend torch 2.10 for use with `vllm`. Please ensure you use the right version. For Docker, please use the `main-py3.12-cu128-2.10.0` tag (note: torch 2.10 images are built with Python 3.12).
**Q: FA2 2.8.0 `undefined symbol` runtime error on CUDA 12.4**

View File

@@ -154,7 +154,7 @@ lr_scheduler: cosine
warmup_steps: 10
bf16: true
flash_attention: true
attn_implementation: flash_attention_2
gradient_checkpointing: true
special_tokens:

View File

@@ -15,7 +15,7 @@ This guide covers all the ways you can install and set up Axolotl for your envir
- NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
- Python ≥3.11
- PyTorch ≥2.9.0
- PyTorch ≥2.9.1
## Installation {#sec-installation}
@@ -36,9 +36,9 @@ source $HOME/.local/bin/env
Choose your CUDA version (e.g. `cu128`, `cu130`), create a venv, and install:
```{.bash}
export UV_TORCH_BACKEND=cu128 # or cu130
uv venv --no-project --relocatable
uv venv
source .venv/bin/activate
uv pip install --no-build-isolation axolotl[flash-attn,deepspeed]
uv pip install --no-build-isolation axolotl[deepspeed]
```
### Edge/Development Build {#sec-edge-build}
@@ -49,12 +49,11 @@ For the latest features between releases:
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl
export UV_TORCH_BACKEND=cu128 # or cu130
uv sync --extra flash-attn --extra deepspeed
uv venv
source .venv/bin/activate
uv pip install --no-build-isolation -e '.[deepspeed]'
```
`uv sync` creates a `.venv`, installs exact pinned versions from `uv.lock`, and sets up an editable install automatically.
### Docker {#sec-docker}
```{.bash}
@@ -132,11 +131,11 @@ source $HOME/.local/bin/env
# Create a fresh venv (recommended for a clean start)
export UV_TORCH_BACKEND=cu128 # or cu130
uv venv --no-project --relocatable
uv venv
source .venv/bin/activate
# Reinstall axolotl
uv pip install --no-build-isolation axolotl[flash-attn,deepspeed]
uv pip install --no-build-isolation axolotl[deepspeed]
```
## Using pip (Alternative) {#sec-pip}
@@ -151,13 +150,13 @@ Follow the instructions at: [https://pytorch.org/get-started/locally/](https://p
```{.bash}
pip3 install -U packaging setuptools wheel ninja
pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
pip3 install --no-build-isolation axolotl[deepspeed]
```
For editable/development installs:
```{.bash}
pip3 install -U packaging setuptools wheel ninja
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
pip3 install --no-build-isolation -e '.[deepspeed]'
```
## Troubleshooting {#sec-troubleshooting}

View File

@@ -0,0 +1,84 @@
# Multimodal assistant-only loss masking
## Correct placement
```yaml
# Top-level: only train_on_inputs lives here.
train_on_inputs: false
datasets:
- path: data/train.jsonl
type: chat_template
roles_to_train: # per-dataset — this is what the MM scanner reads
- assistant
train_on_eos: turn # per-dataset — same
test_datasets:
- path: data/val.jsonl
type: chat_template
split: train
roles_to_train:
- assistant
train_on_eos: turn
```
## How to verify at runtime
`build_collator` logs the resolved knobs at INFO:
```text
MM collator: train_on_inputs=False roles_to_train=['assistant'] train_on_eos=turn role_boundaries_override=none
```
If `roles_to_train` logs as `None`, the YAML knobs are not reaching the
scanner — check that they are under `datasets[0]`, not at the root.
Each verified strategy additionally logs its resolved boundary token ids at
strategy init (e.g. `<|turn>model` → `[105, 4368]`, `<turn|>` → `[106]` for
Gemma 4). If a strategy emits the "has no built-in role boundaries ... only
pad and media tokens are masked" one-shot warning instead, it is on the
fallback path — declare per-role markers in YAML via `cfg.role_boundaries`
(below) to activate masking. The strategies currently on this path are
listed in the audit table above under `fallback + warn`.
## Config-based override: `cfg.role_boundaries`
For the "unverified" strategies above, or for custom chat templates that
don't match a built-in strategy's markers, users can declare role boundaries
directly in YAML without subclassing:
```yaml
role_boundaries:
- role: assistant
start: "<|turn>model"
end: "<turn|>"
- role: user
start: "<|turn>user"
end: "<turn|>"
# Optional keys:
# include_start: false # default False
# include_end: true # default True, respects cfg.train_on_eos
# end: eos_token # sentinel: resolves to tokenizer.eos_token_id
# end: null # span runs to end of sequence
```
Semantics:
- `start` and `end` are literal strings; axolotl encodes them at strategy
init via `tokenizer.encode(..., add_special_tokens=False)` and logs the
resolved token-id sequences at INFO level.
- The special value `end: eos_token` is the portable way to express
"Pixtral-style assistant turns end at EOS" without hard-coding an id.
- `role_boundaries` is an **opt-in override**. A non-empty list **replaces**
the strategy's built-in declarations wholesale (partial overlays are
intentionally unsupported — they're hard to reason about at review time).
Leaving the field unset *or* setting it to an empty list (`[]`) both mean
"use the strategy's built-ins." Writing `role_boundaries: []` is almost
always a typo or leftover — honoring it literally would produce all-masked
labels and zero gradient, so it is treated the same as unset.
- `cfg.roles_to_train` still governs which declared roles contribute to
loss. You can declare `user` and `assistant` boundaries and set
`roles_to_train: ["assistant"]` to have the scanner correctly identify
user spans as masking boundaries without training on their content.
- Invalid specs fail loudly at strategy init (missing `role`/`start`,
unencodable markers), not silently at loss-compute time.
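Roughly, the init-time encoding and fail-loud validation (helper name illustrative; the encode call follows the semantics above):

```python
def encode_marker(tokenizer, spec: dict) -> list[int]:
    if "role" not in spec or "start" not in spec:
        raise ValueError(f"role_boundaries entry missing role/start: {spec}")
    ids = tokenizer.encode(spec["start"], add_special_tokens=False)
    if not ids:
        raise ValueError(f"marker {spec['start']!r} encodes to no tokens")
    return ids  # logged at INFO so users can eyeball the resolved ids
```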

View File

@@ -22,12 +22,12 @@ Improves GPU utilization by combining multiple short sequences into a single pac
Using an optimized attention implementation is critical for training speed.
- **[Flash Attention 2](https://github.com/Dao-AILab/flash-attention)**: `flash_attention: true`. **(Recommended)** The industry standard for fast attention on modern GPUs. Requires Ampere or higher. For AMD, check [AMD Support](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#amd-rocm-support).
- **[Flex Attention](https://pytorch.org/blog/flexattention/)**: `flex_attention: true`.
- **[SDP Attention](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)**: `sdp_attention: true`. PyTorch's native implementation.
- **[Xformers](https://github.com/facebookresearch/xformers)**: `xformers_attention: true`. Works with FP16.
- **[Flash Attention 2](https://github.com/Dao-AILab/flash-attention)**: `attn_implementation: flash_attention_2`. **(Recommended)** The industry standard for fast attention on modern GPUs. Requires Ampere or higher. For AMD, check [AMD Support](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#amd-rocm-support).
- **[Flex Attention](https://pytorch.org/blog/flexattention/)**: `attn_implementation: flex_attention`.
- **[SDP Attention](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)**: `attn_implementation: sdpa`. PyTorch's native implementation.
- **[Xformers](https://github.com/facebookresearch/xformers)**: `attn_implementation: xformers`. Works with FP16.
*Note: You should only enable one attention backend.*
See [Attention](attention.qmd) for the full list of backends and the canonical values.
### LoRA Optimizations

View File

@@ -1147,8 +1147,7 @@ datasets:
type: ebft_strided_structured.transform
split: train[:1%]
flash_attention: false
flex_attention: true # Strided mode uses flex_attention
attn_implementation: flex_attention # Strided mode uses flex_attention
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: true # Required for flex_attention

View File

@@ -20,6 +20,8 @@ examples:
title: Arcee AFM
# MistralAI
- name: mistral-medium-3_5
title: Mistral Medium 3.5
- name: ministral3/think
title: Ministral 3 Thinking
- name: ministral3/vision

View File

@@ -55,7 +55,7 @@ To use sequence parallelism, you need:
## Limitations
- Flash attention must be enabled for this to work (`flash_attention: true` in config YAML)
- Flash attention must be enabled for this to work (`attn_implementation: flash_attention_2` in config YAML)
- May have a small performance overhead due to communication between GPUs
## Example

View File

@@ -245,7 +245,7 @@ For GRPO, also reduce `max_completion_length`. Memory scales quadratically with
Reduces attention memory from O(n^2) to O(n):
```yaml
flash_attention: true
attn_implementation: flash_attention_2
```
### Step 6: Offload with DeepSpeed

View File

@@ -15,7 +15,7 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r
Here is an example of how to install from pip:
```bash
# Ensure you have a compatible version of Pytorch installed
uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
uv pip install --no-build-isolation 'axolotl>=0.16.1'
```
2. Run one of the finetuning examples below.

View File

@@ -39,7 +39,7 @@ tf32: true
gradient_checkpointing: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 2

View File

@@ -48,7 +48,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 2

View File

@@ -50,8 +50,7 @@ tf32: true
gradient_checkpointing: true
logging_steps: 1
flash_attention: true
eager_attention:
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 1

View File

@@ -39,7 +39,7 @@ activation_offloading: legacy
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_steps: 100
saves_per_epoch: 1

View File

@@ -39,7 +39,7 @@ activation_offloading: legacy
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_steps: 100
saves_per_epoch: 1

View File

@@ -11,11 +11,11 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
Here is an example of how to install from main for pip:
```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
# Ensure you have Pytorch installed (Pytorch 2.9.1 min)
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl
uv pip install --no-build-isolation -e '.[flash-attn]'
uv pip install --no-build-isolation -e '.'
# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
python scripts/cutcrossentropy_install.py | sh

View File

@@ -55,7 +55,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 1

View File

@@ -13,11 +13,11 @@ Thanks to the team at Arcee.ai for using Axolotl in supervised fine-tuning the A
Here is an example of how to install from main for pip:
```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
# Ensure you have Pytorch installed (Pytorch 2.9.1 min)
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl
uv pip install --no-build-isolation -e '.[flash-attn]'
uv pip install --no-build-isolation -e '.'
# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
python scripts/cutcrossentropy_install.py | sh

View File

@@ -55,7 +55,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 1

View File

@@ -59,8 +59,7 @@ gradient_checkpointing: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
sdp_attention:
attn_implementation: flash_attention_2
flash_optimum:
gptq_groupsize:

View File

@@ -39,8 +39,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
xformers_attention: true
flash_attention:
attn_implementation: xformers
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1

View File

@@ -45,7 +45,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -46,7 +46,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -45,7 +45,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -46,7 +46,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -45,7 +45,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -46,7 +46,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -52,7 +52,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch:

View File

@@ -55,7 +55,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch:

View File

@@ -39,7 +39,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch:

View File

@@ -45,7 +45,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 1

View File

@@ -43,8 +43,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
xformers_attention: true
flash_attention:
attn_implementation: xformers
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1

View File

@@ -73,8 +73,7 @@ early_stopping_patience: 3
resume_from_checkpoint:
auto_resume_from_checkpoints: true
logging_steps: 1
xformers_attention: true
flash_attention:
attn_implementation: xformers
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1

View File

@@ -40,8 +40,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
xformers_attention: true
flash_attention:
attn_implementation: xformers
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1

View File

@@ -47,7 +47,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -36,8 +36,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
xformers_attention: true
flash_attention:
attn_implementation: xformers
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1

View File

@@ -37,8 +37,7 @@ bf16: auto
tf32: true
resume_from_checkpoint:
logging_steps: 5
xformers_attention: true
flash_attention:
attn_implementation: xformers
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1

View File

@@ -39,7 +39,6 @@ bf16: auto
tf32: true
resume_from_checkpoint:
logging_steps: 5
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1

View File

@@ -39,7 +39,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1

View File

@@ -47,7 +47,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1

View File

@@ -40,7 +40,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1

View File

@@ -47,7 +47,6 @@ tf32: false
gradient_checkpointing: false
resume_from_checkpoint:
logging_steps: 1
flash_attention:
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -47,7 +47,6 @@ tf32: false
gradient_checkpointing: false
resume_from_checkpoint:
logging_steps: 1
flash_attention:
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -43,7 +43,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -46,7 +46,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -40,7 +40,6 @@ bf16: auto
tf32: true
resume_from_checkpoint:
logging_steps: 5
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1

View File

@@ -38,7 +38,6 @@ tf32: true
gradient_checkpointing:
resume_from_checkpoint:
logging_steps: 1
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1

View File

@@ -44,7 +44,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
flash_attn_cross_entropy: false
flash_attn_rms_norm: true
flash_attn_fuse_mlp: true

View File

@@ -47,7 +47,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
flash_attn_cross_entropy: false
flash_attn_rms_norm: true

View File

@@ -46,7 +46,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -47,7 +47,6 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: false
warmup_ratio: 0.1
evals_per_epoch: 0

View File

@@ -45,7 +45,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -36,7 +36,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch:

View File

@@ -47,7 +47,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -71,8 +71,7 @@ early_stopping_patience: 3
resume_from_checkpoint:
auto_resume_from_checkpoints: true
logging_steps: 1
xformers_attention: true
flash_attention:
attn_implementation: xformers
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1

View File

@@ -10,7 +10,7 @@ load_in_4bit: true
sequence_len: 1024
bf16: auto
tf32: false
flash_attention: true
attn_implementation: flash_attention_2
special_tokens:
bos_token: "<|startoftext|>"
eos_token: "<|endoftext|>"

View File

@@ -48,7 +48,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch:

View File

@@ -36,12 +36,7 @@
"id": "msOCO4NRmRLa"
},
"outputs": [],
"source": [
"%%capture\n",
"# This step can take ~5-10 minutes to install dependencies\n",
"!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@fec1a88\""
]
"source": "%%capture\n# This step can take ~5-10 minutes to install dependencies\n!pip install --no-build-isolation \"axolotl>=0.16.1\"\n!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@fec1a88\""
},
{
"cell_type": "markdown",

View File

@@ -45,7 +45,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 1

View File

@@ -45,7 +45,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 1

View File

@@ -35,7 +35,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 2

View File

@@ -59,7 +59,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 2

View File

@@ -15,8 +15,8 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
Here is an example of how to install from pip:
```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
# Ensure you have Pytorch installed (Pytorch 2.9.1 min)
uv pip install --no-build-isolation 'axolotl>=0.16.1'
```
2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage
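A sketch of that step, using the CCE install helper invoked by the install-from-main guides elsewhere in this diff:

```bash
# The helper emits the pip install command for the pinned CCE build; pipe it to sh to execute it.
python scripts/cutcrossentropy_install.py | sh
```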

View File

@@ -26,7 +26,6 @@ lora_model_dir:
sequence_len: 2048
sample_packing: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0
@@ -51,8 +50,8 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
scaling_softmax: true
attn_implementation: flash_attention_2
# scaling_softmax: true # needs flex_attention
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -29,7 +29,7 @@ output_dir: ./outputs/ndp-out/
sequence_len: 2048
sample_packing: true
flash_attention: true
attn_implementation: flash_attention_2
gradient_accumulation_steps: 1
micro_batch_size: 1

View File

@@ -26,7 +26,7 @@ output_dir: ./outputs/ndp-out/
sequence_len: 8192
sample_packing: true
flash_attention: true
attn_implementation: flash_attention_2
gradient_accumulation_steps: 1
micro_batch_size: 1 # must be 1 when using context parallel

View File

@@ -65,8 +65,7 @@ early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
weight_decay: 0.0

View File

@@ -46,7 +46,7 @@ lora_dropout: 0.05
lora_target_linear: true
bf16: auto
flash_attention: true
attn_implementation: flash_attention_2
gradient_checkpointing: true
special_tokens:

View File

@@ -66,7 +66,7 @@ lora_target_linear: true
# --- Hardware ---
bf16: auto
flash_attention: true
attn_implementation: flash_attention_2
gradient_checkpointing: true
special_tokens:

View File

@@ -47,8 +47,7 @@ lora_dropout: 0.05
lora_target_linear: true
bf16: auto
flash_attention: false # strided EBFT overrides to flex_attention (or eager fallback) at runtime
flex_attention: true # fused flex_attention kernel compiles itself; don't set torch_compile: true
attn_implementation: flex_attention
# (full-model compile conflicts with gradient checkpointing + flex_attention)
gradient_checkpointing: true
gradient_checkpointing_kwargs:

View File

@@ -46,7 +46,6 @@ lora_dropout: 0.05
lora_target_linear: true
bf16: auto
flash_attention: false # strided EBFT overrides to flex_attention (or eager fallback) at runtime
gradient_checkpointing: true
special_tokens:

View File

@@ -48,7 +48,6 @@ lora_target_linear: true
bf16: auto
torch_dtype: bfloat16
flash_attention: false
gradient_checkpointing: true
torch_compile: true
gradient_checkpointing_kwargs:

View File

@@ -41,7 +41,6 @@ warmup_steps: 10
weight_decay: 0.01
bf16: auto
flash_attention: false # strided EBFT uses flex_attention at runtime
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false

View File

@@ -72,7 +72,7 @@ lora_dropout: 0.0
lora_target_modules: ".*\\.layers\\.(3|7|11|15|19|23|27|31)\\.self_attn\\.(q|k|v|o)_proj|.*\\.mlp\\.(gate|up|down)_proj"
bf16: auto
flash_attention: true
attn_implementation: flash_attention_2
gradient_checkpointing: true
special_tokens:

View File

@@ -63,7 +63,7 @@ lora_dropout: 0.0
lora_target_modules: ".*\\.layers\\.(3|7|11|15|19|23|27|31)\\.self_attn\\.(q|k|v|o)_proj|.*\\.mlp\\.(gate|up|down)_proj"
bf16: auto
flash_attention: true
attn_implementation: flash_attention_2
gradient_checkpointing: true
special_tokens:

View File

@@ -68,7 +68,7 @@ lora_dropout: 0.0
lora_target_modules: ".*\\.layers\\.(3|7|11|15|19|23|27|31)\\.self_attn\\.(q|k|v|o)_proj|.*\\.mlp\\.(gate|up|down)_proj"
bf16: auto
flash_attention: true
attn_implementation: flash_attention_2
gradient_checkpointing: true
special_tokens:

View File

@@ -0,0 +1,93 @@
base_model: axolotl-ai-co/Falcon-E-1.2-3B-Exp-prequantized
output_dir: ./output
plugins:
- axolotl.integrations.kernels.KernelsPlugin
use_kernels: false
use_scattermoe: false
use_sonicmoe: false
use_onebitllms: true
load_in_8bit: false
load_in_4bit: false
chat_template: tokenizer_default
rl: dpo
datasets:
- path: allenai/Dolci-Think-DPO-7B
split: train
type: chatml.ultra
dataset_prepared_path: ./axolotl_dataset_cache
sequence_len: 8192
trust_remote_code: false
gradient_accumulation_steps: 4 # This can run on 4 GPUs
# Very important to enable gradient accumulation with FSDP
# https://github.com/huggingface/transformers/issues/29425
accelerator_config:
gradient_accumulation_kwargs:
sync_each_batch: True
micro_batch_size: 1
num_epochs: 3
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 1.0e-5
# adamw hyperparams
adam_beta1: 0.9
adam_beta2: 0.95
bf16: true
tf32: false
logging_steps: 1
flash_attention: true
loss_watchdog_threshold: 15.0
loss_watchdog_patience: 3
warmup_steps: 128
evals_per_epoch: 0
save_steps: 500
save_strategy: steps
weight_decay: 0.01
shuffle_merged_datasets: true
experimental_skip_move_to_device: true
fsdp_version: 2
fsdp_config:
offload_params: false
auto_wrap_policy: TRANSFORMER_BASED_WRAP
transformer_layer_cls_to_wrap: LlamaDecoderLayer
state_dict_type: FULL_STATE_DICT
reshard_after_forward: true
activation_checkpointing: true
# Comment to disable CP
# The number of GPUs to shard the model parameters across (FSDP dimension).
dp_shard_size: 1
# The number of times to replicate the sharded model (DDP dimension).
dp_replicate_size: 1
# Number of GPUs for Tensor Parallelism.
tensor_parallel_size: 1 # (default is 1, no TP)
# Number of GPUs for Context/Sequence Parallelism.
context_parallel_size: 1 # (default is 1, no CP)
special_tokens:
eos_token: <|end_of_text|>
eot_tokens:
- <|im_end|>

View File

@@ -0,0 +1,100 @@
base_model: tiiuae/Falcon-E-3B-Base-prequantized
output_dir: ./output
plugins:
- axolotl.integrations.kernels.KernelsPlugin
use_kernels: false
use_scattermoe: false
use_sonicmoe: false
use_onebitllms: true
load_in_8bit: false
load_in_4bit: false
chat_template: tokenizer_default
datasets:
- path: cgato/SlimOrcaDedupCleaned
type: chat_template
field_messages: conversations
message_property_mappings:
role: from
content: value
dataset_prepared_path: ./axolotl_dataset_cache
sequence_len: 32768
trust_remote_code: false
gradient_accumulation_steps: 4 # This can run on 4 GPUs
# Very important to enable gradient accumulation with FSDP
# https://github.com/huggingface/transformers/issues/29425
accelerator_config:
gradient_accumulation_kwargs:
sync_each_batch: True
micro_batch_size: 1
num_epochs: 3
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 5.0e-4
# adamw hyperparams
adam_beta1: 0.9
adam_beta2: 0.95
bf16: true
tf32: false
logging_steps: 1
flash_attention: true
loss_watchdog_threshold: 15.0
loss_watchdog_patience: 3
warmup_steps: 128
evals_per_epoch: 0
save_steps: 500
save_strategy: steps
weight_decay: 0.01
sample_packing: true
pad_to_sequence_len: true
shuffle_merged_datasets: true
experimental_skip_move_to_device: true
fsdp_version: 2
fsdp_config:
offload_params: false
auto_wrap_policy: TRANSFORMER_BASED_WRAP
transformer_layer_cls_to_wrap: LlamaDecoderLayer
state_dict_type: FULL_STATE_DICT
reshard_after_forward: true
activation_checkpointing: true
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
# Comment to disable CP
# The number of GPUs to shard the model parameters across (FSDP dimension).
dp_shard_size: 1
# The number of times to replicate the sharded model (DDP dimension).
dp_replicate_size: 1
# Number of GPUs for Tensor Parallelism.
tensor_parallel_size: 1 # (default is 1, no TP)
# Number of GPUs for Context/Sequence Parallelism.
context_parallel_size: 1 # (default is 1, no CP)
special_tokens:
eos_token: <|end_of_text|>
eot_tokens:
- <|im_end|>
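Either Falcon-E config above can be launched with the standard Axolotl CLI once saved locally; the filename is hypothetical:

```bash
# Tokenize and cache the dataset first, then start the FSDP2 run.
axolotl preprocess falcon-e-3b.yaml
axolotl train falcon-e-3b.yaml
```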

View File

@@ -62,7 +62,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch:

View File

@@ -61,7 +61,7 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch:

Some files were not shown because too many files have changed in this diff.