configurable weight scale normalization for MoE expert drift

2026-04-09 15:37:16 +00:00
378 changed files with 2460 additions and 12547 deletions
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -31,10 +31,7 @@ PRs are **greatly welcome**!

 Please run below to setup env
 ```bash
-# Install axolotl + dev and test dependencies from lockfile
-export UV_TORCH_BACKEND=cu128  # or cu130
-uv sync --extra flash-attn --extra deepspeed --group dev --group test
-source .venv/bin/activate
+pip3 install -r requirements-dev.txt -r requirements-tests.txt
 pre-commit install

 # test
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -6,7 +6,7 @@ on:
      types: [opened, synchronize, reopened, ready_for_review]
      paths:
       - '**.py'
-       - 'pyproject.toml'
+       - 'requirements.txt'
       - '.github/workflows/*.yml'
       - "*.[q]md"
       - "examples/**/*.y[a]?ml"
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -3,15 +3,17 @@ name: docker-multigpu-tests-biweekly
 on:
  pull_request:
    paths:
-      - "tests/e2e/multigpu/**.py"
-      - "pyproject.toml"
-      - ".github/workflows/multi-gpu-e2e.yml"
-      - "scripts/cutcrossentropy_install.py"
-      - "src/axolotl/core/trainers/mixins/sequence_parallel.py"
-      - "src/axolotl/utils/distributed.py"
+      - 'tests/e2e/multigpu/**.py'
+      - 'requirements.txt'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - '.github/workflows/multi-gpu-e2e.yml'
+      - 'scripts/cutcrossentropy_install.py'
+      - 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
+      - 'src/axolotl/utils/distributed.py'
  workflow_dispatch:
  schedule:
-    - cron: "0 0 * * 1,4" # Runs at 00:00 UTC every monday & thursday
+    - cron: '0 0 * * 1,4'  # Runs at 00:00 UTC every monday & thursday

 # Cancel jobs on the same ref if a new one is triggered
 concurrency:
@@ -31,19 +33,19 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          #          - cuda: 129
-          #            cuda_version: 12.9.1
-          #            python_version: "3.12"
-          #            pytorch: 2.9.1
-          #            axolotl_extras: "fbgemm-gpu"
-          #            num_gpus: 2
-          #            dockerfile: "Dockerfile-uv.jinja"
+#          - cuda: 129
+#            cuda_version: 12.9.1
+#            python_version: "3.12"
+#            pytorch: 2.9.1
+#            axolotl_extras: "fbgemm-gpu"
+#            num_gpus: 2
+#            dockerfile: "Dockerfile-uv.jinja"
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
-            #            axolotl_extras: fbgemm-gpu
+#            axolotl_extras: fbgemm-gpu
            num_gpus: 2
          - cuda: 128
            cuda_version: 12.8.1
@@ -51,6 +53,7 @@ jobs:
            pytorch: 2.10.0
            axolotl_extras: "fbgemm-gpu"
            num_gpus: 2
+            dockerfile: "Dockerfile-uv.jinja"
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
    steps:
@@ -72,7 +75,7 @@ jobs:
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -8,9 +8,6 @@ on:

 permissions: {}

-env:
-  UV_SYSTEM_PYTHON: "1"
-
 jobs:
  setup_release:
    name: Create Release
@@ -44,15 +41,11 @@ jobs:
        with:
          python-version: "3.11"

-      - name: Install uv
-        uses: astral-sh/setup-uv@v7
-
      - name: Install dependencies
        run: |
-          uv pip install wheel packaging
-          uv pip install --no-build-isolation -e .
-          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
-            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
+          pip3 install wheel packaging==26.0
+          pip3 install --no-build-isolation -e .
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt

      - name: Extract tag name
        id: tag
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -2,18 +2,15 @@ name: Tests Nightly against upstream main
 on:
  workflow_dispatch:
  schedule:
-    - cron: "0 0 * * *" # Runs at 00:00 UTC every day
+    - cron: '0 0 * * *'  # Runs at 00:00 UTC every day
  pull_request:
    types: [opened, synchronize, reopened, ready_for_review]
    paths:
-      - ".github/workflows/tests-nightly.yml"
+      - '.github/workflows/tests-nightly.yml'

 permissions:
  contents: read

-env:
-  UV_SYSTEM_PYTHON: "1"
-
 jobs:
  pre-commit:
    name: pre-commit
@@ -23,7 +20,7 @@ jobs:
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
-          cache: "pip" # caching pip dependencies
+          cache: 'pip' # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
        env:
          SKIP: no-commit-to-branch
@@ -46,7 +43,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python_version: ["3.12"] # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
+        python_version: ["3.12"]  # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
        pytorch_version: ["2.9.1", "2.10.0"]
    timeout-minutes: 20

@@ -64,34 +61,36 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
+          cache: 'pip' # caching pip dependencies

-      - name: Install uv
-        uses: astral-sh/setup-uv@v7
+      - name: upgrade pip
+        run: |
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging==26.0 setuptools==78.1.1 wheel

      - name: Install PyTorch
        run: |
-          uv pip install torch==${{ matrix.pytorch_version }} torchvision
-          uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
+          pip3 install torch==${{ matrix.pytorch_version }} torchvision
+
+      - name: Update requirements.txt
+        run: |
+          sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt
+          sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt
+          sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt
+          sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt
+          sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt

      - name: Install dependencies
        run: |
-          uv pip install --no-build-isolation -e . --override /tmp/torch-pin.txt
-          python scripts/cutcrossentropy_install.py --uv | sh
-          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
-            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
-
-      - name: Override with nightly HF packages
-        run: |
-          uv pip install --no-deps \
-            "transformers @ git+https://github.com/huggingface/transformers.git@main" \
-            "peft @ git+https://github.com/huggingface/peft.git@main" \
-            "accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
-            "trl @ git+https://github.com/huggingface/trl.git@main" \
-            "datasets @ git+https://github.com/huggingface/datasets.git@main"
+          pip3 show torch
+          pip3 install --no-build-isolation -U -e .
+          python scripts/unsloth_install.py | sh
+          python scripts/cutcrossentropy_install.py | sh
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt

      - name: Make sure PyTorch version wasn't clobbered
        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"

      - name: Ensure axolotl CLI was installed
        run: |
@@ -103,6 +102,9 @@ jobs:
          pytest -v --durations=10 tests/patched/
          pytest -v --durations=10 tests/cli/

+      - name: cleanup pip cache
+        run: |
+          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

  docker-e2e-tests:
    if: github.repository_owner == 'axolotl-ai-cloud'
@@ -134,6 +136,7 @@ jobs:
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
+            dockerfile: "Dockerfile-uv.jinja"
            nightly_build: "true"
    steps:
      - name: Checkout
@@ -154,7 +157,7 @@ jobs:
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        env:
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -6,19 +6,21 @@ on:
    branches:
      - "main"
    paths:
-      - "**.py"
-      - "pyproject.toml"
-      - ".github/workflows/*.yml"
-      - "cicd/cicd.sh"
-      - "cicd/Dockerfile-uv.jinja"
+      - '**.py'
+      - 'requirements.txt'
+      - '.github/workflows/*.yml'
+      - 'requirements-tests.txt'
+      - 'cicd/cicd.sh'
+      - 'cicd/Dockerfile.jinja'
  pull_request:
-    types: [opened, synchronize, reopened, ready_for_review]
-    paths:
-      - "**.py"
-      - "pyproject.toml"
-      - ".github/workflows/*.yml"
-      - "cicd/cicd.sh"
-      - "cicd/Dockerfile-uv.jinja"
+      types: [opened, synchronize, reopened, ready_for_review]
+      paths:
+       - '**.py'
+       - 'requirements.txt'
+       - '.github/workflows/*.yml'
+       - 'requirements-tests.txt'
+       - 'cicd/cicd.sh'
+       - 'cicd/Dockerfile.jinja'
  workflow_dispatch:

 # Cancel jobs on the same ref if a new one is triggered
@@ -31,7 +33,6 @@ permissions:

 env:
  TRANSFORMERS_IS_CI: "yes"
-  UV_SYSTEM_PYTHON: "1"

 jobs:
  pre-commit:
@@ -43,7 +44,7 @@ jobs:
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
-          cache: "pip" # caching pip dependencies
+          cache: 'pip' # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
        env:
          SKIP: no-commit-to-branch
@@ -93,25 +94,32 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
+          cache: 'pip' # caching pip dependencies

-      - name: Install uv
-        uses: astral-sh/setup-uv@v7
+      - name: upgrade pip
+        run: |
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging==26.0 setuptools==75.8.0 wheel

      - name: Install PyTorch
        run: |
-          uv pip install torch==${{ matrix.pytorch_version }} torchvision
-          uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
+          pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision

      - name: Install dependencies
        run: |
-          uv pip install --no-build-isolation -e . --override /tmp/torch-pin.txt
-          python scripts/cutcrossentropy_install.py --uv | sh
-          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
-            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
+          pip3 show torch
+          pip3 install --no-cache-dir --no-build-isolation -U -e .
+          python scripts/unsloth_install.py | sh
+          python scripts/cutcrossentropy_install.py | sh
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+
+      - name: cleanup pip cache
+        run: |
+          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

      - name: Make sure PyTorch version wasn't clobbered
        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"

      - name: Ensure axolotl CLI was installed
        run: |
@@ -180,42 +188,38 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
+          cache: 'pip' # caching pip dependencies

-      - name: Install uv
-        uses: astral-sh/setup-uv@v7
+      - name: upgrade pip
+        run: |
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging==26.0 setuptools==75.8.0 setuptools_scm build wheel psutil

      - name: Install PyTorch
        run: |
-          uv pip install torch==${{ matrix.pytorch_version }} torchvision
-          uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
+          pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision

      - name: Install dependencies
        run: |
-          uv pip install packaging setuptools_scm build wheel psutil
+          pip3 show torch
          python -m build --no-isolation --sdist
-          uv pip install --no-build-isolation dist/axolotl*.tar.gz --override /tmp/torch-pin.txt
-          python scripts/cutcrossentropy_install.py --uv | sh
-          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
-            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
+          pip3 install --no-cache-dir --no-build-isolation dist/axolotl*.tar.gz
+          python scripts/unsloth_install.py | sh
+          python scripts/cutcrossentropy_install.py | sh
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+
+      - name: cleanup pip cache
+        run: |
+          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

      - name: Make sure PyTorch version wasn't clobbered
        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"

      - name: Ensure axolotl CLI was installed
        run: |
          axolotl --help

-      - name: Verify agent docs are discoverable
-        run: |
-          # Agent docs live in docs/agents/ (source of truth) and are resolved
-          # at runtime from the repo checkout or via `axolotl fetch docs`
-          axolotl agent-docs --list
-          axolotl agent-docs | grep -q "Fine-tuning framework"
-          axolotl agent-docs grpo | grep -q "GRPO"
-          axolotl agent-docs sft | grep -q "SFT"
-          python -c "from axolotl.cli.agent_docs import get_doc, list_topics; assert len(list_topics()) >= 5; assert 'GRPO' in get_doc('grpo')"
-
      - name: Show HF cache
        run: hf cache ls

@@ -277,6 +281,7 @@ jobs:
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
+            dockerfile: "Dockerfile-uv.jinja"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -297,7 +302,7 @@ jobs:
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
@@ -359,7 +364,7 @@ jobs:
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "GPU_TYPE=${{ matrix.gpu_type || 'L40S'}}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -16,9 +16,6 @@ axolotl inference config.yaml          # Interactive inference
 axolotl merge-lora config.yaml         # Merge LoRA adapter into base model
 axolotl vllm-serve config.yaml         # Start vLLM server for GRPO/EBFT training
 axolotl fetch examples                 # Download example configs
-axolotl agent-docs                     # Show agent-optimized docs (bundled with pip package)
-axolotl agent-docs grpo                # Topic-specific agent reference
-axolotl config-schema                  # Dump config JSON schema
 ```

 ## Training Methods
@@ -26,7 +23,7 @@ axolotl config-schema                  # Dump config JSON schema
 | Method | Config Key | When to Use |
 |--------|-----------|-------------|
 | SFT | *(default)* | Input-output pairs, instruction tuning |
-| DPO/IPO | `rl: dpo` / `rl: dpo, dpo_loss_type: ["ipo"]` | Paired preference data (chosen vs rejected) |
+| DPO/IPO | `rl: dpo` / `rl: ipo` | Paired preference data (chosen vs rejected) |
 | KTO | `rl: kto` | Unpaired binary preference labels |
 | ORPO | `rl: orpo` | Single-stage alignment, no ref model |
 | GRPO | `rl: grpo` | RL with verifiable reward functions (math, code) |
@@ -38,8 +35,6 @@ Agent-specific references:
 - [docs/agents/grpo.md](docs/agents/grpo.md) — GRPO online RL with reward functions
 - [docs/agents/reward_modelling.md](docs/agents/reward_modelling.md) — outcome and process reward models
 - [docs/agents/pretraining.md](docs/agents/pretraining.md) — continual pretraining
-- [docs/agents/model_architectures.md](docs/agents/model_architectures.md) — model-specific quirks (Gemma4, Qwen3.5 MoE, etc.)
-- [docs/agents/new_model_support.md](docs/agents/new_model_support.md) — debugging and adding support for new model architectures

 ## Config Pattern

--- a/ATTN_REFACTOR_REVIEW.md
+++ b/ATTN_REFACTOR_REVIEW.md
@@ -1,142 +0,0 @@
-# `attn-implementation-refactor` branch review
-
-Review target: `attn-implementation-refactor` (5 commits ahead of main, merge base `69904781`).
-Scope: 16 files, +682 / −96.
-
-## 1. What the branch is trying to do
-
-Collapse seven boolean attention flags (`flash_attention`, `sdp_attention`, `xformers_attention`, `flex_attention`, `sage_attention`, `s2_attention`, `eager_attention`) into a single `attn_implementation` field, with derived capability flags (`attn_supports_packing`, `attn_uses_flash_lib`, `attn_needs_dtype_cast`) for the gates that used to be ad-hoc OR-chains.
-
-Mechanism: `normalize_attn_implementation` (a `@model_validator(mode="before")` on `AxolotlInputConfig`) maps bidirectionally between the new field and the legacy flags, with a priority list for legacy combos (`s2 + flash → s2`), and then computes the three capability flags from frozen sets in `enums.py`.
-
-Adjacent changes: `xformers` and `sage` now register as their own entries in `ALL_ATTENTION_FUNCTIONS` (with FA2 mask behavior) instead of stomping the `flash_attention_2` slot. New `fp8` backend wires `torchao.prototype.attention.apply_low_precision_attention` in `apply_post_model_load_patches`.
-
-## 2. Target design
-
-**`cfg.attn_implementation` is the single source of truth on the validated config.**
-
-- Its type is `str | None`. Accepted values are **canonical names only** — no short-form aliases:
-  - HF-native: `eager`, `sdpa`, `flash_attention_2`, `flash_attention_3`, `flex_attention`. (`flash_attention_3` is net-new to axolotl — the current branch only encodes `flash_attention_2` under the short name `flash`.)
-  - Axolotl-owned (registered into `ALL_ATTENTION_FUNCTIONS` under exactly these names): `xformers`, `sage`, `s2`, `fp8`.
-  - Hub-kernel paths: `kernels-community/sage-attention`, `kernels-community/flash-attn3`, etc. — passthrough. Known-kernel allowlist in `enums.py` classifies the common ones into the capability tables.
-  Short forms like `flash`, `fa2`, `fa3`, `sdp`, `flex` are rejected (Pydantic validation error with a pointer to the canonical name).
-- `model.py:_set_attention_config` passes `cfg.attn_implementation` to HF verbatim — no `_ATTN_IMPL_TO_HF` translation dict needed.
-- Legacy booleans (`flash_attention: true`, `sdp_attention: true`, …) are the **only** input aliases, kept for backwards compatibility. The normalizer maps them to the canonical `attn_implementation` value, emits a one-time `DeprecationWarning` per flag, and removes them from `data` so they're never readable on the validated `cfg`. `deprecated=True` on each Field surfaces this in JSON schema. Mapping is 1:1 with the current legacy-flag semantics (`flash_attention → flash_attention_2`, `sdp_attention → sdpa`, `flex_attention → flex_attention`, `xformers_attention → xformers`, `sage_attention → sage`, `s2_attention → s2`, `eager_attention → eager`).
-- Capability flags (`attn_supports_packing`, `attn_uses_flash_lib`, `attn_needs_dtype_cast`) are **`@computed_field` on the model**, not settable inputs. Lookup is keyed by the canonical `attn_implementation` string.
-- Unknown / user-supplied strings (custom hub kernels) pass through to HF but get **conservative capability defaults** (packing=False, flash-lib=False, dtype-cast=True). Known hub kernels axolotl can classify live in a small allowlist.
-- Downstream consumers read *only* `cfg.attn_implementation` and the capability flags. No `cfg.flash_attention`, `cfg.xformers_attention`, etc. anywhere in `src/`.
-
-This is strictly what the branch is already *trying* to do — the gaps below are places it hasn't landed that goal yet.
-
-## 3. Gaps and holes
-
-### A. Legacy flags are still parallel state, not input-only
-
-1. The normalizer *sets* the legacy flags on `data` (`impl_to_flag[attn_impl]` branch). It does not delete them. So `cfg.flash_attention` is still truthy after validation, and downstream code still reads it (see G).
-2. Short-form enum values (`flash`, `sdpa`, `fp8`) are persisted as-is on `cfg.attn_implementation`, which is why `model.py` needs `_ATTN_IMPL_TO_HF` to translate before passing to HF. Source-of-truth implies canonicalize at normalize-time, not translate at consume-time.
-3. Legacy flag + `attn_implementation` (consistent combo, e.g. `attn_implementation: flash + flash_attention: true`) emits no deprecation warning — only legacy-only path warns.
-4. Legacy Field descriptions (`xformers_attention`, `sdp_attention`, etc.) don't have `deprecated=True`, so JSON schema still advertises them as first-class.
-
-### B. Validators that still only check the legacy flag
-
-5. **`check_ebft_activation_offloading`** (`validation.py:1607-1619`) reads only `data.get("flex_attention")`. Users on `attn_implementation: flex_attention` bypass the incompatibility check.
-6. **`check_sample_packing_without_attention`** (`validation.py:188-203`) early-returns when `attn_implementation` is set but never validates the chosen backend actually supports packing. `attn_implementation: eager + sample_packing: true` silently passes; the old legacy-flag check warned.
-
-### C. Non-enum strings fall through the capability tables
-
-7. **HF-native `"flash_attention_2"`** is neither in `impl_to_flag` nor `FLASH_ATTN_LIB_IMPLS`. A user copy-pasting from HF docs gets `attn_uses_flash_lib=False`, silently disabling FA4 auto-apply, LLaMA flash hijack, `_patch_attention` (btlm, stablelm_epoch, mistral3, llava), and `_apply_flash_attention_peft_patches`.
-8. **Hub kernel strings** (`kernels-community/flash-attn3`, `kernels-community/sage-attention`) default to `attn_supports_packing=True` (silently enters multipack with varlen `position_ids` — correctness depends on the kernel honoring them) and `attn_uses_flash_lib=False` (so `context_parallel_size > 1` raises "requires flash attention" even for FA3 hub kernels).
-9. **Conflict trap for hub-kernel + legacy flag** (`config.py:1414-1419`): `attn_implementation: kernels-community/flash-attn3 + flash_attention: true` always raises, because `impl_to_flag.get(custom) is None` and the loop treats `flag != None` as conflict. Common combo in existing YAMLs breaks hard on upgrade.
-
-### D. Silent behaviour change for xformers
-
-10. Old `_apply_flash_attention_patches` did `self.cfg.flash_attention = True` for `xformers + sample_packing`. The new version doesn't, and xformers is not in `FLASH_ATTN_LIB_IMPLS`. Consumers that keyed off `cfg.flash_attention` now see falsy for xformers, silently dropping `_patch_attention` (btlm / stablelm_epoch+packing / mistral3 / llava model-type FA patches). Some of this is arguably correct cleanup (xformers has its own HF registry entry now), but the btlm/stablelm/mistral3 regression is not called out and not tested. Decide consciously, not by omission.
-
-### E. Capability flags are writable Pydantic fields, not computed
-
-11. `attn_supports_packing`, `attn_uses_flash_lib`, `attn_needs_dtype_cast` are declared `bool | None = Field(default=None)` on `AxolotlInputConfig`. YAML is not rejected — a user can set `attn_uses_flash_lib: true` and override the normalizer.
-
-### F. Validator ordering (not covered by tests)
-
-12. `AttentionValidationMixin.check_attention_fields` (inherited, `mode="before"`) and `normalize_attn_implementation` (subclass, `mode="before"`) both run during `model_validator` phase. Pydantic MRO may run the inherited one first. For legacy-only `s2_attention: true + flash_attention: true` (the test `test_s2_plus_flash_maps_to_s2` asserts this maps to `s2`), the inherited check may raise "multiple attention implementations set" before the normalizer runs. The test calls the classmethod directly and does not build the model, so this is unverified either way.
-
-### G. Remaining legacy reads in `src/`
-
-13. `src/axolotl/integrations/lm_eval/cli.py:120` reads `cfg.flash_attention`. Works for `attn_implementation=flash` only.
-14. `tests/e2e/multigpu/test_llama.py:524-526` writes `cfg.flash_attention = True` / `cfg.flex_attention = True`. Stale pattern.
-15. Dual-check idioms in `config.py` (lines 1464, 1478, 1570, 1586, 1774) and `validation.py` (198, 209, 221, 850, 1586, 1611) — `data.get("x_attention") or data.get("attn_implementation") == "x"` — are redundant once legacy flags are input-only; remove them.
-
-### H. fp8 operational risk
-
-16. The `fp8` docstring documents hard requirements (PyTorch ≥ 2.11, SM90+, flash-attn with FA3, torchao ≥ 0.17.0) and a runtime constraint (`config.use_cache = False`). None are validated — misconfig surfaces as a torchao runtime error. `xformers` and `sage` availability/compute-capability guards exist; `fp8` should match.
-
-### I. Test coverage gaps
-
-17. `test_attn_implementation.py` exercises the classmethod in isolation plus the constant sets. It does **not**:
-   - Build a full `AxolotlInputConfig(**data)` (so validator ordering — item 12 — is untested).
-   - Verify capability flags can't be overridden from YAML (item 11).
-   - Cover `check_sample_packing_without_attention` with `attn_implementation: eager` (item 6).
-   - Cover `check_ebft_activation_offloading` with `attn_implementation: flex_attention` (item 5).
-   - Cover hub-kernel + legacy flag combo (item 9).
-   - Cover `flash_attention_2` canonicalization (item 7).
-
-## 4. Fix plan
-
-Four phases, each commit-sized. Phases 1–2 are local and low-risk; phase 3 is the behaviour-changing cleanup; phase 4 is tests + docs.
-
-### Phase 1 — Lock down the data model
-
-1. Drop the `AttnImplementation` enum. `attn_implementation` becomes `str | None`, validated against a canonical allowlist (`eager`, `sdpa`, `flash_attention_2`, `flash_attention_3`, `flex_attention`, `xformers`, `sage`, `s2`, `fp8`) **or** a hub-kernel path (`startswith("kernels-")` / contains `/`). Reject short-form strings like `flash` / `fa2` / `sdp` / `flex` with an explicit error pointing at the canonical name.
-2. Rewrite `normalize_attn_implementation` so its only job is mapping **legacy booleans → canonical `attn_implementation`** (for BC). Mapping is fixed:
-   - `flash_attention → flash_attention_2`
-   - `sdp_attention → sdpa`
-   - `flex_attention → flex_attention`
-   - `xformers_attention → xformers`
-   - `sage_attention → sage`
-   - `s2_attention → s2`
-   - `eager_attention → eager`
-   Priority for legacy combos stays as in the current branch (`s2 > sage > xformers > flex > flash > sdp > eager`). Emit a one-time `DeprecationWarning` per unique legacy flag seen. After mapping, delete the legacy flag keys from `data` so they never appear on validated `cfg`. If both a canonical `attn_implementation` and any legacy flag are set, raise (no silent precedence).
-
-   **Merge `AttentionValidationMixin.check_attention_fields` into this normalizer and delete the mixin method.** Pydantic v2 runs inherited `mode="before"` validators before subclass ones per MRO, so leaving them as siblings causes the inherited check to reject legacy combos like `s2 + flash` before the normalizer can map them. One validator, one source of conflict detection.
-
-   **Fix the gemma4-hybrid path**: change `data["attn_implementation"] = "flash"` to `data["attn_implementation"] = "flash_attention_2"` (the short name no longer validates after step 1).
-3. Convert `attn_supports_packing`, `attn_uses_flash_lib`, `attn_needs_dtype_cast` to `@computed_field`. The three capability tables move to `enums.py` as module constants keyed by the canonical `attn_implementation` string (including `flash_attention_3` — missing from the current branch — and known hub kernels):
-   - Packing-capable: `{flash_attention_2, flash_attention_3, flex_attention, xformers, sage, kernels-community/flash-attn3, kernels-community/sage-attention}`.
-   - Flash-lib (axolotl's monkeypatch targets): `{flash_attention_2, flash_attention_3, s2, kernels-community/flash-attn3}`.
-   - No-dtype-cast: `{eager, sdpa}`.
-   Unknown strings: conservative defaults (`packing=False, flash_lib=False, dtype_cast=True`).
-4. Delete `_ATTN_IMPL_TO_HF` from `model.py` and pass `cfg.attn_implementation` straight through. The gemma4-hybrid branch continues to override to `flash_attention_2` before passing to HF.
-5. `deprecated=True` on each legacy boolean Field so JSON schema + Pydantic surface the deprecation.
-
-### Phase 2 — Fix the validators
-
-6. `check_sample_packing_without_attention`: drop the early-return and gate on `attn_supports_packing`. Warn (or raise — pick one and be consistent) if packing is enabled with a non-packing backend.
-7. `check_ebft_activation_offloading`: replace `data.get("flex_attention")` with `attn_implementation == "flex_attention"`.
-8. Sweep items (item 15): remove every `data.get("x_attention") or data.get("attn_implementation") == "x"` dual-check — after phase 1 the legacy side is always `None`. Reduces ~10 lines of noise and eliminates the "which side wins" class of bugs.
-9. fp8 preflight (item 16): require `env_capabilities.compute_capability ≥ sm_90`, `torch_version ≥ 2.11`, and `torchao_version ≥ 0.17`. Warn if `use_cache` isn't explicitly `False`.
-
-### Phase 3 — Migrate remaining consumers
-
-10. `lm_eval/cli.py:120` → `flash_attention=cfg.attn_uses_flash_lib`.
-11. `lm_eval/__init__.py:26` currently reads `(cfg.attn_implementation == "flash")` — after canonicalization `"flash"` is never stored, so this evaluates `False` for every backend. Change to `cfg.attn_uses_flash_lib`.
-12. `validation.py:1137-1142` (NPU check) currently iterates `["flash_attention", "sdp_attention", "s2_attention"]` as string keys. Replace with `cfg.attn_implementation in {"flash_attention_2", "flash_attention_3", "sdpa", "s2"}` or the equivalent canonical-string set.
-13. `tests/e2e/multigpu/test_llama.py:524-526` → `cfg.attn_implementation = "flash_attention_2"` / `"flex_attention"`.
-14. **Xformers decision** (item 10): the old `cfg.flash_attention = True` side-effect activated `_patch_attention` for btlm/stablelm_epoch+packing/mistral3/llava. Two choices:
-    - Add `xformers` to the set that gates `_patch_attention` (restore old behaviour, keeps patches live).
-    - Document that those patches don't apply to xformers post-refactor and drop the paths if they're dead.
-    Pick one explicitly and leave a commit note. Do not leave it as silent breakage.
-15. Add a repo-level check (`tests/test_no_legacy_attn_reads.py` or a ruff/grep pre-commit) that fails if anything outside `config.py`'s normalizer reads `cfg.flash_attention` / `cfg.sdp_attention` / etc. Keeps the invariant from rotting.
-
-### Phase 4 — Tests + docs
-
-16. Rewrite `test_attn_implementation.py` to build full `AxolotlInputConfig(**data)`, not just the classmethod. Covers validator ordering and the Pydantic-field-override issue.
-17. Add one test per gap closed above: `attn_implementation: eager + sample_packing`; `attn_implementation: flex_attention + activation_offloading`; short-form `flash` rejected; `flash_attention_2` passthrough; `kernels-community/flash-attn3` capability lookup; `attn_uses_flash_lib: true` in YAML rejected; legacy boolean emits `DeprecationWarning` and is absent from validated `cfg`; fp8 preflight failures.
-18. Update `docs/attention.qmd` for the single `attn_implementation` field + the deprecation table for legacy flags. One-paragraph migration note in the changelog.
-19. `examples/` contains ~170 YAML files using legacy flags (`flash_attention: true` etc.). They still validate post-refactor (normalizer maps them with deprecation), but a follow-up sweep to convert them to `attn_implementation: flash_attention_2` is worth scheduling — call this out in the migration note so users know examples will be migrated on a later pass.
-
-## 5. Ordering & risk
-
-- Phase 1 is the keystone: it's the largest diff but each step is mechanical once the alias map is in place. No behaviour change for any consumer that was using `attn_implementation` correctly; behaviour change only for consumers that were reading legacy flags (phase 3 step 14 is the explicit decision point).
-- Phase 2 is independent of phase 1 and can land first as a quick safety net.
-- Phase 3 step 14 is the only judgment call — flag for review before choosing.
-- Total: ~10-13 commits beyond what's on the branch, each scoped and individually revertable.
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,7 +1,6 @@
+include requirements.txt
 include README.md
 include LICENSE
-include VERSION
+include src/setuptools_axolotl_dynamic_dependencies.py
 include src/axolotl/utils/chat_templates/templates/*.jinja
-include AGENTS.md
-recursive-include docs/agents *.md
 recursive-include axolotl *.py
--- a/README.md
+++ b/README.md
@@ -86,7 +86,7 @@ Features:
 **Requirements**:

 - NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
-- Python >=3.11 (3.12 recommended)
+- Python 3.11
 - PyTorch ≥2.9.1

 ### Google Colab
@@ -95,19 +95,11 @@ Features:

 ### Installation

+#### Using pip
+
 ```bash
-# install uv if you don't already have it installed (restart shell after)
-curl -LsSf https://astral.sh/uv/install.sh | sh
-
-# change depending on system
-export UV_TORCH_BACKEND=cu128
-
-# create a new virtual environment
-uv venv --python 3.12
-source .venv/bin/activate
-
-uv pip install torch==2.10.0 torchvision
-uv pip install --no-build-isolation axolotl[deepspeed]
+pip3 install -U packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]

 # Download example axolotl configs, deepspeed configs
 axolotl fetch examples
@@ -118,7 +110,7 @@ axolotl fetch deepspeed_configs  # OPTIONAL

 Installing with Docker can be less error prone than installing in your own environment.
 ```bash
-docker run --gpus '"all"' --ipc=host --rm -it axolotlai/axolotl:main-latest
+docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
 ```

 Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).
@@ -165,29 +157,6 @@ That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/ge
 - [API Reference](https://docs.axolotl.ai/docs/api/) - Auto-generated code documentation
 - [FAQ](https://docs.axolotl.ai/docs/faq.html) - Frequently asked questions

-## AI Agent Support
-
-Axolotl ships with built-in documentation optimized for AI coding agents (Claude Code, Cursor, Copilot, etc.). These docs are bundled with the pip package — no repo clone needed.
-
-```bash
-# Show overview and available training methods
-axolotl agent-docs
-
-# Topic-specific references
-axolotl agent-docs sft                 # supervised fine-tuning
-axolotl agent-docs grpo                # GRPO online RL
-axolotl agent-docs preference_tuning   # DPO, KTO, ORPO, SimPO
-axolotl agent-docs reward_modelling    # outcome and process reward models
-axolotl agent-docs pretraining         # continual pretraining
-axolotl agent-docs --list              # list all topics
-
-# Dump config schema for programmatic use
-axolotl config-schema
-axolotl config-schema --field adapter
-```
-
-If you're working with the source repo, agent docs are also available at `docs/agents/` and the project overview is in `AGENTS.md`.
-
 ## 🤝 Getting Help

 - Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -134,6 +134,7 @@ quartodoc:
        - monkeypatch.stablelm_attn_hijack_flash
        - monkeypatch.trainer_fsdp_optim
        - monkeypatch.transformers_fa_utils
+        - monkeypatch.unsloth_
        - monkeypatch.data.batch_dataset_fetcher
        - monkeypatch.mixtral
        - monkeypatch.gradient_checkpointing.offload_cpu
@@ -326,6 +327,7 @@ website:
        - section: "Advanced Features"
          contents:
            - docs/fsdp_qlora.qmd
+            - docs/unsloth.qmd
            - docs/torchao.qmd
            - docs/custom_integrations.qmd
            - docs/sequence_parallelism.qmd
--- a/cicd/Dockerfile-uv.jinja
+++ b/cicd/Dockerfile-uv.jinja
@@ -22,6 +22,15 @@ WORKDIR /workspace/axolotl
 RUN git fetch origin +$GITHUB_REF && \
    git checkout FETCH_HEAD

+# For nightly builds, point the HF packages in requirements.txt at their main branches
+RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
+        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
+        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
+        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
+        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
+        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
+    fi
+
 RUN uv pip install packaging==26.0 setuptools==78.1.1
 RUN uv pip install torchvision
 RUN uv pip uninstall causal_conv1d
@@ -31,21 +40,11 @@ RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
    fi

-# Override with nightly HF packages for nightly builds
-RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
-        uv pip install --no-deps \
-            "transformers @ git+https://github.com/huggingface/transformers.git@main" \
-            "peft @ git+https://github.com/huggingface/peft.git@main" \
-            "accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
-            "trl @ git+https://github.com/huggingface/trl.git@main" \
-            "datasets @ git+https://github.com/huggingface/datasets.git@main"; \
-    fi
-
+RUN python scripts/unsloth_install.py --uv | sh
 RUN python scripts/cutcrossentropy_install.py --uv | sh

 # So we can test the Docker image
-RUN uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
-    codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
+RUN uv pip install -r requirements-dev.txt -r requirements-tests.txt

 # fix so that git fetch/pull from remote works
 RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -0,0 +1,54 @@
+FROM axolotlai/axolotl-base:{{ BASE_TAG }}
+
+ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
+ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
+ENV CUDA="{{ CUDA }}"
+ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
+ENV GITHUB_REF="{{ GITHUB_REF }}"
+ENV GITHUB_SHA="{{ GITHUB_SHA }}"
+ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
+ENV HF_HOME="{{ HF_HOME }}"
+ENV AXOLOTL_DATASET_NUM_PROC="8"
+
+RUN apt-get update && \
+    apt-get install -y --allow-change-held-packages vim curl nano zstd libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm
+
+WORKDIR /workspace
+
+RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
+
+WORKDIR /workspace/axolotl
+
+RUN git fetch origin +$GITHUB_REF && \
+    git checkout FETCH_HEAD
+
+# For nightly builds, point the HF packages in requirements.txt at their main branches
+RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
+        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
+        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
+        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
+        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
+        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
+    fi
+
+RUN pip install packaging==26.0 setuptools==78.1.1 psutil
+RUN pip uninstall -y causal_conv1d
+RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+    else \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
+    fi
+
+RUN python scripts/unsloth_install.py | sh
+RUN python scripts/cutcrossentropy_install.py | sh
+
+# So we can test the Docker image
+RUN pip install -r requirements-dev.txt -r requirements-tests.txt
+
+# fix so that git fetch/pull from remote works
+RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
+    git config --get remote.origin.fetch
+
+# helper for huggingface-login cli
+RUN git config --global credential.helper store
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 set -e

-python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__, f'Expected torch $PYTORCH_VERSION but got {torch.__version__}'"
+python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"

 set -o pipefail
 for i in 1 2 3; do
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -17,7 +17,7 @@ template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
 template_env = jinja2.Environment(
    loader=template_loader, autoescape=select_autoescape()
 )
-dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile-uv.jinja")
+dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
 df_template = template_env.get_template(dockerfile)

 df_args = {
--- a/cicd/single_gpu.py
+++ b/cicd/single_gpu.py
@@ -16,7 +16,7 @@ template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
 template_env = jinja2.Environment(
    loader=template_loader, autoescape=select_autoescape()
 )
-dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile-uv.jinja")
+dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
 df_template = template_env.get_template(dockerfile)

 df_args = {
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -32,7 +32,8 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \
        pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
        pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
    fi && \
+    python scripts/unsloth_install.py | sh && \
    python scripts/cutcrossentropy_install.py | sh && \
    pip install pytest && \
    pip cache purge
--- a/docker/Dockerfile-uv
+++ b/docker/Dockerfile-uv
@@ -33,6 +33,7 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \
    else \
        uv pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
    fi && \
+    python scripts/unsloth_install.py --uv | sh && \
    python scripts/cutcrossentropy_install.py --uv | sh && \
    uv pip install pytest && \
    uv cache clean
--- a/docs/agents/model_architectures.md
+++ b/docs/agents/model_architectures.md
@@ -1,198 +0,0 @@
-# Model Architectures — Agent Reference
-
-Model-specific quirks, required settings, and known issues. Check this before debugging training failures on specific model families.
-
-## VLM (Vision Language Model) Quick Start
-
-All VLM configs require these four lines:
-```yaml
-processor_type: AutoProcessor
-skip_prepare_dataset: true
-remove_unused_columns: false
-sample_packing: false
-```
-
-Decision tree for VLM config:
-```text
-Is the model multimodal (has vision/audio encoder)?
-  ├─ YES: Add `freeze_mm_modules: true` if training text only
-  │       Add `chat_template: <model_template>` (e.g. gemma4, qwen3_5, gemma3)
-  │       LoRA: use regex `lora_target_modules` to restrict to language model
-  └─ NO: Train as a regular text model
-
-Is the model MoE (e.g. Gemma4 26B-A4B, Qwen3.5 35B-A3B)?
-  ├─ YES: Add `lora_target_parameters` for expert LoRA
-  │       Consider ScatterMoE kernels (see Plugins section)
-  └─ NO: Standard LoRA config
-```
-
-## Plugins & Optimizations
-
-### Cut Cross Entropy (CCE)
-
-Computes loss from hidden states + lm_head weight without materializing the full logits tensor, saving significant VRAM. Install if not already present:
-
-```bash
-uv pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@main"
-```
-
-```yaml
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-```
-
-### ScatterMoE Kernels
-
-Fuses expert + LoRA computation into a single kernel for MoE models. Significant speedup for models with many experts.
-
-```yaml
-plugins:
-  - axolotl.integrations.kernels.KernelsPlugin
-use_kernels: true
-use_scattermoe: true
-experts_implementation: scattermoe
-
-# Expert LoRA targets (3D parameter tensors, not nn.Linear):
-lora_target_parameters:
-  - experts.gate_up_proj
-  - experts.down_proj
-```
-
-Supported: Gemma4 (`gemma4_text`), Mixtral, Qwen MoE variants. The plugin auto-detects model type and routing function. Without ScatterMoE, expert LoRA still works but runs base expert matmul and LoRA as separate operations.
-
-## Gemma 4
-
-**Models**: `google/gemma-4-26B-A4B` (MoE), `google/gemma-4-31B` (dense), `google/gemma-4-E2B`, `google/gemma-4-E4B`
-
-**Architecture**: Multimodal wrapper (`Gemma4ForConditionalGeneration`) over a text backbone (`Gemma4TextModel`), with optional vision/audio encoders. All Gemma4 HF repos have `model_type: "gemma4"` — even text-only variants load as multimodal with a vision tower.
-
-### Required settings
-
-```yaml
-# Always needed for Gemma4:
-freeze_mm_modules: true          # Freeze vision/audio encoders for text-only training
-gradient_checkpointing_kwargs:
-  use_reentrant: false           # Shared per-layer norms cause "marked ready twice" with reentrant
-
-# LoRA target — restrict to language model only (DO NOT use lora_target_linear: true):
-lora_target_modules: 'model.language_model.layers.[\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'
-```
-
-### Auto-detection
-
-Axolotl auto-detects Gemma4 and applies:
-- `use_reentrant: false` for gradient checkpointing
-- `ddp_find_unused_parameters: true` for DDP (skipped when `activation_offloading: true`)
-
-### Multi-GPU
-
-| Strategy | Works? | Notes |
-|----------|--------|-------|
-| DDP | Yes | Auto-sets `ddp_find_unused_parameters=True` |
-| DDP + activation_offloading | Yes | `find_unused_parameters` is skipped (conflicts with checkpoint wrappers) |
-| FSDP1 | No | OOM during dequantization/sharding with QLoRA |
-| FSDP2 | Yes | Use `Gemma4TextDecoderLayer` (not `Gemma4DecoderLayer`) as wrap class |
-| FSDP2 + activation_offloading | Yes | Lowest VRAM (~26 GiB/GPU for 26B-A4B) |
-
-FSDP2 config:
-```yaml
-fsdp:
-  - full_shard
-  - auto_wrap
-fsdp_config:
-  fsdp_version: 2
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_transformer_layer_cls_to_wrap: Gemma4TextDecoderLayer
-```
-
-### MoE (26B-A4B)
-
-- `enable_moe_block: true`, 256 experts, top-k routing
-- No separate `SparseMoeBlock` — MoE is embedded in each decoder layer
-- Expert LoRA targets 3D parameter tensors:
-  ```yaml
-  lora_target_parameters:
-    - experts.gate_up_proj
-    - experts.down_proj
-  ```
-- ScatterMoE kernel acceleration:
-  ```yaml
-  plugins:
-    - axolotl.integrations.kernels.KernelsPlugin
-  use_kernels: true
-  use_scattermoe: true
-  experts_implementation: scattermoe
-  ```
-
-### VLM (Vision) Training
-
-All Gemma4 models load as `Gemma4ForConditionalGeneration` with a vision tower. No custom `ProcessingStrategy` needed — the base class auto-detects the image token.
-
-```yaml
-base_model: google/gemma-4-E2B-it   # or E4B-it, 26B-A4B
-processor_type: AutoProcessor
-freeze_mm_modules: true
-chat_template: gemma4
-
-skip_prepare_dataset: true
-remove_unused_columns: false
-sample_packing: false
-```
-
-A starting VLM loss of ~8-15 is typical. In most runs, loss converges below 1.0 within ~30-50 steps, though results may vary across configurations.
-
-For the 26B-A4B MoE variant with ScatterMoE + expert LoRA + CCE, add:
-```yaml
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-  - axolotl.integrations.kernels.KernelsPlugin
-use_kernels: true
-use_scattermoe: true
-experts_implementation: scattermoe
-lora_target_parameters:
-  - experts.gate_up_proj
-  - experts.down_proj
-```
-
-### Common issues
-
-| Symptom | Cause | Fix |
-|---------|-------|-----|
-| `mm_token_type_ids is required` in DDP | `model.config` not accessible through DDP wrapper | Already fixed — `unwrap_model()` in `compute_loss` and `prediction_step` |
-| `marked a variable ready twice` in DDP | `ddp_find_unused_parameters=True` + activation_offloading checkpoint wrappers | Auto-handled — `find_unused_parameters` is skipped when `activation_offloading: true` |
-| Loss ~12 instead of ~0.5 | Using `lora_target_linear: true` (applies LoRA to vision/audio modules) | Use the regex `lora_target_modules` pattern instead |
-| FSDP2 `Could not find Gemma4AudioLayer` | Auto-wrap detects `_no_split_modules` including audio layers that don't exist | Explicitly set `fsdp_transformer_layer_cls_to_wrap: Gemma4TextDecoderLayer` |
-| `Gemma4ClippableLinear not supported` by PEFT | Vision tower uses a non-standard linear wrapper | Axolotl patches this automatically via `_patch_peft_clippable_linear()` |
-
-### E2B/E4B dense models
-
-These have `hidden_size_per_layer_input: 256` (per-layer input embeddings) and `attention_k_eq_v: False`. Known issue: loss starts higher than expected (~12 vs ~0.5 for 26B). Root cause under investigation — may be related to the per-layer input mechanism or the `Gemma4ForConditionalGeneration` loss computation.
-
-## Gemma 3
-
-**Models**: `google/gemma-3-*`
-
-- `ddp_find_unused_parameters: true` needed (multimodal unused params)
-- `use_reentrant: false` recommended
-- Attention mask must be dropped for sample packing (handled automatically)
-- Multi-GPU test currently skipped (`tests/e2e/multigpu/test_gemma3.py`)
-
-## Qwen 3.5 MoE
-
-**Models**: `Qwen/Qwen3.5-35B-A3B`
-
-- Hybrid architecture: DeltaNet linear attention (30 layers) + full attention (10 layers)
-- 256 experts, 8 active per token
-- Known weight scale drift in late DeltaNet layers (36-38) due to AdamW + rare expert interaction
-- Fix: `normalize_weight_scales` config to detect and rescale outliers:
-  ```yaml
-  normalize_weight_scales:
-    - name_pattern: 'linear_attn\.conv1d\.weight'
-      threshold: 1.3
-  ```
-
-## General MoE Notes
-
-- `lora_target_linear: true` with multimodal MoE models will apply LoRA to ALL linear modules including vision/audio encoders — use regex `lora_target_modules` to restrict to language model only
-- Rare experts get larger effective learning rate from AdamW (small second-moment estimates) — can cause weight drift in recurrent/SSM components. Use `normalize_weight_scales` with `dry_run: true` to detect.
-- For ScatterMoE kernel support, set `experts_implementation: scattermoe` and add the KernelsPlugin
--- a/docs/agents/new_model_support.md
+++ b/docs/agents/new_model_support.md
@@ -1,181 +0,0 @@
-# New Model Support — Agent Reference
-
-Guide for debugging and adding support for new model architectures in axolotl. Based on lessons learned from Gemma4, Gemma3, Qwen2-VL, and other multimodal/MoE models.
-
-## Quick Validation Checklist
-
-When testing a new model, run through these checks in order:
-
-1. **Does the model load?** `axolotl preprocess config.yaml` — catches config schema errors
-2. **Does LoRA apply?** Check for "Unsupported layer type" warnings from PEFT
-3. **Is the initial loss sane?** First-step loss for a pretrained model should be 0.5–2.0 for SFT
-4. **Does sample packing work?** Compare loss with `sample_packing: true` vs `false` — should be similar
-5. **Is CCE active?** Check for "Applying Cut Cross Entropy" log and verify peak VRAM is lower
-
-## Loss Debugging
-
-### Expected initial loss
-A pretrained model doing SFT should start with loss roughly in the 0.5–2.0 range. If loss starts above 3.0, something is wrong. If it's near `log(vocab_size)` (≈ 12 for 262K vocab), the model is predicting at random — attention masking or model weights are broken.
-
-### Direct comparison technique
-The fastest way to isolate a loss issue — bypass the trainer entirely:
-
-```python
-# Load model via axolotl's pipeline (applies all patches)
-from axolotl.cli.config import load_cfg
-from axolotl.utils.config import normalize_config, prepare_plugins
-from axolotl.loaders.tokenizer import load_tokenizer
-from axolotl.loaders.model import ModelLoader
-
-cfg = load_cfg("your_config.yaml")
-normalize_config(cfg)
-prepare_plugins(cfg)
-tokenizer = load_tokenizer(cfg)
-model, _ = ModelLoader(cfg, tokenizer).load()
-
-# Forward pass on preprocessed data
-model.train()
-out = model(input_ids, labels=labels)
-print(f"Direct loss: {out.loss.item()}")  # Compare to trainer's reported loss
-```
-
-If direct loss is correct (~1.0) but trainer reports 3–4x higher, check `model_accepts_loss_kwargs` (see below).
-
-### `model_accepts_loss_kwargs` inflation
-HF Trainer checks if the model's `forward()` has `**kwargs` and sets `model_accepts_loss_kwargs=True`. This changes loss normalization: the trainer does NOT divide loss by `gradient_accumulation_steps` before logging. The gradient is correct — only the logged loss is inflated.
-
-**Symptom**: Logged loss ≈ actual_loss × gradient_accumulation_steps.
-
-**Which models are affected**: Any model with `**kwargs` in forward (common in multimodal models for extra inputs like `mm_token_type_ids`, `pixel_values`, etc.).
-
-**Fix location**: `src/axolotl/core/trainers/base.py` `__init__()` — after `super().__init__()`, check if the unwrapped model actually has `num_items_in_batch` in its forward signature. If not, set `self.model_accepts_loss_kwargs = False`.
-
-## Multimodal Models (ForConditionalGeneration)
-
-Many recent models use `ForConditionalGeneration` as the top-level class, not `ForCausalLM`:
-- Gemma3 → `Gemma3ForConditionalGeneration`
-- Gemma4 → `Gemma4ForConditionalGeneration`
-- Qwen2-VL → `Qwen2VLForConditionalGeneration`
-- LLaVA → `LlavaForConditionalGeneration`
-
-### Why this matters
-
-| Component | Targets `ForCausalLM` | Needs `ForConditionalGeneration` |
-|-----------|----------------------|--------------------------------|
-| CCE patches | ✅ (default) | ❌ silently inactive if not patched |
-| PEFT LoRA | ✅ | May fail on custom layer types |
-| HF Trainer label handling | ✅ | May need extra inputs |
-
-### Required extra inputs
-Multimodal models require special inputs during training even for text-only data:
-
-| Model | Required Input | Value for Text-Only |
-|-------|---------------|-------------------|
-| Gemma4 | `mm_token_type_ids` | `torch.zeros_like(input_ids)` |
-| Gemma3 | `token_type_ids` | `torch.zeros_like(input_ids)` |
-
-Auto-inject in `compute_loss()` when not provided by the data collator. See `core/trainers/base.py`.
-
-### Custom layer types and PEFT
-Vision towers often use custom module wrappers that PEFT doesn't support:
-
-| Model | Custom Layer | Wraps | Fix |
-|-------|-------------|-------|-----|
-| Gemma4 | `Gemma4ClippableLinear` | `nn.Linear` | Redirect to `.linear` child |
-
-Fix location: `src/axolotl/loaders/adapter.py` `_patch_peft_clippable_linear()`.
-
-## Sample Packing
-
-### How packed sequence detection works (transformers ≥ 5.x)
-`transformers.masking_utils._preprocess_mask_arguments()` detects packed sequences from `position_ids` resets. But **only when `attention_mask is None`**:
-
-```python
-# From masking_utils.py:
-if position_ids is not None and attention_mask is None and past_key_values is None:
-    packed_sequence_mask = find_packed_sequence_indices(position_ids)
-```
-
-If the collator provides an all-ones `attention_mask`, packing detection is **skipped** and the model builds a single causal mask spanning all packed sequences → cross-sequence attention leakage → very high loss.
-
-### Fix for models using `create_causal_mask_mapping`
-For Gemma3, Gemma4, and similar models that use the new transformers masking system, remove `attention_mask` from inputs when sample packing is active:
-
-```python
-# In compute_loss():
-if (
-    self.args.sample_packing
-    and model_type in ("gemma4", "gemma3")
-    and "attention_mask" in inputs
-    and "position_ids" in inputs
-):
-    del inputs["attention_mask"]
-```
-
-Fix location: `src/axolotl/core/trainers/base.py` `compute_loss()`.
-
-### Models that DON'T need this fix
-Older models that use `_prepare_4d_causal_attention_mask` (Llama, Mistral, Qwen2, etc.) handle sample packing via axolotl's multipack attention monkeypatch instead. Only models using the new `create_causal_mask_mapping` / `create_causal_mask` masking system need the `attention_mask` removal.
-
-## Attention Backend Selection
-
-| Backend | Config | head_dim limit | torch_compile | Notes |
-|---------|--------|---------------|---------------|-------|
-| FA2 | `attn_implementation: flash_attention_2` | 256 | ✅ | Fastest when supported |
-| FA4 | auto with `attn_implementation: flash_attention_2` | 256 (SM90+) | ✅ | Auto-detected on H100+ |
-| SDPA | `attn_implementation: sdpa` | None | ✅ | Universal fallback |
-| flex | `attn_implementation: flex_attention` | None | ⚠️ Triton OOM for large head_dim | Good for variable head dims |
-| eager | `attn_implementation: eager` | None | ✅ | Slowest, always works |
-
-**Check model support**: Look at `_supports_flash_attn_2`, `_supports_flex_attn`, `_supports_sdpa` attributes on the model class.
-
-**head_dim gotcha**: The 256 limit is specific to flash-attn CUDA kernels, NOT PyTorch-level. SDPA and flex_attention both handle arbitrary head_dim. Models with `global_head_dim > 256` (Gemma4: 512) must use SDPA or flex.
-
-**flex + compile gotcha**: `torch_compile` with flex_attention can hit Triton shared memory OOM for large head_dim. Falls back to eager per-function (not a crash, but slower). Unsloth disables flex for Gemma4 for this reason.
-
-## Cut Cross Entropy (CCE)
-
-### How CCE patches work
-CCE replaces the model's `forward()` with a fused version that computes loss from hidden states + lm_head weight without materializing the full logits tensor. This saves ~`batch × seq_len × vocab_size × dtype_bytes` of VRAM.
-
-### Adding CCE for a new model
-1. Check if the model type is in `cut_cross_entropy.transformers.patch.PATCH_FNS`
-2. If not, axolotl's generic fallback (`integrations/cut_cross_entropy/__init__.py` `patch_llama_like()`) patches `{Prefix}ForCausalLM.forward` with `cce_forward`
-3. For multimodal models (`ForConditionalGeneration`), a model-specific patch is needed in `ml-cross-entropy` repo
-4. The multimodal `cce_forward` must accept all extra kwargs (pixel_values, mm_token_type_ids, etc.) and pop any that would conflict before calling `self.model()`
-
-### Common CCE pitfall
-If CCE appears active (log says "Applying Cut Cross Entropy") but peak VRAM doesn't decrease, check which class was patched. If the model loads as `ForConditionalGeneration` but CCE patched `ForCausalLM`, the patch is silently inactive.
-
-## MoE Models
-
-### Dense MLP vs MoE experts
-Some MoE models (e.g., Gemma4) have BOTH dense MLP layers and MoE expert layers at every decoder layer:
-- `gate_proj/up_proj/down_proj` → targets the **dense MLP** (`Gemma4TextMLP`)
-- `experts.gate_up_proj/experts.down_proj` → targets the **MoE experts** (`Gemma4TextExperts`)
-
-LoRA on the dense MLP works normally. Expert LoRA via `lora_target_parameters` requires PEFT support for the specific expert module type (may warn "Unsupported layer type").
-
-### ScatterMoE kernels
-`use_scattermoe: true` with `experts_implementation: scattermoe` registers fused expert kernels via transformers' `ExpertsInterface`. Significant speedup for MoE models. Requires the kernels plugin:
-```yaml
-plugins:
-  - axolotl.integrations.kernels.KernelsPlugin
-use_kernels: true
-use_scattermoe: true
-experts_implementation: scattermoe
-```
-
-## Where to Add Model-Specific Fixes
-
-| What | Where | Example |
-|------|-------|---------|
-| Missing forward inputs | `core/trainers/base.py` `compute_loss()` | mm_token_type_ids injection |
-| Attention mask fixes | `core/trainers/base.py` `compute_loss()` | Sample packing mask removal |
-| Loss logging fixes | `core/trainers/base.py` `__init__()` | model_accepts_loss_kwargs override |
-| PEFT/LoRA patches | `loaders/adapter.py` | ClippableLinear redirect |
-| Attention patches | `monkeypatch/attention/` | FA4 tuple fix |
-| Model-specific patches | `loaders/patch_manager.py` `_apply_model_specific_patches()` | Llama4, Kimi, NemotronH |
-| CCE patches | `ml-cross-entropy` repo `transformers/` | Per-model cce_forward |
-| Example configs | `examples/<model>/` | Validated YAML |
-| Config validation | `utils/schemas/validation.py` | Compatibility checks |
--- a/docs/agents/preference_tuning.md
+++ b/docs/agents/preference_tuning.md
@@ -38,7 +38,7 @@ No vLLM server needed (unlike GRPO). Offline RL with pre-collected preference da

 1. Paired preference data (chosen + rejected)?
   - Default → `rl: dpo`
-   - Overfitting → `rl: dpo, dpo_loss_type: ["ipo"]`
+   - Overfitting → `rl: ipo`
   - VRAM-limited → `rl: orpo` (no ref model)
   - Length-sensitive → `rl: simpo` (no ref model)
 2. Only binary labels (good/bad)? → `rl: kto`
--- a/docs/agents/sft.md
+++ b/docs/agents/sft.md
@@ -83,7 +83,7 @@ Watch for: loss never decreasing (check `train_on_inputs`, dataset, LR), loss go
 | Issue | Fix |
 |-------|-----|
 | OOM during training | Reduce `micro_batch_size`, enable `gradient_checkpointing`, reduce `sequence_len` |
-| `sample_packing` + SDPA + bf16 = 0.0 loss | Use `attn_implementation: flash_attention_2` or disable `sample_packing` |
+| `sample_packing` + SDPA + bf16 = 0.0 loss | Use `flash_attention: true` or disable `sample_packing` |
 | Missing chat template error | Set `chat_template: chatml` explicitly |
 | Label masking wrong | Run `axolotl preprocess config.yaml --debug` and inspect labels |
 | Loss NaN | Use `bf16: auto`, lower LR, check data for empty samples |
@@ -91,30 +91,6 @@ Watch for: loss never decreasing (check `train_on_inputs`, dataset, LR), loss go
 | FSDP save hangs | Use `fsdp_state_dict_type: FULL_STATE_DICT` |
 | DeepSpeed CheckpointError | Set `use_reentrant: true` in `gradient_checkpointing_kwargs` |

-## Profiling
-
-To profile training and identify optimization opportunities:
-
-```yaml
-# Profile steps 3-7 (after warmup/autotuning settles)
-profiler_steps_start: 3
-profiler_steps: 5
-```
-
-This produces `profiler_trace.json` (Chrome trace) and `snapshot.pickle` (memory snapshot) in `output_dir`.
-View the Chrome trace at `chrome://tracing`.
-
-To programmatically inspect the trace:
-```bash
-python scripts/analyze_profile.py output_dir/
-```
-
-The trace shows per-kernel CUDA times, memory allocations, and operator-level breakdown. Look for:
-- **Large matmul kernels**: candidates for fusion or quantization
-- **Memory copies (H2D/D2H)**: unnecessary data movement
-- **Small frequent kernels**: candidates for kernel fusion
-- **Gaps between kernels**: pipeline bubbles from CPU overhead
-
 Full troubleshooting: [training_stability.qmd](../training_stability.qmd), [debugging.qmd](../debugging.qmd)

 ## File Map
--- a/docs/attention.qmd
+++ b/docs/attention.qmd
@@ -3,71 +3,28 @@ title: Attention
 description: Supported attention modules in Axolotl
 ---

-Axolotl routes attention via a single config field:
+## SDP Attention
+
+This is the default built-in attention in PyTorch.

 ```yaml
-attn_implementation: <backend>
+sdp_attention: true
 ```

-`attn_implementation` is passed through to `transformers` verbatim (via
-`model.config._attn_implementation`). Accepted values are the HF-native
-backends, axolotl-registered backends, or a hub-kernel path.
+For more details: [PyTorch docs](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)

-## Backends
+## Flash Attention

-| `attn_implementation` | Description |
-|---|---|
-| `eager` | Plain PyTorch attention. No packing support. |
-| `sdpa` | PyTorch `scaled_dot_product_attention`. No packing support. |
-| `flash_attention_2` | Dao-AILab Flash Attention 2. |
-| `flash_attention_3` | Dao-AILab Flash Attention 3 (Hopper+). |
-| `flex_attention` | Torch Flex Attention (requires torch ≥ 2.6). |
-| `xformers` | xFormers memory-efficient attention. |
-| `sage` | SageAttention (QK int8 / PV fp16). |
-| `s2` | Shifted-Sparse Attention (LLaMA only, FA2 under the hood). |
-| `fp8` | torchao FP8 low-precision attention (requires SM90+, torch ≥ 2.11). Loaded as SDPA and patched post-load. |
-| `kernels-community/flash-attn3` | HF hub FA3 kernel. |
-| `kernels-community/sage-attention` | HF hub SageAttention kernel. |
-| Other `<org>/<name>` path | Any hub-kernel path supported by `transformers`. |
-
-Short-form aliases (`flash`, `fa2`, `flex`, `sdp`, etc.) are **not accepted** —
-set the canonical name above.
-
-### Capability flags
-
-Axolotl derives three boolean capability flags from `attn_implementation` and
-exposes them on the validated config:
-
- `cfg.attn_supports_packing` — backend supports varlen sample packing via
-  `position_ids`. Gates multipack patches and `sample_packing_drop_attention_mask`.
- `cfg.attn_uses_flash_lib` — backend needs the `flash_attn` (Dao-AILab)
-  monkeypatches (FA4 auto, LLaMA flash hijack, ring-FA).
- `cfg.attn_needs_dtype_cast` — backend requires fp16/bf16 embeddings
-  (everything except `eager` and `sdpa`).
-
-These are **computed** — they cannot be overridden from YAML.
-
-## Per-backend notes
-
-### SDPA
-
-Default PyTorch attention. See
-[PyTorch docs](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html).
+Axolotl supports Flash Attention 2, 3, and 4. The best available version is used automatically
+based on your installed packages and GPU.

 ```yaml
-attn_implementation: sdpa
+flash_attention: true
 ```

-### Flash Attention
+For more details: [Flash Attention](https://github.com/Dao-AILab/flash-attention/)

-Axolotl supports FA2, FA3, and FA4. The best available version is used
-automatically based on your installed packages and GPU.
-
-```yaml
-attn_implementation: flash_attention_2  # or flash_attention_3
-```
-
-#### Flash Attention 2
+### Flash Attention 2

 Requirements: Ampere, Ada, or Hopper GPUs (Turing or lower not supported)

@@ -82,20 +39,20 @@ Alternatively, try reinstall or downgrade a version.

 :::

-#### Flash Attention 3
+### Flash Attention 3

 Requirements: Hopper only and CUDA 12.8 (recommended)

 ```bash
 git clone https://github.com/Dao-AILab/flash-attention.git
 cd flash-attention/hopper
+
 python setup.py install
 ```

-#### Flash Attention 4
+### Flash Attention 4

-Requirements: Hopper or Blackwell GPUs. Auto-applied when `attn_uses_flash_lib`
-is true and FA4 is importable.
+Requirements: Hopper or Blackwell GPUs

 ```bash
 pip install flash-attn-4
@@ -106,6 +63,7 @@ Or from source:
 ```bash
 git clone https://github.com/Dao-AILab/flash-attention.git
 cd flash-attention/flash_attn/cute
+
 pip install -e .

 # FA2's flash_attn package includes a cute/ stub that shadows FA4.
@@ -128,113 +86,93 @@ and falls back to FA2/3.

 :::

+For more details: [flash-attention/flash_attn/cute](https://github.com/Dao-AILab/flash-attention/tree/main/flash_attn/cute)
+
 ### AMD

-Requirements: ROCm 6.0 and above. See
-[Flash Attention AMD docs](https://github.com/Dao-AILab/flash-attention/tree/main?tab=readme-ov-file#amd-rocm-support).
+Requirements: ROCm 6.0 and above.

-### Flex Attention
+See [Flash Attention AMD docs](https://github.com/Dao-AILab/flash-attention/tree/main?tab=readme-ov-file#amd-rocm-support).
+
+## Flex Attention
+
+A flexible PyTorch API for attention used in combination with `torch.compile`.

 ```yaml
-attn_implementation: flex_attention
-torch_compile: true  # recommended
+flex_attention: true
+
+# recommended
+torch_compile: true
 ```

-Requires torch ≥ 2.6. See [PyTorch docs](https://pytorch.org/blog/flexattention/).
+::: {.callout-note}

-### SageAttention
+We recommend using the latest stable version of PyTorch for best performance.

-Requirements: Ampere, Ada, or Hopper GPUs.
+:::
+
+For more details: [PyTorch docs](https://pytorch.org/blog/flexattention/)
+
+## SageAttention
+
+Attention kernels with QK Int8 and PV FP16 accumulator.

 ```yaml
-attn_implementation: sage
+sage_attention: true
 ```

+Requirements: Ampere, Ada, or Hopper GPUs
+
 ```bash
 pip install sageattention==2.2.0 --no-build-isolation
 ```

 ::: {.callout-warning}

-Only LoRA/QLoRA recommended. Full finetuning has been observed to drop loss to 0. See
-[GitHub Issue](https://github.com/thu-ml/SageAttention/issues/198).
+Only LoRA/QLoRA is recommended at the moment. We found that the loss drops to 0 during full finetuning. See [GitHub Issue](https://github.com/thu-ml/SageAttention/issues/198).

 :::

-For more details: [Sage Attention](https://github.com/thu-ml/SageAttention).
+For more details: [Sage Attention](https://github.com/thu-ml/SageAttention)

-### xFormers
+::: {.callout-note}
+
+We do not support SageAttention 3 at the moment. If you are interested in adding this or improving the SageAttention implementation, please open an Issue.
+
+:::
+
+
+## xFormers

 ```yaml
-attn_implementation: xformers
+xformers_attention: true
 ```

 ::: {.callout-tip}

-Recommended for Turing GPUs or below (e.g. Colab T4).
+We recommend using this with Turing GPUs or below (such as on Colab).

 :::

-### Shifted Sparse Attention
+For more details: [xFormers](https://github.com/facebookresearch/xformers)
+
+## Shifted Sparse Attention

 ::: {.callout-warning}

-Planned for deprecation. Prefer one of the backends above.
+We plan to deprecate this! If you use this feature, we recommend switching to one of the methods above.

 :::

-Requirements: LLaMA model architecture. Loaded as FA2 under the hood and
-patched to implement shifted-sparse attention. Does not support sample packing.
+Requirements: LLaMA model architecture

 ```yaml
-attn_implementation: s2
+flash_attention: true
+s2_attention: true
 ```

-### FP8
+::: {.callout-tip}

-torchao low-precision attention. Loaded as SDPA and patched post-load.
-
-Requirements: SM90+ (Hopper/Blackwell), PyTorch ≥ 2.11, torchao ≥ 0.17,
-flash-attn with FA3. KV caching must be disabled.
-
-```yaml
-attn_implementation: fp8
-```
-
-### Hub kernels
-
-```yaml
-attn_implementation: kernels-community/flash-attn3
-```
-
-Passed through to `transformers`; axolotl does not install the kernel itself.
-For recognized hub paths the capability flags are set automatically; for
-arbitrary paths axolotl uses conservative defaults (`attn_supports_packing=False`,
-`attn_uses_flash_lib=False`).
-
-## Migrating from legacy boolean flags
-
-The following legacy config fields are **deprecated** and will be removed in a
-future release. Each emits a `DeprecationWarning` when set and is stripped from
-the validated config.
-
-| Legacy | Canonical |
-|---|---|
-| `flash_attention: true` | `attn_implementation: flash_attention_2` |
-| `sdp_attention: true` | `attn_implementation: sdpa` |
-| `xformers_attention: true` | `attn_implementation: xformers` |
-| `flex_attention: true` | `attn_implementation: flex_attention` |
-| `sage_attention: true` | `attn_implementation: sage` |
-| `s2_attention: true` | `attn_implementation: s2` |
-| `eager_attention: true` | `attn_implementation: eager` |
-
-Combining `attn_implementation` with a legacy flag (e.g. `attn_implementation:
-flash_attention_2` **and** `flash_attention: true`) raises — pick one.
-
-::: {.callout-note}
-
-Existing example configs under `examples/` still use the legacy flags. They
-continue to work with a deprecation warning; they will be migrated in a
-follow-up pass.
+No sample packing support!

 :::
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -108,14 +108,6 @@ datasets:
    type: chat_template
 ```

-::: {.callout-tip}
-`chat_template_jinja` also accepts a file path to a `.jinja2` file instead of an inline string:
-
-```yaml
-chat_template_jinja: ./path/to/my_template.jinja2
-```
-:::
-
 ::: {.callout-important}
 Please make sure that your `tokenizer.eos_token` is same as EOS (End-of-Sequence) token in template. Otherwise, set `eos_token` under `special_tokens: `.
 :::
@@ -302,113 +294,6 @@ datasets:
 It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
 :::

-#### Content parts with per-part training control
-
-Instead of using character offsets with `train_detail`, you can split a message's content into a list of parts, each with its own training flag. This is useful when you want to mask specific sections of a response (e.g., mask reasoning but train on the answer).
-
-```{.json filename="data.jsonl"}
-{
-  "messages": [
-    {"role": "user", "content": [{"type": "text", "text": "What is 2+2?"}]},
-    {
-      "role": "assistant",
-      "content": [
-        {"type": "text", "text": "Let me think step by step...", "train": false},
-        {"type": "text", "text": " The answer is 4.", "train": true}
-      ]
-    }
-  ]
-}
-```
-
-The configuration is the same as standard `chat_template` — no extra fields needed:
-
-```yaml
-datasets:
-  - path: ...
-    type: chat_template
-    roles_to_train: ["assistant"]
-```
-
-Each content part supports:
-
- `type`: `"text"` (required)
- `text`: the text value (also accepts `content` or `value` as the key)
- `train`: `true`/`false` (optional) — whether to train on this part
- `weight`: `0`/`1` (optional) — alternative to `train`
-
-If a part has no `train` or `weight` flag, it inherits the turn-level training decision (from `roles_to_train`, `message_field_training`, or `train_on_inputs`).
-
-::: {.callout-warning title="Whitespace at part boundaries"}
-BPE tokenizers (used by Llama, Qwen, Mistral, GPT, etc.) prepend spaces to word tokens. For example, `" answer"` is a single token — the space is part of it. This means **where you place whitespace between content parts matters**:
-
-**Split BEFORE spaces** (space goes with the next part):
-
-```json
-[
-  {"type": "text", "text": "Let me think...", "train": false},
-  {"type": "text", "text": " The answer is 4.", "train": true}
-]
-```
-
-**DON'T put trailing spaces** on a part (the space merges with the next word into one token that straddles the boundary, and straddling tokens are masked):
-
-```json
-[
-  {"type": "text", "text": "Let me think... ", "train": false},
-  {"type": "text", "text": "The answer is 4.", "train": true}
-]
-```
-
-In the bad example, `" The"` becomes a single token that spans both parts. Because it straddles the boundary, it is conservatively **masked** (not trained) — even though the second part has `train: true`.
-
-**Newlines** typically merge with preceding punctuation (e.g., `":\n"` is one token). Keep newlines with the preceding part:
-
-```json
-[
-  {"type": "text", "text": "Thinking:\n", "train": false},
-  {"type": "text", "text": "The answer is 4.", "train": true}
-]
-```
-
-Axolotl will log a warning if it detects trailing whitespace at a boundary between parts with different training flags.
-:::
-
-::: {.callout-note}
-When all content parts in a message are strings, they are concatenated before being passed to the chat template. This means content parts work with **any** Jinja template — the template sees a plain string, and the per-part training flags are applied during tokenization.
-:::
-
-##### Per-part training on reasoning_content
-
-For templates that support a separate `reasoning_content` field (e.g., `qwen3`), the same content-parts format works on `reasoning_content`. This is useful for masking incorrect reasoning steps while training on self-corrections:
-
-```{.json filename="data.jsonl"}
-{
-  "messages": [
-    {"role": "user", "content": [{"type": "text", "text": "What is 2+2?"}]},
-    {
-      "role": "assistant",
-      "reasoning_content": [
-        {"type": "text", "text": "Hmm maybe 2+2=5.", "train": false},
-        {"type": "text", "text": " Wait no, 2+2=4.", "train": true}
-      ],
-      "content": [
-        {"type": "text", "text": "The answer is 4.", "train": true}
-      ]
-    }
-  ]
-}
-```
-
-The `reasoning_content` and `content` fields are handled independently — each has its own token boundaries and per-part masking. No additional configuration is needed beyond what the template already requires.
-
-::: {.callout-tip}
-When `reasoning_content` is provided as a separate field, `split_thinking` is not needed — the reasoning is already separated from the content in the data.
-:::
-
-The same whitespace rules apply to `reasoning_content` parts as to `content` parts — split before spaces, keep newlines with the preceding part.
-
-
 #### Reasoning split

 (For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
--- a/docs/debugging.qmd
+++ b/docs/debugging.qmd
@@ -76,9 +76,8 @@ datasets:
 Make sure you have an [editable install](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) of Axolotl, which ensures that changes you make to the code are reflected at runtime.  Run the following commands from the root of this project:

 ```bash
-export UV_TORCH_BACKEND=cu128  # or cu130
-uv sync --extra flash-attn --extra deepspeed --group dev --group test
-source .venv/bin/activate
+pip3 install packaging
+pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
 ```

 #### Remote Hosts
@@ -209,17 +208,17 @@ cd axolotl
 Next, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:[^2]

 ```bash
-docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl-uv:main-latest
+docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-py3.10-cu118-2.0.1
 ```

 >[!Tip]
 > To understand which containers are available, see the [Docker section of the README](../README.md#docker) and the [DockerHub repo](https://hub.docker.com/r/axolotlai/axolotl/tags).  For details of how the Docker containers are built, see axolotl's [Docker CI builds](../.github/workflows/main.yml).

-You will now be in the container.  Next, install Axolotl with dev dependencies:
+You will now be in the container.  Next, perform an editable install of Axolotl:

 ```bash
-uv sync --extra flash-attn --extra deepspeed --group dev --group test
-source .venv/bin/activate
+pip3 install packaging
+pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
 ```

 ### Attach To Container
--- a/docs/docker.qmd
+++ b/docs/docker.qmd
@@ -6,30 +6,23 @@ format:
    toc-depth: 4
 ---

-This section describes the different Docker images that are released by AxolotlAI at
-[Docker Hub](https://hub.docker.com/u/axolotlai).
+This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).

 ::: {.callout-important}
-For Blackwell GPUs, please use the tags with PyTorch 2.9.1 and CUDA 12.8.
-:::
-
-::: {.callout-tip}
-Each image below is available in a **uv variant** that uses [uv](https://docs.astral.sh/uv/) with
-a relocatable venv (`/workspace/axolotl-venv`) instead of Miniconda + pip. Append `-uv` to the image name
-(e.g. `axolotlai/axolotl-base-uv`). Tags follow the same format. We recommend the uv images for new deployments.
+For Blackwell GPUs, please use the tags with PyTorch 2.7.1 and CUDA 12.8.
 :::

 ## Base

-The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image.
-It includes python, torch, git, git-lfs, awscli, pydantic, and more.
+The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.

 #### Image

-| Variant | Image | Docker Hub |
-|---------|-------|------------|
-| pip | `axolotlai/axolotl-base` | [Link](https://hub.docker.com/r/axolotlai/axolotl-base) |
-| uv | `axolotlai/axolotl-base-uv` | [Link](https://hub.docker.com/r/axolotlai/axolotl-base-uv) |
+```
+axolotlai/axolotl-base
+```
+
+Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-base)

 #### Tags format

@@ -39,10 +32,8 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}

 Tags examples:

+- `main-base-py3.11-cu128-2.8.0`
 - `main-base-py3.11-cu128-2.9.1`
- `main-base-py3.12-cu128-2.10.0`
- `main-base-py3.12-cu130-2.9.1`
- `main-base-py3.12-cu130-2.10.0`

 ## Main

@@ -50,10 +41,11 @@ The main image is the image that is used to run Axolotl. It is based on the `axo

 #### Image

-| Variant | Image | Docker Hub |
-|---------|-------|------------|
-| pip | `axolotlai/axolotl` | [Link](https://hub.docker.com/r/axolotlai/axolotl) |
-| uv | `axolotlai/axolotl-uv` | [Link](https://hub.docker.com/r/axolotlai/axolotl-uv) |
+```
+axolotlai/axolotl
+```
+
+Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl)

 #### Tags format {#sec-main-tags}

@@ -61,7 +53,7 @@ The main image is the image that is used to run Axolotl. It is based on the `axo
 # on push to main
 main-py{python_version}-cu{cuda_version}-{pytorch_version}

-# latest main (currently torch 2.9.1, python 3.11, cuda 12.8)
+# latest main (currently torch 2.6.0, python 3.11, cuda 12.4)
 main-latest

 # nightly build
@@ -79,12 +71,11 @@ There may be some extra tags appended to the image, like `-vllm` which installs

 Tags examples:

+- `main-py3.11-cu128-2.8.0`
 - `main-py3.11-cu128-2.9.1`
- `main-py3.12-cu128-2.10.0`
- `main-py3.12-cu130-2.9.1`
- `main-py3.12-cu130-2.10.0`
 - `main-latest`
- `main-20260315-py3.11-cu128-2.9.1`
+- `main-20250303-py3.11-cu124-2.6.0`
+- `main-20250303-py3.11-cu126-2.6.0`
 - `0.12.0`

 ## Cloud
@@ -99,10 +90,11 @@ Jupyter lab is run by default. Set `JUPYTER_DISABLE=1` in the environment variab

 #### Image

-| Variant | Image | Docker Hub |
-|---------|-------|------------|
-| pip | `axolotlai/axolotl-cloud` | [Link](https://hub.docker.com/r/axolotlai/axolotl-cloud) |
-| uv | `axolotlai/axolotl-cloud-uv` | [Link](https://hub.docker.com/r/axolotlai/axolotl-cloud-uv) |
+```
+axolotlai/axolotl-cloud
+```
+
+Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-cloud)

 #### Tags format

--- a/docs/ebft.qmd
+++ b/docs/ebft.qmd
@@ -129,7 +129,7 @@ gradient_accumulation_steps: 4
 max_steps: 20
 learning_rate: 5.0e-6
 bf16: auto
-attn_implementation: flash_attention_2
+flash_attention: true
 gradient_checkpointing: true
 output_dir: ./outputs/ebft-quickstart
 ```
@@ -304,7 +304,7 @@ lora_alpha: 32
 lora_target_linear: true

 bf16: auto
-attn_implementation: flex_attention
+flex_attention: true
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: true          # Required with flex_attention
--- a/docs/grpo.qmd
+++ b/docs/grpo.qmd
@@ -154,7 +154,7 @@ lr_scheduler: cosine
 warmup_steps: 10

 bf16: true
-attn_implementation: flash_attention_2
+flash_attention: true
 gradient_checkpointing: true

 special_tokens:
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -15,30 +15,64 @@ This guide covers all the ways you can install and set up Axolotl for your envir

 - NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
 - Python ≥3.11
- PyTorch ≥2.9.0
+- PyTorch ≥2.6.0

-## Installation {#sec-installation}
+## Installation Methods {#sec-installation-methods}
+
+::: {.callout-important}
+Please make sure to have PyTorch installed before installing Axolotl in your local environment.
+
+Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
+:::

 ::: {.callout-important}
 For Blackwell GPUs, please use Pytorch 2.9.1 and CUDA 12.8.
 :::

-### Quick Install {#sec-uv}
+### PyPI Installation (Recommended) {#sec-pypi}

-Axolotl uses [uv](https://docs.astral.sh/uv/) as its package manager. uv is a fast, reliable Python package installer and resolver built in Rust.
+```{.bash}
+pip3 install -U packaging setuptools wheel ninja
+pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
+```

-Install uv if not already installed:
+We use `--no-build-isolation` so that the build can detect an already-installed
+PyTorch version and avoid clobbering it, and so that we select the correct
+versions of dependencies that are specific to that PyTorch version or to other
+installed co-dependencies.
+
+### uv Installation {#sec-uv}
+
+uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments.
+
+Install uv if not already installed:
 ```{.bash}
 curl -LsSf https://astral.sh/uv/install.sh | sh
 source $HOME/.local/bin/env
 ```

-Choose your CUDA version (e.g. `cu128`, `cu130`), create a venv, and install:
+Choose the CUDA version to use with PyTorch (e.g. `cu124`, `cu126`, `cu128`),
+then create the venv and activate it:
 ```{.bash}
-export UV_TORCH_BACKEND=cu128  # or cu130
+export UV_TORCH_BACKEND=cu126
 uv venv --no-project --relocatable
 source .venv/bin/activate
-uv pip install --no-build-isolation axolotl[flash-attn,deepspeed]
+```
+
+Install PyTorch
+- PyTorch 2.6.0 recommended
+```{.bash}
+uv pip install packaging setuptools wheel
+uv pip install torch==2.6.0
+uv pip install awscli pydantic
+```
+
+Install axolotl from PyPI:
+```{.bash}
+uv pip install --no-build-isolation axolotl[deepspeed,flash-attn]
+
+# optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO
+uv pip install --no-build-isolation axolotl[deepspeed,flash-attn,vllm]
 ```

 ### Edge/Development Build {#sec-edge-build}
@@ -48,17 +82,14 @@ For the latest features between releases:
 ```{.bash}
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
-export UV_TORCH_BACKEND=cu128  # or cu130
-uv sync --extra flash-attn --extra deepspeed
-source .venv/bin/activate
+pip3 install -U packaging setuptools wheel ninja
+pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
 ```

-`uv sync` creates a `.venv`, installs exact pinned versions from `uv.lock`, and sets up an editable install automatically.
-
 ### Docker {#sec-docker}

 ```{.bash}
-docker run --gpus '"all"' --rm -it --ipc=host axolotlai/axolotl-uv:main-latest
+docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
 ```

 For development with Docker:
@@ -75,12 +106,12 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
  --ulimit memlock=-1 --ulimit stack=67108864 \
  --mount type=bind,src="${PWD}",target=/workspace/axolotl \
  -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-  axolotlai/axolotl-uv:main-latest
+  axolotlai/axolotl:main-latest
 ```
 :::

 ::: {.callout-important}
-For Blackwell GPUs, please use `axolotlai/axolotl-uv:main-py3.11-cu128-2.9.1` or the cloud variant `axolotlai/axolotl-cloud-uv:main-py3.11-cu128-2.9.1`.
+For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.9.1` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.9.1`.
 :::

 Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.
@@ -91,7 +122,7 @@ Please refer to the [Docker documentation](docker.qmd) for more information on t

 For providers supporting Docker:

- Use `axolotlai/axolotl-cloud-uv:main-latest`
+- Use `axolotlai/axolotl-cloud:main-latest`
 - Available on:
    - [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
    - [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=axolotl&utm_medium=partner&utm_campaign=template_launch_july2025&utm_content=docs_link)
@@ -110,7 +141,7 @@ For providers supporting Docker:
 ### macOS {#sec-macos}

 ```{.bash}
-uv pip install --no-build-isolation -e '.'
+pip3 install --no-build-isolation -e '.'
 ```

 See @sec-troubleshooting for Mac-specific issues.
@@ -121,44 +152,21 @@ See @sec-troubleshooting for Mac-specific issues.
 We recommend using WSL2 (Windows Subsystem for Linux) or Docker.
 :::

-## Migrating from pip to uv {#sec-migrating}
+## Environment Managers {#sec-env-managers}

-If you have an existing pip-based Axolotl installation, you can migrate to uv:
+### Conda/Pip venv {#sec-conda}

-```{.bash}
-# Install uv
-curl -LsSf https://astral.sh/uv/install.sh | sh
-source $HOME/.local/bin/env
-
-# Create a fresh venv (recommended for a clean start)
-export UV_TORCH_BACKEND=cu128  # or cu130
-uv venv --no-project --relocatable
-source .venv/bin/activate
-
-# Reinstall axolotl
-uv pip install --no-build-isolation axolotl[flash-attn,deepspeed]
-```
-
-## Using pip (Alternative) {#sec-pip}
-
-If you are unable to install uv, you can still use pip directly.
-
-::: {.callout-important}
-Please make sure to have PyTorch installed before installing Axolotl with pip.
-
-Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
-:::
-
-```{.bash}
-pip3 install -U packaging setuptools wheel ninja
-pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
-```
-
-For editable/development installs:
-```{.bash}
-pip3 install -U packaging setuptools wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
-```
+1. Install Python ≥3.11
+2. Install PyTorch: https://pytorch.org/get-started/locally/
+3. Install Axolotl:
+   ```{.bash}
+   pip3 install -U packaging setuptools wheel ninja
+   pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
+   ```
+4. (Optional) Login to Hugging Face:
+   ```{.bash}
+   hf auth login
+   ```

 ## Troubleshooting {#sec-troubleshooting}

--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -8,7 +8,6 @@ format:

 ## Supported Models

- [Gemma-4](#sec-gemma-4) *(NEW)*
 - [Mllama](#sec-mllama)
 - [Llama4](#sec-llama4)
 - [Pixtral](#sec-pixtral)
@@ -139,40 +138,6 @@ base_model: mistralai/Voxtral-Mini-3B-2507
 processor_type: VoxtralProcessor
 ```

-### Gemma-4 {#sec-gemma-4}
-
-All Gemma 4 variants (E2B, E4B, 26B-A4B, 31B) load as multimodal models even for text-only training.
-
-```yaml
-base_model: google/gemma-4-E2B-it  # or E4B-it, 26B-A4B, 31B
-
-chat_template: gemma4
-freeze_mm_modules: true  # freeze vision/audio encoders for text-only or vision LoRA
-
-# For the 26B-A4B MoE model, enable ScatterMoE and expert LoRA:
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-  - axolotl.integrations.kernels.KernelsPlugin
-use_kernels: true
-use_scattermoe: true
-experts_implementation: scattermoe
-
-lora_target_modules: 'model.language_model.layers.[\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'
-
-# MoE expert LoRA (3D tensors, not nn.Linear) — only for 26B-A4B:
-lora_target_parameters:
-  - experts.gate_up_proj
-  - experts.down_proj
-```
-
-::: {.callout-warning}
-Gemma 4 VLM training starts with high loss (~8-15). This is expected — see the [training stability guide](training_stability.qmd) for details.
-:::
-
-::: {.callout-tip}
-For DDP training, axolotl auto-detects Gemma4 and sets `use_reentrant=False` and `ddp_find_unused_parameters=True`. However, when `activation_offloading: true`, `ddp_find_unused_parameters` is skipped (checkpoint wrappers conflict with it); use `freeze_mm_modules: true` instead to handle unused vision/audio params. For FSDP2, use `fsdp_transformer_layer_cls_to_wrap: Gemma4TextDecoderLayer`.
-:::
-
 ### Gemma-3 {#sec-gemma-3}

 ::: {.callout-tip}
--- a/docs/optimizations.qmd
+++ b/docs/optimizations.qmd
@@ -22,12 +22,12 @@ Improves GPU utilization by combining multiple short sequences into a single pac

 Using an optimized attention implementation is critical for training speed.

- **[Flash Attention 2](https://github.com/Dao-AILab/flash-attention)**: `attn_implementation: flash_attention_2`. **(Recommended)** The industry standard for fast attention on modern GPUs. Requires Ampere or higher. For AMD, check [AMD Support](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#amd-rocm-support).
- **[Flex Attention](https://pytorch.org/blog/flexattention/)**: `attn_implementation: flex_attention`.
- **[SDP Attention](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)**: `attn_implementation: sdpa`. PyTorch's native implementation.
- **[Xformers](https://github.com/facebookresearch/xformers)**: `attn_implementation: xformers`. Works with FP16.
+- **[Flash Attention 2](https://github.com/Dao-AILab/flash-attention)**: `flash_attention: true`. **(Recommended)** The industry standard for fast attention on modern GPUs. Requires Ampere or higher. For AMD, check [AMD Support](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#amd-rocm-support).
+- **[Flex Attention](https://pytorch.org/blog/flexattention/)**: `flex_attention: true`.
+- **[SDP Attention](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)**: `sdp_attention: true`. PyTorch's native implementation.
+- **[Xformers](https://github.com/facebookresearch/xformers)**: `xformers_attention: true`. Works with FP16.

-See [Attention](attention.qmd) for the full list of backends and the canonical values.
+*Note: You should only enable one attention backend.*

 ### LoRA Optimizations

--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -320,10 +320,8 @@ The input format is a simple JSON input with customizable fields based on the ab
 As IPO is just DPO with a different loss function, all supported dataset formats for [DPO](#dpo) are also supported for IPO.

 ```yaml
-rl: dpo
-dpo_loss_type: ["ipo"]
+rl: ipo
 ```
-*Note:* Passing `rl: ipo` directly is still supported, but will soon be deprecated.

 ### ORPO

@@ -1147,7 +1145,8 @@ datasets:
    type: ebft_strided_structured.transform
    split: train[:1%]

-attn_implementation: flex_attention   # Strided mode uses flex_attention
+flash_attention: false
+flex_attention: true     # Strided mode uses flex_attention
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: true    # Required for flex_attention
--- a/docs/sequence_parallelism.qmd
+++ b/docs/sequence_parallelism.qmd
@@ -55,7 +55,7 @@ To use sequence parallelism, you need:

 ## Limitations

- Flash attention must be enabled for this to work (`attn_implementation: flash_attention_2` in config YAML)
+- Flash attention must be enabled for this to work (`flash_attention: true` in config YAML)
 - May have a small performance overhead due to communication between GPUs

 ## Example
--- a/docs/training_stability.qmd
+++ b/docs/training_stability.qmd
@@ -137,6 +137,50 @@ This means the policy has diverged significantly from the weights used by vLLM f
 - Increase `gradient_accumulation_steps` to smooth out noisy batches.
 - Check for NaN issues (see next section).

+## MoE Weight Scale Drift
+
+**Symptom**: Model works on short prompts but loses coherence on long conversations — repeating itself, "philosophizing", or generating broken code. Particularly affects MoE models with recurrent/SSM components (e.g. DeltaNet linear attention).
+
+**Root cause**: In MoE models trained with AdamW, rarely-activated experts accumulate smaller second-moment estimates. This gives them a disproportionately large effective learning rate, causing their weights to drift to higher variance than the group norm. In recurrent components like `conv1d` in DeltaNet layers, this amplifies short-range context and washes out long-range state.
+
+**Detection**: Use `normalize_weight_scales` with `dry_run: true` to scan for anomalies without modifying weights:
+
+```yaml
+normalize_weight_scales:
+  - name_pattern: 'linear_attn\.conv1d\.weight'
+    threshold: 1.3
+    dry_run: true
+```
+
+This logs any tensors matching the pattern whose standard deviation exceeds 1.3x the group median. Example output:
+
+```
+normalize_weight_scales [DRY RUN]: pattern 'linear_attn\.conv1d\.weight' —
+  3/30 tensors outside 1.3x threshold (median std=0.062733):
+    layers.36.linear_attn.conv1d.weight: std=0.101870 (1.62x median)
+    layers.37.linear_attn.conv1d.weight: std=0.102362 (1.63x median)
+    layers.38.linear_attn.conv1d.weight: std=0.089227 (1.42x median)
+```
+
+Each rule accepts:
+
+- `name_pattern`: regex matched against parameter names. All matching tensors form a group.
+- `threshold`: flag tensors whose std deviates from the group median by more than this factor (default: 1.5).
+- `dry_run`: when `true`, log anomalies without modifying weights (default: `false`).
+
+Multiple rules can target different tensor patterns:
+
+```yaml
+normalize_weight_scales:
+  - name_pattern: 'linear_attn\.conv1d\.weight'
+    threshold: 1.3
+  - name_pattern: 'experts\.gate_up_proj'
+    threshold: 1.5
+    dry_run: true  # just check these, don't fix
+```
+
+The transform runs after model loading but before adapter injection, so it modifies the base model weights directly.
+
 ## NaN and Inf Handling

 ### Common Causes
@@ -245,7 +289,7 @@ For GRPO, also reduce `max_completion_length`. Memory scales quadratically with
 Reduces attention memory from O(n^2) to O(n):

 ```yaml
-attn_implementation: flash_attention_2
+flash_attention: true
 ```

 ### Step 6: Offload with DeepSpeed
--- a/docs/unsloth.qmd
+++ b/docs/unsloth.qmd
@@ -0,0 +1,53 @@
+---
+title: "Unsloth"
+description: "Hyper-optimized QLoRA finetuning for single GPUs"
+---
+
+### Overview
+
+Unsloth provides hand-written optimized kernels for LLM finetuning that slightly improve speed and reduce VRAM usage
+compared to standard industry baselines.
+
+::: {.callout-important}
+Due to breaking changes in transformers `v4.48.0`, users will need to downgrade to `<=v4.47.1` to use this patch.
+
+This will later be deprecated in favor of [LoRA Optimizations](lora_optims.qmd).
+:::
+
+
+### Installation
+
+The following will install the correct unsloth and extras from source.
+
+```bash
+python scripts/unsloth_install.py | sh
+```
+
+### Usage
+
+Axolotl exposes a few configuration options to try out unsloth and get most of the performance gains.
+
+Our unsloth integration is currently limited to the following model architectures:
+ - llama
+
+These options are specific to LoRA finetuning and cannot be used for multi-GPU finetuning:
+```yaml
+unsloth_lora_mlp: true
+unsloth_lora_qkv: true
+unsloth_lora_o: true
+```
+
+These options are composable and can be used with multi-gpu finetuning:
+```yaml
+unsloth_cross_entropy_loss: true
+unsloth_rms_norm: true
+unsloth_rope: true
+```
+
+### Limitations
+
+- Single GPU only; i.e. no multi-gpu support
+- No deepspeed or FSDP support (requires multi-gpu)
+- LoRA + QLoRA support only. No full fine tunes or fp8 support.
+- Limited model architecture support. Llama, Phi, Gemma, Mistral only
+- No MoE support.
--- a/examples/LiquidAI/README.md
+++ b/examples/LiquidAI/README.md
@@ -15,7 +15,8 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r
    Here is an example of how to install from pip:
    ```bash
    # Ensure you have a compatible version of Pytorch installed
-    uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+    pip3 install packaging setuptools wheel ninja
+    pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
    ```

 2.  Run one of the finetuning examples below.
@@ -34,7 +35,7 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r

    **LFM2-MoE**
    ```bash
-    uv pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
+    pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6

    # LoRA SFT (1x48GB @ 16.2GiB)
    axolotl train examples/LiquidAI/lfm2-8b-a1b-lora.yaml
@@ -44,7 +45,7 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r

 - **Installation Error**: If you encounter `ImportError: ... undefined symbol ...` or `ModuleNotFoundError: No module named 'causal_conv1d_cuda'`, the `causal-conv1d` package may have been installed incorrectly. Try uninstalling it:
  ```bash
-  uv pip uninstall causal-conv1d
+  pip uninstall -y causal-conv1d
  ```

 - **Dataset Loading**: Read more on how to load your own dataset in our [documentation](https://docs.axolotl.ai/docs/dataset_loading.html).
--- a/examples/LiquidAI/lfm2-350m-fft.yaml
+++ b/examples/LiquidAI/lfm2-350m-fft.yaml
@@ -39,7 +39,7 @@ tf32: true
 gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 2
--- a/examples/LiquidAI/lfm2-8b-a1b-lora.yaml
+++ b/examples/LiquidAI/lfm2-8b-a1b-lora.yaml
@@ -48,7 +48,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 2
--- a/examples/LiquidAI/lfm2-vl-lora.yaml
+++ b/examples/LiquidAI/lfm2-vl-lora.yaml
@@ -50,7 +50,8 @@ tf32: true

 gradient_checkpointing: true
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true
+eager_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/alst/llama3-8b-deepspeed-alst.yaml
+++ b/examples/alst/llama3-8b-deepspeed-alst.yaml
@@ -39,7 +39,7 @@ activation_offloading: legacy

 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_steps: 100
 saves_per_epoch: 1
--- a/examples/alst/llama3-8b-fsdp2-alst.yaml
+++ b/examples/alst/llama3-8b-fsdp2-alst.yaml
@@ -39,7 +39,7 @@ activation_offloading: legacy

 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_steps: 100
 saves_per_epoch: 1
--- a/examples/apertus/README.md
+++ b/examples/apertus/README.md
@@ -15,7 +15,8 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-uv pip install --no-build-isolation -e '.[flash-attn]'
+pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install --no-build-isolation -e '.[flash-attn]'

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
@@ -30,7 +31,7 @@ python scripts/cutcrossentropy_install.py | sh
 # For those using our Docker image, use the below path.
 export CUDA_HOME=/usr/local/cuda

-uv pip install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
+pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
 ```

 For any installation errors, see [XIELU Installation Issues](#xielu-installation-issues)
@@ -66,7 +67,7 @@ If those didn't help, please try the below solutions:
 1. Pass env for CMAKE and try install again:

    ```bash
-    Python_EXECUTABLE=$(which python) uv pip install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
+    Python_EXECUTABLE=$(which python) pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
    ```

 2. Git clone the repo and manually hardcode python path:
@@ -91,7 +92,7 @@ If those didn't help, please try the below solutions:
    ```

    ```bash
-    uv pip install . --no-build-isolation --no-deps
+    pip3 install . --no-build-isolation --no-deps
    ```

 ## Optimization Guides
--- a/examples/apertus/apertus-8b-qlora.yaml
+++ b/examples/apertus/apertus-8b-qlora.yaml
@@ -55,7 +55,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/arcee/README.md
+++ b/examples/arcee/README.md
@@ -17,7 +17,8 @@ Thanks to the team at Arcee.ai for using Axolotl in supervised fine-tuning the A
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-uv pip install --no-build-isolation -e '.[flash-attn]'
+pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install --no-build-isolation -e '.[flash-attn]'

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
--- a/examples/arcee/afm-4.5b-qlora.yaml
+++ b/examples/arcee/afm-4.5b-qlora.yaml
@@ -55,7 +55,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/archived/cerebras/btlm-ft.yml
+++ b/examples/archived/cerebras/btlm-ft.yml
@@ -59,7 +59,8 @@ gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1

-attn_implementation: flash_attention_2
+flash_attention: true
+sdp_attention:
 flash_optimum:

 gptq_groupsize:
--- a/examples/archived/cerebras/qlora.yml
+++ b/examples/archived/cerebras/qlora.yml
@@ -39,7 +39,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/code-llama/13b/lora.yml
+++ b/examples/archived/code-llama/13b/lora.yml
@@ -45,7 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/code-llama/13b/qlora.yml
+++ b/examples/archived/code-llama/13b/qlora.yml
@@ -46,7 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/code-llama/34b/lora.yml
+++ b/examples/archived/code-llama/34b/lora.yml
@@ -45,7 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/code-llama/34b/qlora.yml
+++ b/examples/archived/code-llama/34b/qlora.yml
@@ -46,7 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/code-llama/7b/lora.yml
+++ b/examples/archived/code-llama/7b/lora.yml
@@ -45,7 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/code-llama/7b/qlora.yml
+++ b/examples/archived/code-llama/7b/qlora.yml
@@ -46,7 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/dbrx/16bit-lora.yaml
+++ b/examples/archived/dbrx/16bit-lora.yaml
@@ -52,7 +52,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/archived/dbrx/8bit-lora.yaml
+++ b/examples/archived/dbrx/8bit-lora.yaml
@@ -55,7 +55,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/archived/dbrx/fft-ds-zero3.yaml
+++ b/examples/archived/dbrx/fft-ds-zero3.yaml
@@ -39,7 +39,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml
+++ b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml
@@ -45,7 +45,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/archived/falcon/config-7b-lora.yml
+++ b/examples/archived/falcon/config-7b-lora.yml
@@ -43,7 +43,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/falcon/config-7b-qlora.yml
+++ b/examples/archived/falcon/config-7b-qlora.yml
@@ -73,7 +73,8 @@ early_stopping_patience: 3
 resume_from_checkpoint:
 auto_resume_from_checkpoints: true
 logging_steps: 1
-attn_implementation: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/falcon/config-7b.yml
+++ b/examples/archived/falcon/config-7b.yml
@@ -40,7 +40,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/gemma/qlora.yml
+++ b/examples/archived/gemma/qlora.yml
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/gptj/qlora.yml
+++ b/examples/archived/gptj/qlora.yml
@@ -36,7 +36,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/jeopardy-bot/config.yml
+++ b/examples/archived/jeopardy-bot/config.yml
@@ -37,7 +37,8 @@ bf16: auto
 tf32: true
 resume_from_checkpoint:
 logging_steps: 5
-attn_implementation: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/mpt-7b/config.yml
+++ b/examples/archived/mpt-7b/config.yml
@@ -39,6 +39,7 @@ bf16: auto
 tf32: true
 resume_from_checkpoint:
 logging_steps: 5
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/openllama-3b/config.yml
+++ b/examples/archived/openllama-3b/config.yml
@@ -39,7 +39,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/openllama-3b/lora.yml
+++ b/examples/archived/openllama-3b/lora.yml
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/openllama-3b/qlora.yml
+++ b/examples/archived/openllama-3b/qlora.yml
@@ -40,7 +40,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/qwen/lora.yml
+++ b/examples/archived/qwen/lora.yml
@@ -47,6 +47,7 @@ tf32: false
 gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1
+flash_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/qwen/qlora.yml
+++ b/examples/archived/qwen/qlora.yml
@@ -47,6 +47,7 @@ tf32: false
 gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1
+flash_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/qwen/qwen2-moe-lora.yaml
+++ b/examples/archived/qwen/qwen2-moe-lora.yaml
@@ -43,7 +43,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/qwen/qwen2-moe-qlora.yaml
+++ b/examples/archived/qwen/qwen2-moe-qlora.yaml
@@ -46,7 +46,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/redpajama/config-3b.yml
+++ b/examples/archived/redpajama/config-3b.yml
@@ -40,6 +40,7 @@ bf16: auto
 tf32: true
 resume_from_checkpoint:
 logging_steps: 5
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/replit-3b/config-lora.yml
+++ b/examples/archived/replit-3b/config-lora.yml
@@ -38,6 +38,7 @@ tf32: true
 gradient_checkpointing:
 resume_from_checkpoint:
 logging_steps: 1
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/stablelm-2/1.6b/fft.yml
+++ b/examples/archived/stablelm-2/1.6b/fft.yml
@@ -44,7 +44,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 flash_attn_fuse_mlp: true
--- a/examples/archived/stablelm-2/1.6b/lora.yml
+++ b/examples/archived/stablelm-2/1.6b/lora.yml
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true

--- a/examples/archived/starcoder2/qlora.yml
+++ b/examples/archived/starcoder2/qlora.yml
@@ -46,7 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/tiny-llama/lora-mps.yml
+++ b/examples/archived/tiny-llama/lora-mps.yml
@@ -47,6 +47,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
+flash_attention: false

 warmup_ratio: 0.1
 evals_per_epoch: 0
--- a/examples/archived/tiny-llama/lora.yml
+++ b/examples/archived/tiny-llama/lora.yml
@@ -45,7 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/tiny-llama/pretrain.yml
+++ b/examples/archived/tiny-llama/pretrain.yml
@@ -36,7 +36,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/archived/tiny-llama/qlora.yml
+++ b/examples/archived/tiny-llama/qlora.yml
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml
+++ b/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml
@@ -71,7 +71,8 @@ early_stopping_patience: 3
 resume_from_checkpoint:
 auto_resume_from_checkpoints: true
 logging_steps: 1
-attn_implementation: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/yi-34B-chat/qlora.yml
+++ b/examples/archived/yi-34B-chat/qlora.yml
@@ -10,7 +10,7 @@ load_in_4bit: true
 sequence_len: 1024
 bf16: auto
 tf32: false
-attn_implementation: flash_attention_2
+flash_attention: true
 special_tokens:
  bos_token: "<|startoftext|>"
  eos_token: "<|endoftext|>"
--- a/examples/cohere/command-r-7b-qlora.yml
+++ b/examples/cohere/command-r-7b-qlora.yml
@@ -48,7 +48,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -40,7 +40,7 @@
    "%%capture\n",
    "# This step can take ~5-10 minutes to install dependencies\n",
    "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
-    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@fec1a88\""
+    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@63b15e6\""
   ]
  },
  {
--- a/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
+++ b/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
@@ -45,7 +45,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
+++ b/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
@@ -45,7 +45,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/deepseek-v2/fft-fsdp-16b.yaml
+++ b/examples/deepseek-v2/fft-fsdp-16b.yaml
@@ -35,7 +35,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 2
--- a/examples/deepseek-v2/qlora-fsdp-2_5.yaml
+++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
@@ -59,7 +59,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 2
--- a/examples/devstral/README.md
+++ b/examples/devstral/README.md
@@ -16,7 +16,8 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r

 ```bash
 # Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
 ```

 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage
--- a/examples/devstral/devstral-small-qlora.yml
+++ b/examples/devstral/devstral-small-qlora.yml
@@ -51,7 +51,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true
 scaling_softmax: true

 loss_watchdog_threshold: 5.0
--- a/examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml
+++ b/examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml
@@ -29,7 +29,7 @@ output_dir: ./outputs/ndp-out/

 sequence_len: 2048
 sample_packing: true
-attn_implementation: flash_attention_2
+flash_attention: true

 gradient_accumulation_steps: 1
 micro_batch_size: 1
--- a/examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml
+++ b/examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml
@@ -26,7 +26,7 @@ output_dir: ./outputs/ndp-out/

 sequence_len: 8192
 sample_packing: true
-attn_implementation: flash_attention_2
+flash_attention: true

 gradient_accumulation_steps: 1
 micro_batch_size: 1  # must be 1 when using context parallel
--- a/examples/eaft/eaft-example.yml
+++ b/examples/eaft/eaft-example.yml
@@ -65,7 +65,8 @@ early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
-attn_implementation: flash_attention_2
+xformers_attention:
+flash_attention: true

 warmup_ratio: 0.1
 weight_decay: 0.0
--- a/examples/ebft/llama-1b-ebft-opencode-novllm.yaml
+++ b/examples/ebft/llama-1b-ebft-opencode-novllm.yaml
@@ -46,7 +46,7 @@ lora_dropout: 0.05
 lora_target_linear: true

 bf16: auto
-attn_implementation: flash_attention_2
+flash_attention: true
 gradient_checkpointing: true

 special_tokens:
--- a/examples/ebft/llama-1b-ebft-opencode.yaml
+++ b/examples/ebft/llama-1b-ebft-opencode.yaml
@@ -66,7 +66,7 @@ lora_target_linear: true

 # --- Hardware ---
 bf16: auto
-attn_implementation: flash_attention_2
+flash_attention: true
 gradient_checkpointing: true

 special_tokens:
--- a/examples/ebft/llama-1b-ebft-strided-structured.yaml
+++ b/examples/ebft/llama-1b-ebft-strided-structured.yaml
@@ -47,7 +47,8 @@ lora_dropout: 0.05
 lora_target_linear: true

 bf16: auto
-attn_implementation: flex_attention
+flash_attention: false  # strided EBFT overrides to flex_attention (or eager fallback) at runtime
+flex_attention: true    # fused flex_attention kernel compiles itself; don't set torch_compile: true
                        # (full-model compile conflicts with gradient checkpointing + flex_attention)
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
--- a/examples/ebft/llama-1b-ebft-strided.yaml
+++ b/examples/ebft/llama-1b-ebft-strided.yaml
@@ -46,6 +46,7 @@ lora_dropout: 0.05
 lora_target_linear: true

 bf16: auto
+flash_attention: false  # strided EBFT overrides to flex_attention (or eager fallback) at runtime
 gradient_checkpointing: true

 special_tokens:
--- a/examples/ebft/llama-3b-ebft-strided-fft.yaml
+++ b/examples/ebft/llama-3b-ebft-strided-fft.yaml
@@ -48,6 +48,7 @@ lora_target_linear: true

 bf16: auto
 torch_dtype: bfloat16
+flash_attention: false
 gradient_checkpointing: true
 torch_compile: true
 gradient_checkpointing_kwargs:
--- a/examples/ebft/llama-8b-ebft-strided-fft.yaml
+++ b/examples/ebft/llama-8b-ebft-strided-fft.yaml
@@ -41,6 +41,7 @@ warmup_steps: 10
 weight_decay: 0.01

 bf16: auto
+flash_attention: false  # strided EBFT uses flex_attention at runtime
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
--- a/examples/ebft/qwen35-4b-ebft-structured-async.yaml
+++ b/examples/ebft/qwen35-4b-ebft-structured-async.yaml
@@ -72,7 +72,7 @@ lora_dropout: 0.0
 lora_target_modules: ".*\\.layers\\.(3|7|11|15|19|23|27|31)\\.self_attn\\.(q|k|v|o)_proj|.*\\.mlp\\.(gate|up|down)_proj"

 bf16: auto
-attn_implementation: flash_attention_2
+flash_attention: true
 gradient_checkpointing: true

 special_tokens:
--- a/Show More
+++ b/Show More