fix test dims

support for vllm 0.19.1
2026-04-20 20:45:19 -04:00 · 2026-04-19 18:09:46 -04:00
411 changed files with 2975 additions and 11411 deletions
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -31,11 +31,7 @@ PRs are **greatly welcome**!

 Please run below to setup env
 ```bash
-# Install axolotl + dev and test dependencies
-export UV_TORCH_BACKEND=cu128  # or cu130
-uv venv --no-project --relocatable
-source .venv/bin/activate
-uv pip install --no-build-isolation -e '.[deepspeed]' --group dev --group test
+pip3 install -r requirements-dev.txt -r requirements-tests.txt
 pre-commit install

 # test
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -30,6 +30,14 @@ jobs:
      fail-fast: false
      matrix:
        include:
+          - cuda: "128"
+            cuda_version: 12.8.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.9.0
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base"
+            platforms: "linux/amd64,linux/arm64"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
@@ -160,6 +168,14 @@ jobs:
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64,linux/arm64"
+          - cuda: "128"
+            cuda_version: 12.8.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.9.0
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-uv-base"
+            platforms: "linux/amd64,linux/arm64"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -6,7 +6,7 @@ on:
      types: [opened, synchronize, reopened, ready_for_review]
      paths:
       - '**.py'
-       - 'pyproject.toml'
+       - 'requirements.txt'
       - '.github/workflows/*.yml'
       - "*.[q]md"
       - "examples/**/*.y[a]?ml"
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -18,6 +18,12 @@ jobs:
      fail-fast: false
      matrix:
        include:
+          - cuda: 128
+            cuda_version: 12.8.1
+            python_version: "3.11"
+            pytorch: 2.9.0
+            axolotl_extras:
+            platforms: "linux/amd64,linux/arm64"
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
@@ -174,6 +180,12 @@ jobs:
      fail-fast: false
      matrix:
        include:
+          - cuda: 128
+            cuda_version: 12.8.1
+            python_version: "3.11"
+            pytorch: 2.9.0
+            axolotl_extras:
+            platforms: "linux/amd64,linux/arm64"
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -3,15 +3,17 @@ name: docker-multigpu-tests-biweekly
 on:
  pull_request:
    paths:
-      - "tests/e2e/multigpu/**.py"
-      - "pyproject.toml"
-      - ".github/workflows/multi-gpu-e2e.yml"
-      - "scripts/cutcrossentropy_install.py"
-      - "src/axolotl/core/trainers/mixins/sequence_parallel.py"
-      - "src/axolotl/utils/distributed.py"
+      - 'tests/e2e/multigpu/**.py'
+      - 'requirements.txt'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - '.github/workflows/multi-gpu-e2e.yml'
+      - 'scripts/cutcrossentropy_install.py'
+      - 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
+      - 'src/axolotl/utils/distributed.py'
  workflow_dispatch:
  schedule:
-    - cron: "0 0 * * 1,4" # Runs at 00:00 UTC every monday & thursday
+    - cron: '0 0 * * 1,4'  # Runs at 00:00 UTC every monday & thursday

 # Cancel jobs on the same ref if a new one is triggered
 concurrency:
@@ -31,19 +33,19 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          #          - cuda: 129
-          #            cuda_version: 12.9.1
-          #            python_version: "3.12"
-          #            pytorch: 2.9.1
-          #            axolotl_extras: "fbgemm-gpu"
-          #            num_gpus: 2
-          #            dockerfile: "Dockerfile-uv.jinja"
+#          - cuda: 129
+#            cuda_version: 12.9.1
+#            python_version: "3.12"
+#            pytorch: 2.9.1
+#            axolotl_extras: "fbgemm-gpu"
+#            num_gpus: 2
+#            dockerfile: "Dockerfile-uv.jinja"
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
-            #            axolotl_extras: fbgemm-gpu
+#            axolotl_extras: fbgemm-gpu
            num_gpus: 2
          - cuda: 128
            cuda_version: 12.8.1
@@ -51,6 +53,7 @@ jobs:
            pytorch: 2.10.0
            axolotl_extras: "fbgemm-gpu"
            num_gpus: 2
+            dockerfile: "Dockerfile-uv.jinja"
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
    steps:
@@ -72,7 +75,7 @@ jobs:
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -8,9 +8,6 @@ on:

 permissions: {}

-env:
-  UV_SYSTEM_PYTHON: "1"
-
 jobs:
  setup_release:
    name: Create Release
@@ -44,15 +41,11 @@ jobs:
        with:
          python-version: "3.11"

-      - name: Install uv
-        uses: astral-sh/setup-uv@v7
-
      - name: Install dependencies
        run: |
-          uv pip install wheel packaging
-          uv pip install --no-build-isolation -e .
-          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
-            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
+          pip3 install wheel packaging==26.0
+          pip3 install --no-build-isolation -e .
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt

      - name: Extract tag name
        id: tag
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -2,18 +2,15 @@ name: Tests Nightly against upstream main
 on:
  workflow_dispatch:
  schedule:
-    - cron: "0 0 * * *" # Runs at 00:00 UTC every day
+    - cron: '0 0 * * *'  # Runs at 00:00 UTC every day
  pull_request:
    types: [opened, synchronize, reopened, ready_for_review]
    paths:
-      - ".github/workflows/tests-nightly.yml"
+      - '.github/workflows/tests-nightly.yml'

 permissions:
  contents: read

-env:
-  UV_SYSTEM_PYTHON: "1"
-
 jobs:
  pre-commit:
    name: pre-commit
@@ -23,7 +20,7 @@ jobs:
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
-          cache: "pip" # caching pip dependencies
+          cache: 'pip' # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
        env:
          SKIP: no-commit-to-branch
@@ -46,7 +43,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python_version: ["3.12"] # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
+        python_version: ["3.12"]  # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
        pytorch_version: ["2.9.1", "2.10.0"]
    timeout-minutes: 20

@@ -64,34 +61,36 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
+          cache: 'pip' # caching pip dependencies

-      - name: Install uv
-        uses: astral-sh/setup-uv@v7
+      - name: upgrade pip
+        run: |
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging==26.0 setuptools==78.1.1 wheel

      - name: Install PyTorch
        run: |
-          uv pip install torch==${{ matrix.pytorch_version }} torchvision
-          uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
+          pip3 install torch==${{ matrix.pytorch_version }} torchvision
+
+      - name: Update requirements.txt
+        run: |
+          sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt
+          sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt
+          sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt
+          sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt
+          sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt

      - name: Install dependencies
        run: |
-          uv pip install --no-build-isolation -e . --override /tmp/torch-pin.txt
-          python scripts/cutcrossentropy_install.py --uv | sh
-          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
-            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
-
-      - name: Override with nightly HF packages
-        run: |
-          uv pip install --no-deps \
-            "transformers @ git+https://github.com/huggingface/transformers.git@main" \
-            "peft @ git+https://github.com/huggingface/peft.git@main" \
-            "accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
-            "trl @ git+https://github.com/huggingface/trl.git@main" \
-            "datasets @ git+https://github.com/huggingface/datasets.git@main"
+          pip3 show torch
+          pip3 install --no-build-isolation -U -e .
+          python scripts/unsloth_install.py | sh
+          python scripts/cutcrossentropy_install.py | sh
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt

      - name: Make sure PyTorch version wasn't clobbered
        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"

      - name: Ensure axolotl CLI was installed
        run: |
@@ -103,6 +102,9 @@ jobs:
          pytest -v --durations=10 tests/patched/
          pytest -v --durations=10 tests/cli/

+      - name: cleanup pip cache
+        run: |
+          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

  docker-e2e-tests:
    if: github.repository_owner == 'axolotl-ai-cloud'
@@ -134,6 +136,7 @@ jobs:
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
+            dockerfile: "Dockerfile-uv.jinja"
            nightly_build: "true"
    steps:
      - name: Checkout
@@ -154,7 +157,7 @@ jobs:
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        env:
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -6,19 +6,21 @@ on:
    branches:
      - "main"
    paths:
-      - "**.py"
-      - "pyproject.toml"
-      - ".github/workflows/*.yml"
-      - "cicd/cicd.sh"
-      - "cicd/Dockerfile-uv.jinja"
+      - '**.py'
+      - 'requirements.txt'
+      - '.github/workflows/*.yml'
+      - 'requirements-tests.txt'
+      - 'cicd/cicd.sh'
+      - 'cicd/Dockerfile.jinja'
  pull_request:
-    types: [opened, synchronize, reopened, ready_for_review]
-    paths:
-      - "**.py"
-      - "pyproject.toml"
-      - ".github/workflows/*.yml"
-      - "cicd/cicd.sh"
-      - "cicd/Dockerfile-uv.jinja"
+      types: [opened, synchronize, reopened, ready_for_review]
+      paths:
+       - '**.py'
+       - 'requirements.txt'
+       - '.github/workflows/*.yml'
+       - 'requirements-tests.txt'
+       - 'cicd/cicd.sh'
+       - 'cicd/Dockerfile.jinja'
  workflow_dispatch:

 # Cancel jobs on the same ref if a new one is triggered
@@ -31,7 +33,6 @@ permissions:

 env:
  TRANSFORMERS_IS_CI: "yes"
-  UV_SYSTEM_PYTHON: "1"

 jobs:
  pre-commit:
@@ -43,7 +44,7 @@ jobs:
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
-          cache: "pip" # caching pip dependencies
+          cache: 'pip' # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
        env:
          SKIP: no-commit-to-branch
@@ -72,7 +73,7 @@ jobs:
        exclude:
          - python_version: "3.14"
            pytorch_version: "2.9.1"
-    timeout-minutes: 25
+    timeout-minutes: 20

    steps:
      - name: cleanup node
@@ -93,25 +94,32 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
+          cache: 'pip' # caching pip dependencies

-      - name: Install uv
-        uses: astral-sh/setup-uv@v7
+      - name: upgrade pip
+        run: |
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging==26.0 setuptools==75.8.0 wheel

      - name: Install PyTorch
        run: |
-          uv pip install torch==${{ matrix.pytorch_version }} torchvision
-          uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
+          pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision

      - name: Install dependencies
        run: |
-          uv pip install --no-build-isolation -e . --override /tmp/torch-pin.txt
-          python scripts/cutcrossentropy_install.py --uv | sh
-          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
-            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
+          pip3 show torch
+          pip3 install --no-cache-dir --no-build-isolation -U -e .
+          python scripts/unsloth_install.py | sh
+          python scripts/cutcrossentropy_install.py | sh
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+
+      - name: cleanup pip cache
+        run: |
+          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

      - name: Make sure PyTorch version wasn't clobbered
        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"

      - name: Ensure axolotl CLI was installed
        run: |
@@ -180,27 +188,33 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
+          cache: 'pip' # caching pip dependencies

-      - name: Install uv
-        uses: astral-sh/setup-uv@v7
+      - name: upgrade pip
+        run: |
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging==26.0 setuptools==75.8.0 setuptools_scm build wheel psutil

      - name: Install PyTorch
        run: |
-          uv pip install torch==${{ matrix.pytorch_version }} torchvision
-          uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
+          pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision

      - name: Install dependencies
        run: |
-          uv pip install packaging setuptools_scm build wheel psutil
+          pip3 show torch
          python -m build --no-isolation --sdist
-          uv pip install --no-build-isolation dist/axolotl*.tar.gz --override /tmp/torch-pin.txt
-          python scripts/cutcrossentropy_install.py --uv | sh
-          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
-            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
+          pip3 install --no-cache-dir --no-build-isolation dist/axolotl*.tar.gz
+          python scripts/unsloth_install.py | sh
+          python scripts/cutcrossentropy_install.py | sh
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+
+      - name: cleanup pip cache
+        run: |
+          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

      - name: Make sure PyTorch version wasn't clobbered
        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"

      - name: Ensure axolotl CLI was installed
        run: |
@@ -277,6 +291,7 @@ jobs:
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
+            dockerfile: "Dockerfile-uv.jinja"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -297,7 +312,7 @@ jobs:
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
@@ -359,7 +374,7 @@ jobs:
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "GPU_TYPE=${{ matrix.gpu_type || 'L40S'}}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -26,7 +26,7 @@ axolotl config-schema                  # Dump config JSON schema
 | Method | Config Key | When to Use |
 |--------|-----------|-------------|
 | SFT | *(default)* | Input-output pairs, instruction tuning |
-| DPO/IPO | `rl: dpo` / `rl: dpo, dpo_loss_type: ["ipo"]` | Paired preference data (chosen vs rejected) |
+| DPO/IPO | `rl: dpo` / `rl: ipo` | Paired preference data (chosen vs rejected) |
 | KTO | `rl: kto` | Unpaired binary preference labels |
 | ORPO | `rl: orpo` | Single-stage alignment, no ref model |
 | GRPO | `rl: grpo` | RL with verifiable reward functions (math, code) |
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,6 +1,7 @@
+include requirements.txt
 include README.md
 include LICENSE
-include VERSION
+include src/setuptools_axolotl_dynamic_dependencies.py
 include src/axolotl/utils/chat_templates/templates/*.jinja
 include AGENTS.md
 recursive-include docs/agents *.md
--- a/README.md
+++ b/README.md
@@ -29,9 +29,6 @@

 ## 🎉 Latest Updates

- 2026/04:
-  - New model support has been added in Axolotl for [Mistral Medium 3.5](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/mistral-medium-3_5) and [Gemma 4](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/gemma4).
-  - Axolotl is now [uv-first](https://github.com/axolotl-ai-cloud/axolotl/pull/3545) and has [SonicMoE fused LoRA](https://github.com/axolotl-ai-cloud/axolotl/pull/3519) support.
 - 2026/03:
  - New model support has been added in Axolotl for [Mistral Small 4](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/mistral4), [Qwen3.5, Qwen3.5 MoE](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen3.5), [GLM-4.7-Flash](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm47-flash), [GLM-4.6V](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm46v), and [GLM-4.5-Air](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm45).
  - [MoE expert quantization](https://docs.axolotl.ai/docs/expert_quantization.html) support (via `quantize_moe_experts: true`) greatly reduces VRAM when training MoE models (FSDP2 compat).
@@ -98,11 +95,14 @@ Features:

 ### Installation

-```bash
-# install uv if you don't already have it installed (restart shell after)
-curl -LsSf https://astral.sh/uv/install.sh | sh
+#### Using uv (recommended)

-# change depending on system
+```bash
+# install uv if you don't already have it installed
+curl -LsSf https://astral.sh/uv/install.sh | sh
+source $HOME/.local/bin/env
+
+# CUDA 12.8.1 tends to have better package compatibility
 export UV_TORCH_BACKEND=cu128

 # create a new virtual environment
@@ -112,6 +112,23 @@ source .venv/bin/activate
 uv pip install torch==2.10.0 torchvision
 uv pip install --no-build-isolation axolotl[deepspeed]

+# recommended - install cut-cross-entropy
+uv pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@main"
+
+# (optional) - prefetch flash-attn2 and causal-conv1d kernels
+uv run --python 3.12 python -c "from kernels import get_kernel; get_kernel('kernels-community/flash-attn2'); get_kernel('kernels-community/causal-conv1d')"
+
+# Download example axolotl configs, deepspeed configs
+axolotl fetch examples
+axolotl fetch deepspeed_configs  # OPTIONAL
+```
+
+#### Using pip
+
+```bash
+pip3 install -U packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
+
 # Download example axolotl configs, deepspeed configs
 axolotl fetch examples
 axolotl fetch deepspeed_configs  # OPTIONAL
@@ -121,7 +138,7 @@ axolotl fetch deepspeed_configs  # OPTIONAL

 Installing with Docker can be less error prone than installing in your own environment.
 ```bash
-docker run --gpus '"all"' --ipc=host --rm -it axolotlai/axolotl:main-latest
+docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
 ```

 Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).
--- a/SETUP_MIAAI.md
+++ b/SETUP_MIAAI.md
@@ -1,273 +0,0 @@
-# Axolotl Setup — miaai (RTX 5080, CUDA 13.2)
-
-## System Info
- GPU: NVIDIA RTX 5080 (16GB VRAM, sm_120 / Blackwell)
- Driver: 580.126.09 — max CUDA 13.0 shown by nvidia-smi, but nvcc from conda is 13.2
- OS: Ubuntu 25.10 (Python 3.13 system — do NOT use system Python for ML)
- Axolotl repo: `/home/tocmo0nlord/axolotl` (branch: `activeblue/main`)
- Conda env: `axolotl` at `/opt/miniconda3/envs/axolotl`
-
---
-
-## Starting from Bare Ubuntu 25.10
-
-If rebuilding from scratch, complete these steps first before anything else.
-
-### A. System packages
-```bash
-sudo apt update && sudo apt upgrade -y
-sudo apt install -y \
-  build-essential cmake git curl wget \
-  python3-dev libssl-dev zlib1g-dev \
-  ca-certificates gnupg lsb-release
-```
-
-### B. NVIDIA driver (580.xx)
-Ubuntu 25.10 is too new for NVIDIA's apt repo. Install via ubuntu-drivers:
-```bash
-sudo ubuntu-drivers autoinstall
-sudo reboot
-```
-
-After reboot, verify:
-```bash
-nvidia-smi
-# Must show: NVIDIA GeForce RTX 5080, Driver Version: 580.x
-```
-
-If ubuntu-drivers installs the wrong version, force the right one:
-```bash
-sudo apt install -y nvidia-driver-580
-sudo reboot
-```
-
-### C. Install Ollama
-```bash
-curl -fsSL https://ollama.com/install.sh | sh
-
-# Verify it's running
-systemctl status ollama
-```
-
-### D. HuggingFace CLI
-```bash
-pip3 install huggingface_hub
-huggingface-cli login
-# Paste your HF token — required for gated models like meta-llama
-```
-
-Once steps A–D are done, continue with the One-time Setup below.
-
---
-
-## Pre-Training Checklist (every session)
-
-```bash
-# 1. Stop Ollama — if it receives a request mid-training it will compete for VRAM
-sudo systemctl stop ollama
-
-# 2. Activate conda env
-export PATH="/opt/miniconda3/bin:$PATH"
-conda activate axolotl
-
-# 3. Set env vars
-export CUDA_HOME=$CONDA_PREFIX
-export PATH=$CUDA_HOME/bin:$PATH
-export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
-
-# 4. Confirm GPU is clear (should show no processes before training)
-nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv
-
-# 5. Go to axolotl directory
-cd /home/tocmo0nlord/axolotl
-```
-
-## Run Training
-```bash
-axolotl train ~/human_chat_qlora.yml
-```
-
-## After Training
-```bash
-# Restart Ollama
-sudo systemctl start ollama
-
-# Test the adapter interactively
-axolotl inference ~/human_chat_qlora.yml \
-  --lora-model-dir ~/outputs/llama31-8b-humanchat \
-  --prompter chat
-
-# (Optional) Merge adapter into base model for standalone deployment
-axolotl merge-lora ~/human_chat_qlora.yml
-```
-
---
-
-## One-time Setup (fresh machine — after bare Ubuntu steps above)
-
-### 1. Install Miniconda
-```bash
-wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
-bash miniconda.sh -b -p /opt/miniconda3
-/opt/miniconda3/bin/conda init bash
-source ~/.bashrc
-```
-
-### 2. Create Python 3.11 environment
-```bash
-conda create -n axolotl python=3.11 -y
-conda activate axolotl
-```
-
-### 3. Clone axolotl repo
-```bash
-git clone https://git.activeblue.net/tocmo0nlord/axolotl.git /home/tocmo0nlord/axolotl
-cd /home/tocmo0nlord/axolotl
-git remote add upstream https://github.com/axolotl-ai-cloud/axolotl.git
-git fetch upstream
-git rebase upstream/main        # keeps activeblue patches on top
-```
-
-### 4. Install CUDA toolkit (needed to compile flash-attn and bitsandbytes)
-```bash
-conda install -y -c "nvidia/label/cuda-12.8.0" cuda-toolkit
-export CUDA_HOME=$CONDA_PREFIX
-export PATH=$CUDA_HOME/bin:$PATH
-```
-
-> NOTE: Despite installing from the cuda-12.8.0 channel, conda resolves nvcc to **13.2.78**.
-> This is fine — use cu132 everywhere to match.
-
-### 5. Install PyTorch — use cu132 (matches nvcc from conda)
-```bash
-# torchaudio has no cu132 wheel — skip it, not needed for LLM training
-pip install torch torchvision --index-url https://download.pytorch.org/whl/cu132
-python -c "import torch; print('CUDA:', torch.version.cuda); print('GPU:', torch.cuda.get_device_name(0))"
-```
-
-### 6. Install Axolotl
-```bash
-cd /home/tocmo0nlord/axolotl
-pip install -e "."
-```
-
-### 7. Install flash-attn
-> Compiles CUDA kernels from source — takes 15–25 min on 10 cores of i7-14700K.
-```bash
-MAX_JOBS=10 pip install flash-attn --no-build-isolation
-```
-
-### 8. Compile bitsandbytes from source for sm_120 (RTX 5080 / Blackwell)
-
-Prebuilt wheels do not include sm_120. CUDA 13.2 also dropped sm_50–53.
-Must compile from source with a patched CMakeLists.txt.
-
-```bash
-# Clone bitsandbytes v0.49.1
-git clone --branch v0.49.1 --depth 1 \
-  https://github.com/bitsandbytes-foundation/bitsandbytes.git /tmp/bnb_0491
-
-# Patch CMakeLists.txt: insert sm_120 override before the foreach loop
-# (cmake >= 3.23.0 uses its own built-in arch list which does not include sm_120)
-sed -i '/    foreach(capability \${CMAKE_CUDA_ARCHITECTURES_ALL})/i\    # RTX 5080 sm_120 patch\n    set(CMAKE_CUDA_ARCHITECTURES_ALL 120)' /tmp/bnb_0491/CMakeLists.txt
-
-# Verify patch landed correctly — set() line must appear immediately before foreach
-grep -n "ARCHITECTURES_ALL\|foreach" /tmp/bnb_0491/CMakeLists.txt | tail -5
-
-# Configure — must point cmake at conda's nvcc explicitly
-cmake \
-  -DCMAKE_CUDA_COMPILER=/opt/miniconda3/envs/axolotl/bin/nvcc \
-  -DCOMPUTE_BACKEND=cuda \
-  -S /tmp/bnb_0491 \
-  -B /tmp/bnb_0491/build 2>&1 | grep -E "(Capabilit|CUDA Ver|Error)"
-# Must show: CUDA Capabilities Selected: 120
-
-# Build (adjust -j to your CPU core count)
-cmake --build /tmp/bnb_0491/build -j10
-
-# Install into conda site-packages
-cp -r /tmp/bnb_0491/bitsandbytes \
-  /opt/miniconda3/envs/axolotl/lib/python3.11/site-packages/
-
-# Verify CUDA works
-python3 -c "
-import torch, bitsandbytes as bnb
-x = torch.randn(64, 64, device='cuda')
-l = bnb.nn.Linear8bitLt(64, 64).cuda()
-print('bitsandbytes CUDA OK:', l(x).shape)
-"
-```
-
-### 9. Copy training config to home
-```bash
-cp /home/tocmo0nlord/axolotl/human_chat_qlora.yml ~/human_chat_qlora.yml
-```
-
-### 10. Verify the full stack
-```bash
-python3 -c "
-import torch, bitsandbytes as bnb, flash_attn, transformers
-print('torch      :', torch.__version__, '| CUDA:', torch.version.cuda)
-print('bitsandbytes:', bnb.__version__)
-print('flash_attn :', flash_attn.__version__)
-print('transformers:', transformers.__version__)
-print('GPU        :', torch.cuda.get_device_name(0))
-print('VRAM       :', round(torch.cuda.get_device_properties(0).total_memory/1e9, 1), 'GB')
-"
-```
-
-Expected output:
-```
-torch      : 2.x.x | CUDA: 13.2
-bitsandbytes: 0.50.0.dev0
-flash_attn : 2.x.x
-transformers: 5.x.x
-GPU        : NVIDIA GeForce RTX 5080
-VRAM       : 16.3 GB
-```
-
---
-
-## Training Config — human_chat_qlora.yml
-
-Key settings tuned for RTX 5080 (16GB):
-
-| Setting | Value | Notes |
-|---|---|---|
-| `sequence_len` | `2048` | 4096 OOMs during loss computation (logits x 128k vocab) |
-| `micro_batch_size` | `1` | Effective batch = micro x grad_accum = 8 |
-| `gradient_accumulation_steps` | `8` | Keeps effective batch size at 8 |
-| `adapter` | `qlora` | 4-bit via bitsandbytes compiled from source |
-| `attn_implementation` | `flash_attention_2` | Not the deprecated `flash_attention: true` |
-| `type` (datasets) | `chat_template` | Not the deprecated `sharegpt` |
-
-Expected training metrics (RTX 5080, ~65k samples, 2 epochs):
- VRAM: ~10–11 GB active, ~11 GB allocated
- Training duration: ~3.5 hours
- Initial eval loss: ~0.81, perplexity ~2.25
- Final loss target: ~0.55–0.60
-
-To push VRAM to ~14GB and improve training: set `micro_batch_size: 2` and `gradient_accumulation_steps: 4`.
-
---
-
-## Common Pitfalls
-
-| Problem | Cause | Fix |
-|---|---|---|
-| `externally-managed-environment` | System Python 3.13 blocks pip | Use conda env, never system pip |
-| `No module named torch` (flash-attn) | pip builds in isolated env | Use `--no-build-isolation` |
-| `CUDA_HOME not set` | CUDA toolkit not installed | `conda install cuda-toolkit` from nvidia channel |
-| `CUDA version mismatch 13.2 vs 12.8` | Conda nvcc is 13.2, torch was cu128 | Reinstall torch with `--index-url .../cu132` |
-| `torchaudio` not found for cu132 | No cu132 wheel exists | Skip torchaudio — not needed |
-| flash-attn compile is slow | Single-threaded by default | Set `MAX_JOBS=<cpu_count>` before pip install |
-| `nvcc fatal: Unsupported gpu architecture 'compute_50'` | bitsandbytes CMakeLists.txt hardcodes sm_50; CUDA 13.2 dropped it | Patch CMakeLists.txt (see step 8 above) |
-| `CUDA Capabilities Selected: 50;52;...` ignores -D flag | cmake >= 3.23 built-in arch list lacks sm_120; CMakeLists.txt overrides -D | Insert `set(CMAKE_CUDA_ARCHITECTURES_ALL 120)` before foreach loop |
-| `BackendUnavailable: scikit_build_core` | pip install of bnb triggers cmake rebuild | Copy .so directly to site-packages instead |
-| `torch.OutOfMemoryError` during eval | logits tensor (batch x 4096 x 128k vocab) too large | Set `sequence_len: 2048`, `micro_batch_size: 1` |
-| `type: sharegpt` deprecation warning | axolotl removed sharegpt type | Use `type: chat_template` with field mappings |
-| `flash_attention: true` deprecation | Old config key removed | Use `attn_implementation: flash_attention_2` |
-| Capybara dataset `field_messages null` | Capybara uses input/output format, not conversations | Switch to SlimOrca or OpenHermes-2.5 |
-| Ollama loads model mid-training | Ollama is enabled and receives a request | `sudo systemctl stop ollama` before training |
-| Training much slower than eval speed | The fast it/s on screen is the eval loop (forward only) | Normal — training includes backward pass and optimizer (~3.5h total) |
-| ubuntu-drivers installs wrong NVIDIA version | Multiple driver candidates available | Force with `apt install nvidia-driver-580` |
--- a/2
+++ b/2
@@ -1 +1 @@
-0.16.2.dev0
+0.16.0.dev0
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -134,6 +134,7 @@ quartodoc:
        - monkeypatch.stablelm_attn_hijack_flash
        - monkeypatch.trainer_fsdp_optim
        - monkeypatch.transformers_fa_utils
+        - monkeypatch.unsloth_
        - monkeypatch.data.batch_dataset_fetcher
        - monkeypatch.mixtral
        - monkeypatch.gradient_checkpointing.offload_cpu
@@ -311,7 +312,6 @@ website:
            - docs/dataset_loading.qmd
            - docs/qat.qmd
            - docs/quantize.qmd
-            - docs/1_58bit_finetuning.qmd
            - docs/optimizations.qmd

        - section: "Core Concepts"
@@ -327,6 +327,7 @@ website:
        - section: "Advanced Features"
          contents:
            - docs/fsdp_qlora.qmd
+            - docs/unsloth.qmd
            - docs/torchao.qmd
            - docs/custom_integrations.qmd
            - docs/sequence_parallelism.qmd
--- a/cicd/Dockerfile-uv.jinja
+++ b/cicd/Dockerfile-uv.jinja
@@ -22,6 +22,15 @@ WORKDIR /workspace/axolotl
 RUN git fetch origin +$GITHUB_REF && \
    git checkout FETCH_HEAD

+# For nightly builds, point the HF packages in requirements.txt at their main branches
+RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
+        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
+        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
+        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
+        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
+        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
+    fi
+
 RUN uv pip install packaging==26.0 setuptools==78.1.1
 RUN uv pip install torchvision
 RUN uv pip uninstall causal_conv1d
@@ -31,21 +40,11 @@ RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
    fi

-# Override with nightly HF packages for nightly builds
-RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
-        uv pip install --no-deps \
-            "transformers @ git+https://github.com/huggingface/transformers.git@main" \
-            "peft @ git+https://github.com/huggingface/peft.git@main" \
-            "accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
-            "trl @ git+https://github.com/huggingface/trl.git@main" \
-            "datasets @ git+https://github.com/huggingface/datasets.git@main"; \
-    fi
-
+RUN python scripts/unsloth_install.py --uv | sh
 RUN python scripts/cutcrossentropy_install.py --uv | sh

 # So we can test the Docker image
-RUN uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
-    codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
+RUN uv pip install -r requirements-dev.txt -r requirements-tests.txt

 # fix so that git fetch/pull from remote works
 RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -0,0 +1,54 @@
+FROM axolotlai/axolotl-base:{{ BASE_TAG }}
+
+ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
+ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
+ENV CUDA="{{ CUDA }}"
+ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
+ENV GITHUB_REF="{{ GITHUB_REF }}"
+ENV GITHUB_SHA="{{ GITHUB_SHA }}"
+ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
+ENV HF_HOME="{{ HF_HOME }}"
+ENV AXOLOTL_DATASET_NUM_PROC="8"
+
+RUN apt-get update && \
+    apt-get install -y --allow-change-held-packages vim curl nano zstd libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm
+
+WORKDIR /workspace
+
+RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
+
+WORKDIR /workspace/axolotl
+
+RUN git fetch origin +$GITHUB_REF && \
+    git checkout FETCH_HEAD
+
+# For nightly builds, point the HF packages in requirements.txt at their main branches
+RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
+        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
+        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
+        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
+        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
+        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
+    fi
+
+RUN pip install packaging==26.0 setuptools==78.1.1 psutil
+RUN pip uninstall -y causal_conv1d
+RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+    else \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
+    fi
+
+RUN python scripts/unsloth_install.py | sh
+RUN python scripts/cutcrossentropy_install.py | sh
+
+# So we can test the Docker image
+RUN pip install -r requirements-dev.txt -r requirements-tests.txt
+
+# fix so that git fetch/pull from remote works
+RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
+    git config --get remote.origin.fetch
+
+# helper for huggingface-login cli
+RUN git config --global credential.helper store
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 set -e

-python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__, f'Expected torch $PYTORCH_VERSION but got {torch.__version__}'"
+python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"

 set -o pipefail
 for i in 1 2 3; do
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -17,7 +17,7 @@ template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
 template_env = jinja2.Environment(
    loader=template_loader, autoescape=select_autoescape()
 )
-dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile-uv.jinja")
+dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
 df_template = template_env.get_template(dockerfile)

 df_args = {
--- a/cicd/single_gpu.py
+++ b/cicd/single_gpu.py
@@ -16,7 +16,7 @@ template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
 template_env = jinja2.Environment(
    loader=template_loader, autoescape=select_autoescape()
 )
-dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile-uv.jinja")
+dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
 df_template = template_env.get_template(dockerfile)

 df_args = {
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -24,15 +24,16 @@ WORKDIR /workspace/axolotl
 # If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
 RUN pip uninstall -y causal_conv1d
 RUN if [ "$TARGETARCH" = "arm64" ]; then \
-        BASE_EXTRAS="optimizers,ray"; \
+        BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \
    else \
-        BASE_EXTRAS="deepspeed,optimizers,ray"; \
+        BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \
    fi && \
    if [ "$AXOLOTL_EXTRAS" != "" ]; then \
        pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
        pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
    fi && \
+    python scripts/unsloth_install.py | sh && \
    python scripts/cutcrossentropy_install.py | sh && \
    pip install pytest && \
    pip cache purge
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -58,3 +58,19 @@ RUN git lfs install --skip-repo && \
    # The base image ships with `pydantic==1.8.2` which is not working
    pip3 install -U --no-cache-dir pydantic==1.10.10 && \
    pip3 cache purge
+
+# Map Python version (e.g., 3.12 -> cp312)
+RUN PYTHON_CP="cp$(echo $PYTHON_VERSION | tr -d '.')" && \
+    # Map PyTorch version (e.g., 2.9.1 -> torch2.9, 2.10.0 -> torch2.10)
+    TORCH_TAG="torch$(echo $PYTORCH_VERSION | grep -oP '^\d+\.\d+')" && \
+    # Map architecture
+    case "$TARGETARCH" in \
+        amd64) ARCH_TAG="x86_64" ;; \
+        arm64) ARCH_TAG="aarch64" ;; \
+        *) echo "Unsupported architecture: $TARGETARCH"; exit 1 ;; \
+    esac && \
+    WHL_VERSION="v0.7.16" && \
+    WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-linux_${ARCH_TAG}.whl" && \
+    wget -nv "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}" && \
+    pip3 install --no-cache-dir "${WHL_FILE}" && \
+    rm "${WHL_FILE}"
--- a/docker/Dockerfile-base-next
+++ b/docker/Dockerfile-base-next
@@ -1,15 +1,16 @@
-ARG CUDA_VERSION="12.8.2"
+ARG CUDA_VERSION="12.8.1"
+ARG CUDNN_VERSION="8"
 ARG UBUNTU_VERSION="22.04"
 ARG MAX_JOBS=4

-FROM nvidia/cuda:12.8.2-devel-ubuntu22.04 AS base-builder
+FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder

-ENV PATH="/root/miniforge3/bin:${PATH}"
+ENV PATH="/root/miniconda3/bin:${PATH}"

 ARG PYTHON_VERSION="3.11"
 ARG PYTORCH_VERSION="next"
 ARG CUDA="128"
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0 12.0+PTX"
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

 ENV PYTHON_VERSION=$PYTHON_VERSION
 ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
@@ -17,13 +18,13 @@ ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
 RUN apt-get update \
    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
    && wget \
-    https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh \
+    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
    && mkdir /root/.conda \
-    && bash Miniforge3-Linux-x86_64.sh -b \
-    && rm -f Miniforge3-Linux-x86_64.sh \
-    && /root/miniforge3/bin/conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
+    && bash Miniconda3-latest-Linux-x86_64.sh -b \
+    && rm -f Miniconda3-latest-Linux-x86_64.sh \
+    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"

-ENV PATH="/root/miniforge3/envs/py${PYTHON_VERSION}/bin:${PATH}"
+ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

 WORKDIR /workspace

--- a/docker/Dockerfile-tests
+++ b/docker/Dockerfile-tests
@@ -24,9 +24,9 @@ RUN git fetch origin +$GITHUB_REF && \

 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install --no-build-isolation -e .[deepspeed,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
-        pip install --no-build-isolation -e .[deepspeed,mamba-ssm] $AXOLOTL_ARGS; \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
    fi

 # So we can test the Docker image
--- a/docker/Dockerfile-uv
+++ b/docker/Dockerfile-uv
@@ -24,15 +24,16 @@ WORKDIR /workspace/axolotl
 # If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
 RUN uv pip uninstall causal_conv1d
 RUN if [ "$TARGETARCH" = "arm64" ]; then \
-        BASE_EXTRAS="optimizers,ray"; \
+        BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \
    else \
-        BASE_EXTRAS="deepspeed,optimizers,ray"; \
+        BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \
    fi && \
    if [ "$AXOLOTL_EXTRAS" != "" ]; then \
        uv pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
        uv pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
    fi && \
+    python scripts/unsloth_install.py --uv | sh && \
    python scripts/cutcrossentropy_install.py --uv | sh && \
    uv pip install pytest && \
    uv cache clean
--- a/docker/Dockerfile-uv-base
+++ b/docker/Dockerfile-uv-base
@@ -38,3 +38,20 @@ RUN uv pip install packaging setuptools wheel psutil \
 RUN if [ "$TARGETARCH" = "amd64" ]; then \
        MAMBA_SKIP_CUDA_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE uv pip install --no-build-isolation mamba_ssm causal_conv1d; \
    fi
+
+# Map Python version (e.g., 3.12 -> cp312)
+RUN PYTHON_CP="cp$(echo $PYTHON_VERSION | tr -d '.')" && \
+    # Map PyTorch version (e.g., 2.9.1 -> torch2.9, 2.10.0 -> torch2.10)
+    TORCH_TAG="torch$(echo $PYTORCH_VERSION | grep -oP '^\d+\.\d+')" && \
+    LINUX_TAG="manylinux_" && \
+    # Map architecture
+    case "$TARGETARCH" in \
+        amd64) ARCH_TAG="2_24_x86_64.manylinux_2_28_x86_64" ;; \
+        arm64) ARCH_TAG="2_34_aarch64" ;; \
+        *) echo "Unsupported architecture: $TARGETARCH"; exit 1 ;; \
+    esac && \
+    WHL_VERSION="v0.7.16" && \
+    WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-${LINUX_TAG}${ARCH_TAG}.whl" && \
+    wget -nv "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}" && \
+    uv pip install --no-cache-dir "${WHL_FILE}" && \
+    rm "${WHL_FILE}"
--- a/docs/1_58bit_finetuning.qmd
+++ b/docs/1_58bit_finetuning.qmd
@@ -1,70 +0,0 @@
----
-title: "1.58-bit Finetuning"
-back-to-top-navigation: true
-toc: true
-toc-expand: 2
-toc-depth: 4
----
-
-## Overview
-
-1.58-bit finetuning allows you to finetune BitNet models when their prequantized weights are provided. In theory, it will be possible to fine-tune any LLM in 1.58bit format but the performance degradation will be dramatic.
-
-Axolotl supports 1.58-bit finetuning via the [`onebitllms`](https://github.com/tiiuae/onebitllms) library, which replaces standard linear layers with BitNet-compatible counterparts ready to use for training.
-
-::: {.callout-note}
-LoRA is not supported for BitNet models
-:::
-
-## Installation
-
-Install the `onebitllms` package before using this feature:
-
-```bash
-uv pip install onebitllms
-```
-
-Or from source:
-
-```bash
-uv pip install git+https://github.com/tiiuae/onebitllms
-```
-
-## Supported models
-
-For now, only `Falcon-E` series of models are supported. Make sure to use their `-prequantized` version:
-
-```bash
-tiiuae/Falcon-E-3B-Base-prequantized
-tiiuae/Falcon-E-1B-Base-prequantized
-```
-
-In theory, any other model would 'work' but the performance degradation will be huge. This remains an area of exploration.
-
-## Configuration
-
-To enable 1.58-bit finetuning, set the following in your configuration file:
-
-```yaml
-base_model: tiiuae/Falcon-E-3B-Base-prequantized  # A BitNet-compatible model
-
-use_onebitllms: true
-```
-
-::: {.callout-note}
-For BitNet models, it is recommended to use a higher learning rate than classic models (usually in the order of magnitude of 10x).
-:::
-
-## Considerations after training
-
-Once your model has been trained with 1.58bit fine-tuning, you can convert the trained model in ternary format using the `onebitllms` CLI:
-
-```bash
-onebitllms quantize_to_1bit INPUT_PATH OUTPUT_PATH
-```
-
-After that, you can use supported packages such as `llama.cpp` or Apple MLX package to run the trained model.
-
-## Example Configuration
-
-You can find example configurations in `examples/falcon-e` which contain one configuration for SFT and one configuration for DPO.
--- a/docs/agents/new_model_support.md
+++ b/docs/agents/new_model_support.md
@@ -121,11 +121,11 @@ Older models that use `_prepare_4d_causal_attention_mask` (Llama, Mistral, Qwen2

 | Backend | Config | head_dim limit | torch_compile | Notes |
 |---------|--------|---------------|---------------|-------|
-| FA2 | `attn_implementation: flash_attention_2` | 256 | ✅ | Fastest when supported |
-| FA4 | auto with `attn_implementation: flash_attention_2` | 256 (SM90+) | ✅ | Auto-detected on H100+ |
-| SDPA | `attn_implementation: sdpa` | None | ✅ | Universal fallback |
-| flex | `attn_implementation: flex_attention` | None | ⚠️ Triton OOM for large head_dim | Good for variable head dims |
-| eager | `attn_implementation: eager` | None | ✅ | Slowest, always works |
+| FA2 | `flash_attention: true` | 256 | ✅ | Fastest when supported |
+| FA4 | auto with `flash_attention: true` | 256 (SM90+) | ✅ | Auto-detected on H100+ |
+| SDPA | `sdp_attention: true` | None | ✅ | Universal fallback |
+| flex | `flex_attention: true` | None | ⚠️ Triton OOM for large head_dim | Good for variable head dims |
+| eager | no attention flag set | None | ✅ | Slowest, always works |

 **Check model support**: Look at `_supports_flash_attn_2`, `_supports_flex_attn`, `_supports_sdpa` attributes on the model class.

--- a/docs/agents/preference_tuning.md
+++ b/docs/agents/preference_tuning.md
@@ -38,7 +38,7 @@ No vLLM server needed (unlike GRPO). Offline RL with pre-collected preference da

 1. Paired preference data (chosen + rejected)?
   - Default → `rl: dpo`
-   - Overfitting → `rl: dpo, dpo_loss_type: ["ipo"]`
+   - Overfitting → `rl: ipo`
   - VRAM-limited → `rl: orpo` (no ref model)
   - Length-sensitive → `rl: simpo` (no ref model)
 2. Only binary labels (good/bad)? → `rl: kto`
--- a/docs/agents/sft.md
+++ b/docs/agents/sft.md
@@ -83,7 +83,7 @@ Watch for: loss never decreasing (check `train_on_inputs`, dataset, LR), loss go
 | Issue | Fix |
 |-------|-----|
 | OOM during training | Reduce `micro_batch_size`, enable `gradient_checkpointing`, reduce `sequence_len` |
-| `sample_packing` + SDPA + bf16 = 0.0 loss | Use `attn_implementation: flash_attention_2` or disable `sample_packing` |
+| `sample_packing` + SDPA + bf16 = 0.0 loss | Use `flash_attention: true` or disable `sample_packing` |
 | Missing chat template error | Set `chat_template: chatml` explicitly |
 | Label masking wrong | Run `axolotl preprocess config.yaml --debug` and inspect labels |
 | Loss NaN | Use `bf16: auto`, lower LR, check data for empty samples |
--- a/docs/attention.qmd
+++ b/docs/attention.qmd
@@ -3,71 +3,28 @@ title: Attention
 description: Supported attention modules in Axolotl
 ---

-Axolotl routes attention via a single config field:
+## SDP Attention
+
+This is the default built-in attention in PyTorch.

 ```yaml
-attn_implementation: <backend>
+sdp_attention: true
 ```

-`attn_implementation` is passed through to `transformers` verbatim (via
-`model.config._attn_implementation`). Accepted values are the HF-native
-backends, axolotl-registered backends, or a hub-kernel path.
+For more details: [PyTorch docs](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)

-## Backends
+## Flash Attention

-| `attn_implementation` | Description |
-|---|---|
-| `eager` | Plain PyTorch attention. No packing support. |
-| `sdpa` | PyTorch `scaled_dot_product_attention`. No packing support. |
-| `flash_attention_2` | Dao-AILab Flash Attention 2. |
-| `flash_attention_3` | Dao-AILab Flash Attention 3 (Hopper+). |
-| `flex_attention` | Torch Flex Attention (requires torch ≥ 2.6). |
-| `xformers` | xFormers memory-efficient attention. |
-| `sage` | SageAttention (QK int8 / PV fp16). |
-| `s2` | Shifted-Sparse Attention (LLaMA only, FA2 under the hood). |
-| `fp8` | torchao FP8 low-precision attention (requires SM90+, torch ≥ 2.11). Loaded as SDPA and patched post-load. |
-| `kernels-community/flash-attn3` | HF hub FA3 kernel. |
-| `kernels-community/sage-attention` | HF hub SageAttention kernel. |
-| Other `<org>/<name>` path | Any hub-kernel path supported by `transformers`. |
-
-Short-form aliases (`flash`, `fa2`, `flex`, `sdp`, etc.) are **not accepted** —
-set the canonical name above.
-
-### Capability flags
-
-Axolotl derives three boolean capability flags from `attn_implementation` and
-exposes them on the validated config:
-
-- `cfg.attn_supports_packing` — backend supports varlen sample packing via
-  `position_ids`. Gates multipack patches and `sample_packing_drop_attention_mask`.
-- `cfg.attn_uses_flash_lib` — backend needs the `flash_attn` (Dao-AILab)
-  monkeypatches (FA4 auto, LLaMA flash hijack, ring-FA).
-- `cfg.attn_needs_dtype_cast` — backend requires fp16/bf16 embeddings
-  (everything except `eager` and `sdpa`).
-
-These are **computed** — they cannot be overridden from YAML.
-
-## Per-backend notes
-
-### SDPA
-
-Default PyTorch attention. See
-[PyTorch docs](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html).
+Axolotl supports Flash Attention 2, 3, and 4. The best available version is used automatically
+based on your installed packages and GPU.

 ```yaml
-attn_implementation: sdpa
+flash_attention: true
 ```

-### Flash Attention
+For more details: [Flash Attention](https://github.com/Dao-AILab/flash-attention/)

-Axolotl supports FA2, FA3, and FA4. The best available version is used
-automatically based on your installed packages and GPU.
-
-```yaml
-attn_implementation: flash_attention_2  # or flash_attention_3
-```
-
-#### Flash Attention 2
+### Flash Attention 2

 Requirements: Ampere, Ada, or Hopper GPUs (Turing or lower not supported)

@@ -82,25 +39,23 @@ Alternatively, try reinstall or downgrade a version.

 :::

-#### Flash Attention 3
+### Flash Attention 3

 Requirements: Hopper only and CUDA 12.8 (recommended)

 ```bash
 git clone https://github.com/Dao-AILab/flash-attention.git
 cd flash-attention/hopper
+
 python setup.py install
 ```

-#### Flash Attention 4
+### Flash Attention 4

-Requirements: Hopper or Blackwell GPUs. Auto-applied when `attn_uses_flash_lib`
-is true and FA4 is importable.
-
-FA4 is still a pre-release on PyPI, so `--pre` is required:
+Requirements: Hopper or Blackwell GPUs

 ```bash
-pip install --pre flash-attn-4
+pip install flash-attn-4
 ```

 Or from source:
@@ -108,6 +63,7 @@ Or from source:
 ```bash
 git clone https://github.com/Dao-AILab/flash-attention.git
 cd flash-attention/flash_attn/cute
+
 pip install -e .

 # FA2's flash_attn package includes a cute/ stub that shadows FA4.
@@ -130,113 +86,93 @@ and falls back to FA2/3.

 :::

+For more details: [flash-attention/flash_attn/cute](https://github.com/Dao-AILab/flash-attention/tree/main/flash_attn/cute)
+
 ### AMD

-Requirements: ROCm 6.0 and above. See
-[Flash Attention AMD docs](https://github.com/Dao-AILab/flash-attention/tree/main?tab=readme-ov-file#amd-rocm-support).
+Requirements: ROCm 6.0 and above.

-### Flex Attention
+See [Flash Attention AMD docs](https://github.com/Dao-AILab/flash-attention/tree/main?tab=readme-ov-file#amd-rocm-support).
+
+## Flex Attention
+
+A flexible PyTorch API for attention used in combination with `torch.compile`.

 ```yaml
-attn_implementation: flex_attention
-torch_compile: true  # recommended
+flex_attention: true
+
+# recommended
+torch_compile: true
 ```

-Requires torch ≥ 2.6. See [PyTorch docs](https://pytorch.org/blog/flexattention/).
+::: {.callout-note}

-### SageAttention
+We recommend using the latest stable version of PyTorch for best performance.

-Requirements: Ampere, Ada, or Hopper GPUs.
+:::
+
+For more details: [PyTorch docs](https://pytorch.org/blog/flexattention/)
+
+## SageAttention
+
+Attention kernels with QK Int8 and PV FP16 accumulator.

 ```yaml
-attn_implementation: sage
+sage_attention: true
 ```

+Requirements: Ampere, Ada, or Hopper GPUs
+
 ```bash
 pip install sageattention==2.2.0 --no-build-isolation
 ```

 ::: {.callout-warning}

-Only LoRA/QLoRA recommended. Full finetuning has been observed to drop loss to 0. See
-[GitHub Issue](https://github.com/thu-ml/SageAttention/issues/198).
+Only LoRA/QLoRA is recommended at the moment. We observed the loss dropping to 0 with full finetuning. See [GitHub Issue](https://github.com/thu-ml/SageAttention/issues/198).

 :::

-For more details: [Sage Attention](https://github.com/thu-ml/SageAttention).
+For more details: [Sage Attention](https://github.com/thu-ml/SageAttention)

-### xFormers
+::: {.callout-note}
+
+We do not support SageAttention 3 at the moment. If you are interested in adding it or improving the SageAttention implementation, please open an issue.
+
+:::
+
+
+## xFormers

 ```yaml
-attn_implementation: xformers
+xformers_attention: true
 ```

 ::: {.callout-tip}

-Recommended for Turing GPUs or below (e.g. Colab T4).
+We recommend using this on Turing GPUs or older (such as on Colab).

 :::

-### Shifted Sparse Attention
+For more details: [xFormers](https://github.com/facebookresearch/xformers)
+
+## Shifted Sparse Attention

 ::: {.callout-warning}

-Planned for deprecation. Prefer one of the backends above.
+We plan to deprecate this! If you use this feature, we recommend switching to one of the methods above.

 :::

-Requirements: LLaMA model architecture. Loaded as FA2 under the hood and
-patched to implement shifted-sparse attention. Does not support sample packing.
+Requirements: LLaMA model architecture

 ```yaml
-attn_implementation: s2
+flash_attention: true
+s2_attention: true
 ```

-### FP8
+::: {.callout-tip}

-torchao low-precision attention. Loaded as SDPA and patched post-load.
-
-Requirements: SM90+ (Hopper/Blackwell), PyTorch ≥ 2.11, torchao ≥ 0.17,
-flash-attn with FA3. KV caching must be disabled.
-
-```yaml
-attn_implementation: fp8
-```
-
-### Hub kernels
-
-```yaml
-attn_implementation: kernels-community/flash-attn3
-```
-
-Passed through to `transformers`; axolotl does not install the kernel itself.
-For recognized hub paths the capability flags are set automatically; for
-arbitrary paths axolotl uses conservative defaults (`attn_supports_packing=False`,
-`attn_uses_flash_lib=False`).
-
-## Migrating from legacy boolean flags
-
-The following legacy config fields are **deprecated** and will be removed in a
-future release. Each emits a `DeprecationWarning` when set and is stripped from
-the validated config.
-
-| Legacy | Canonical |
-|---|---|
-| `flash_attention: true` | `attn_implementation: flash_attention_2` |
-| `sdp_attention: true` | `attn_implementation: sdpa` |
-| `xformers_attention: true` | `attn_implementation: xformers` |
-| `flex_attention: true` | `attn_implementation: flex_attention` |
-| `sage_attention: true` | `attn_implementation: sage` |
-| `s2_attention: true` | `attn_implementation: s2` |
-| `eager_attention: true` | `attn_implementation: eager` |
-
-Combining `attn_implementation` with a legacy flag (e.g. `attn_implementation:
-flash_attention_2` **and** `flash_attention: true`) raises — pick one.
-
-::: {.callout-note}
-
-Existing example configs under `examples/` still use the legacy flags. They
-continue to work with a deprecation warning; they will be migrated in a
-follow-up pass.
+No sample packing support!

 :::
--- a/docs/debugging.qmd
+++ b/docs/debugging.qmd
@@ -76,10 +76,8 @@ datasets:
 Make sure you have an [editable install](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) of Axolotl, which ensures that changes you make to the code are reflected at runtime.  Run the following commands from the root of this project:

 ```bash
-export UV_TORCH_BACKEND=cu128  # or cu130
-uv venv --no-project --relocatable
-source .venv/bin/activate
-uv pip install --no-build-isolation -e '.[deepspeed]' --group dev --group test
+pip3 install packaging
+pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
 ```

 #### Remote Hosts
@@ -210,18 +208,17 @@ cd axolotl
 Next, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:[^2]

 ```bash
-docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl-uv:main-latest
+docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-py3.10-cu118-2.0.1
 ```

 >[!Tip]
 > To understand which containers are available, see the [Docker section of the README](../README.md#docker) and the [DockerHub repo](https://hub.docker.com/r/axolotlai/axolotl/tags).  For details of how the Docker containers are built, see axolotl's [Docker CI builds](../.github/workflows/main.yml).

-You will now be in the container.  Next, install Axolotl with dev dependencies:
+You will now be in the container.  Next, perform an editable install of Axolotl:

 ```bash
-uv venv --no-project --relocatable
-source .venv/bin/activate
-uv pip install --no-build-isolation -e '.[deepspeed]' --group dev --group test
+pip3 install packaging
+pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
 ```

 ### Attach To Container
--- a/docs/docker.qmd
+++ b/docs/docker.qmd
@@ -6,33 +6,23 @@ format:
    toc-depth: 4
 ---

-This section describes the different Docker images that are released by AxolotlAI at
-[Docker Hub](https://hub.docker.com/u/axolotlai).
+This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).

 ::: {.callout-important}
-### Switch to the `-uv` images
-
-Each image below ships a **uv variant** that uses [uv](https://docs.astral.sh/uv/) with a relocatable venv
-(`/workspace/axolotl-venv`) instead of Miniconda + pip. Append `-uv` to the image name
-(e.g. `axolotlai/axolotl-uv`, `axolotlai/axolotl-base-uv`, `axolotlai/axolotl-cloud-uv`). Tags follow the
-same format as their non-uv counterparts.
-
-**We recommend switching to the `-uv` images early.** In the near future we will publish the uv-based
-build to the non-uv tags as well. The non-uv names will continue to work, but they will start serving
-the uv image.
+For Blackwell GPUs, please use the tags with PyTorch 2.7.1 and CUDA 12.8.
 :::

 ## Base

-The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image.
-It includes python, torch, git, git-lfs, awscli, pydantic, and more.
+The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.

 #### Image

-| Variant | Image | Docker Hub |
-|---------|-------|------------|
-| pip | `axolotlai/axolotl-base` | [Link](https://hub.docker.com/r/axolotlai/axolotl-base) |
-| uv | `axolotlai/axolotl-base-uv` | [Link](https://hub.docker.com/r/axolotlai/axolotl-base-uv) |
+```
+axolotlai/axolotl-base
+```
+
+Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-base)

 #### Tags format

@@ -42,10 +32,8 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}

 Tags examples:

+- `main-base-py3.11-cu128-2.8.0`
 - `main-base-py3.11-cu128-2.9.1`
-- `main-base-py3.12-cu128-2.10.0`
-- `main-base-py3.12-cu130-2.9.1`
-- `main-base-py3.12-cu130-2.10.0`

 ## Main

@@ -53,10 +41,11 @@ The main image is the image that is used to run Axolotl. It is based on the `axo

 #### Image

-| Variant | Image | Docker Hub |
-|---------|-------|------------|
-| pip | `axolotlai/axolotl` | [Link](https://hub.docker.com/r/axolotlai/axolotl) |
-| uv | `axolotlai/axolotl-uv` | [Link](https://hub.docker.com/r/axolotlai/axolotl-uv) |
+```
+axolotlai/axolotl
+```
+
+Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl)

 #### Tags format {#sec-main-tags}

@@ -64,7 +53,7 @@ The main image is the image that is used to run Axolotl. It is based on the `axo
 # on push to main
 main-py{python_version}-cu{cuda_version}-{pytorch_version}

-# latest main (currently torch 2.9.1, python 3.11, cuda 12.8)
+# latest main (currently torch 2.6.0, python 3.11, cuda 12.4)
 main-latest

 # nightly build
@@ -82,13 +71,12 @@ There may be some extra tags appended to the image, like `-vllm` which installs

 Tags examples:

+- `main-py3.11-cu128-2.8.0`
 - `main-py3.11-cu128-2.9.1`
- `main-py3.12-cu128-2.10.0`
- `main-py3.12-cu130-2.9.1`
- `main-py3.12-cu130-2.10.0`
 - `main-latest`
- `main-20260315-py3.11-cu128-2.9.1`
- `0.16.1`
+- `main-20250303-py3.11-cu124-2.6.0`
+- `main-20250303-py3.11-cu126-2.6.0`
+- `0.12.0`

 ## Cloud

@@ -102,10 +90,11 @@ Jupyter lab is run by default. Set `JUPYTER_DISABLE=1` in the environment variab

 #### Image

-| Variant | Image | Docker Hub |
-|---------|-------|------------|
-| pip | `axolotlai/axolotl-cloud` | [Link](https://hub.docker.com/r/axolotlai/axolotl-cloud) |
-| uv | `axolotlai/axolotl-cloud-uv` | [Link](https://hub.docker.com/r/axolotlai/axolotl-cloud-uv) |
+```
+axolotlai/axolotl-cloud
+```
+
+Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-cloud)

 #### Tags format

--- a/docs/ebft.qmd
+++ b/docs/ebft.qmd
@@ -129,7 +129,7 @@ gradient_accumulation_steps: 4
 max_steps: 20
 learning_rate: 5.0e-6
 bf16: auto
-attn_implementation: flash_attention_2
+flash_attention: true
 gradient_checkpointing: true
 output_dir: ./outputs/ebft-quickstart
 ```
@@ -304,7 +304,7 @@ lora_alpha: 32
 lora_target_linear: true

 bf16: auto
-attn_implementation: flex_attention
+flex_attention: true
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: true          # Required with flex_attention
--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -57,7 +57,7 @@ description: Frequently asked questions

 **Q: vLLM is not working with Axolotl**

-> A: We currently recommend torch 2.10 for use with `vllm`. Please ensure you use the right version. For Docker, please use the `main-py3.12-cu128-2.10.0` tag (note: torch 2.10 images are built with Python 3.12).
+> A: We currently recommend torch 2.6.0 for use with `vllm`. Please ensure you use the right version. For Docker, please use the `main-py3.11-cu124-2.6.0` tag.

 **Q: FA2 2.8.0 `undefined symbol` runtime error on CUDA 12.4**

--- a/docs/grpo.qmd
+++ b/docs/grpo.qmd
@@ -154,7 +154,7 @@ lr_scheduler: cosine
 warmup_steps: 10

 bf16: true
-attn_implementation: flash_attention_2
+flash_attention: true
 gradient_checkpointing: true

 special_tokens:
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -15,30 +15,64 @@ This guide covers all the ways you can install and set up Axolotl for your envir

 - NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
 - Python ≥3.11
- PyTorch ≥2.9.1
+- PyTorch ≥2.6.0

-## Installation {#sec-installation}
+## Installation Methods {#sec-installation-methods}
+
+::: {.callout-important}
+Please make sure to have PyTorch installed before installing Axolotl in your local environment.
+
+Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
+:::

 ::: {.callout-important}
 For Blackwell GPUs, please use Pytorch 2.9.1 and CUDA 12.8.
 :::

-### Quick Install {#sec-uv}
+### PyPI Installation (Recommended) {#sec-pypi}

-Axolotl uses [uv](https://docs.astral.sh/uv/) as its package manager. uv is a fast, reliable Python package installer and resolver built in Rust.
+```{.bash}
+pip3 install -U packaging setuptools wheel ninja
+pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
+```

-Install uv if not already installed:
+We use `--no-build-isolation` so that the build can detect the PyTorch version that
+is already installed (if any) instead of clobbering it, and so that it selects the
+correct versions of dependencies that are specific to that PyTorch version or to
+other installed co-dependencies.
+
+### uv Installation {#sec-uv}
+
+uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments.
+
+Install uv if not already installed:
 ```{.bash}
 curl -LsSf https://astral.sh/uv/install.sh | sh
 source $HOME/.local/bin/env
 ```

-Choose your CUDA version (e.g. `cu128`, `cu130`), create a venv, and install:
+Choose the CUDA version to use with PyTorch (e.g. `cu124`, `cu126`, `cu128`),
+then create and activate the venv:
 ```{.bash}
-export UV_TORCH_BACKEND=cu128  # or cu130
-uv venv
+export UV_TORCH_BACKEND=cu126
+uv venv --no-project --relocatable
 source .venv/bin/activate
-uv pip install --no-build-isolation axolotl[deepspeed]
+```
+
+Install PyTorch
+- PyTorch 2.6.0 recommended
+```{.bash}
+uv pip install packaging setuptools wheel
+uv pip install torch==2.6.0
+uv pip install awscli pydantic
+```
+
+Install axolotl from PyPI:
+```{.bash}
+uv pip install --no-build-isolation axolotl[deepspeed,flash-attn]
+
+# optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO
+uv pip install --no-build-isolation axolotl[deepspeed,flash-attn,vllm]
 ```

 ### Edge/Development Build {#sec-edge-build}
@@ -48,16 +82,14 @@ For the latest features between releases:
 ```{.bash}
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
-export UV_TORCH_BACKEND=cu128  # or cu130
-uv venv
-source .venv/bin/activate
-uv pip install --no-build-isolation -e '.[deepspeed]'
+pip3 install -U packaging setuptools wheel ninja
+pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
 ```

 ### Docker {#sec-docker}

 ```{.bash}
-docker run --gpus '"all"' --rm -it --ipc=host axolotlai/axolotl-uv:main-latest
+docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
 ```

 For development with Docker:
@@ -74,12 +106,12 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
  --ulimit memlock=-1 --ulimit stack=67108864 \
  --mount type=bind,src="${PWD}",target=/workspace/axolotl \
  -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-  axolotlai/axolotl-uv:main-latest
+  axolotlai/axolotl:main-latest
 ```
 :::

 ::: {.callout-important}
-For Blackwell GPUs, please use `axolotlai/axolotl-uv:main-py3.11-cu128-2.9.1` or the cloud variant `axolotlai/axolotl-cloud-uv:main-py3.11-cu128-2.9.1`.
+For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.9.1` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.9.1`.
 :::

 Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.
@@ -90,7 +122,7 @@ Please refer to the [Docker documentation](docker.qmd) for more information on t

 For providers supporting Docker:

- Use `axolotlai/axolotl-cloud-uv:main-latest`
+- Use `axolotlai/axolotl-cloud:main-latest`
 - Available on:
    - [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
    - [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=axolotl&utm_medium=partner&utm_campaign=template_launch_july2025&utm_content=docs_link)
@@ -109,7 +141,7 @@ For providers supporting Docker:
 ### macOS {#sec-macos}

 ```{.bash}
-uv pip install --no-build-isolation -e '.'
+pip3 install --no-build-isolation -e '.'
 ```

 See @sec-troubleshooting for Mac-specific issues.
@@ -120,44 +152,21 @@ See @sec-troubleshooting for Mac-specific issues.
 We recommend using WSL2 (Windows Subsystem for Linux) or Docker.
 :::

-## Migrating from pip to uv {#sec-migrating}
+## Environment Managers {#sec-env-managers}

-If you have an existing pip-based Axolotl installation, you can migrate to uv:
+### Conda/Pip venv {#sec-conda}

-```{.bash}
-# Install uv
-curl -LsSf https://astral.sh/uv/install.sh | sh
-source $HOME/.local/bin/env
-
-# Create a fresh venv (recommended for a clean start)
-export UV_TORCH_BACKEND=cu128  # or cu130
-uv venv
-source .venv/bin/activate
-
-# Reinstall axolotl
-uv pip install --no-build-isolation axolotl[deepspeed]
-```
-
-## Using pip (Alternative) {#sec-pip}
-
-If you are unable to install uv, you can still use pip directly.
-
-::: {.callout-important}
-Please make sure to have PyTorch installed before installing Axolotl with pip.
-
-Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
-:::
-
-```{.bash}
-pip3 install -U packaging setuptools wheel ninja
-pip3 install --no-build-isolation axolotl[deepspeed]
-```
-
-For editable/development installs:
-```{.bash}
-pip3 install -U packaging setuptools wheel ninja
-pip3 install --no-build-isolation -e '.[deepspeed]'
-```
+1. Install Python ≥3.11
+2. Install PyTorch: https://pytorch.org/get-started/locally/
+3. Install Axolotl:
+   ```{.bash}
+   pip3 install -U packaging setuptools wheel ninja
+   pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
+   ```
+4. (Optional) Login to Hugging Face:
+   ```{.bash}
+   hf auth login
+   ```

 ## Troubleshooting {#sec-troubleshooting}

--- a/docs/multimodal_assistant_mask.md
+++ b/docs/multimodal_assistant_mask.md
@@ -1,84 +0,0 @@
-# Multimodal assistant-only loss masking
-
-## Correct placement
-
-```yaml
-# Top-level: only train_on_inputs lives here.
-train_on_inputs: false
-
-datasets:
-  - path: data/train.jsonl
-    type: chat_template
-    roles_to_train:          # per-dataset — this is what the MM scanner reads
-      - assistant
-    train_on_eos: turn       # per-dataset — same
-
-test_datasets:
-  - path: data/val.jsonl
-    type: chat_template
-    split: train
-    roles_to_train:
-      - assistant
-    train_on_eos: turn
-```
-
-## How to verify at runtime
-
-`build_collator` logs the resolved knobs at INFO:
-
-```text
-MM collator: train_on_inputs=False roles_to_train=['assistant'] train_on_eos=turn role_boundaries_override=none
-```
-
-If `roles_to_train` logs as `None`, the YAML knobs are not reaching the
-scanner — check that they are under `datasets[0]`, not at the root.
-
-Each verified strategy additionally logs its resolved boundary token ids at
-strategy init (e.g. `<|turn>model` → `[105, 4368]`, `<turn|>` → `[106]` for
-Gemma 4). If a strategy emits the "has no built-in role boundaries ... only
-pad and media tokens are masked" one-shot warning instead, it is on the
-fallback path — declare per-role markers in YAML via `cfg.role_boundaries`
-(below) to activate masking. The strategies currently on this path are
-listed in the audit table above under `fallback + warn`.
-
-## Config-based override: `cfg.role_boundaries`
-
-For the "unverified" strategies above, or for custom chat templates that
-don't match a built-in strategy's markers, users can declare role boundaries
-directly in YAML without subclassing:
-
-```yaml
-role_boundaries:
-  - role: assistant
-    start: "<|turn>model"
-    end: "<turn|>"
-  - role: user
-    start: "<|turn>user"
-    end: "<turn|>"
-  # Optional keys:
-  # include_start: false   # default False
-  # include_end: true      # default True, respects cfg.train_on_eos
-  # end: eos_token         # sentinel: resolves to tokenizer.eos_token_id
-  # end: null              # span runs to end of sequence
-```
-
-Semantics:
-
- `start` and `end` are literal strings; axolotl encodes them at strategy
-  init via `tokenizer.encode(..., add_special_tokens=False)` and logs the
-  resolved token-id sequences at INFO level.
- The special value `end: eos_token` is the portable way to express
-  "Pixtral-style assistant turns end at EOS" without hard-coding an id.
- `role_boundaries` is an **opt-in override**. A non-empty list **replaces**
-  the strategy's built-in declarations wholesale (partial overlays are
-  intentionally unsupported — they're hard to reason about at review time).
-  Leaving the field unset *or* setting it to an empty list (`[]`) both mean
-  "use the strategy's built-ins." Writing `role_boundaries: []` is almost
-  always a typo or leftover — honoring it literally would produce all-masked
-  labels and zero gradient, so it is treated the same as unset.
- `cfg.roles_to_train` still governs which declared roles contribute to
-  loss. You can declare `user` and `assistant` boundaries and set
-  `roles_to_train: ["assistant"]` to have the scanner correctly identify
-  user spans as masking boundaries without training on their content.
- Invalid specs fail loudly at strategy init (missing `role`/`start`,
-  unencodable markers), not silently at loss-compute time.
--- a/docs/optimizations.qmd
+++ b/docs/optimizations.qmd
@@ -22,12 +22,12 @@ Improves GPU utilization by combining multiple short sequences into a single pac

 Using an optimized attention implementation is critical for training speed.

- **[Flash Attention 2](https://github.com/Dao-AILab/flash-attention)**: `attn_implementation: flash_attention_2`. **(Recommended)** The industry standard for fast attention on modern GPUs. Requires Ampere or higher. For AMD, check [AMD Support](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#amd-rocm-support).
- **[Flex Attention](https://pytorch.org/blog/flexattention/)**: `attn_implementation: flex_attention`.
- **[SDP Attention](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)**: `attn_implementation: sdpa`. PyTorch's native implementation.
- **[Xformers](https://github.com/facebookresearch/xformers)**: `attn_implementation: xformers`. Works with FP16.
+- **[Flash Attention 2](https://github.com/Dao-AILab/flash-attention)**: `flash_attention: true`. **(Recommended)** The industry standard for fast attention on modern GPUs. Requires Ampere or higher. For AMD, check [AMD Support](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#amd-rocm-support).
+- **[Flex Attention](https://pytorch.org/blog/flexattention/)**: `flex_attention: true`.
+- **[SDP Attention](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)**: `sdp_attention: true`. PyTorch's native implementation.
+- **[Xformers](https://github.com/facebookresearch/xformers)**: `xformers_attention: true`. Works with FP16.

-See [Attention](attention.qmd) for the full list of backends and the canonical values.
+*Note: You should only enable one attention backend.*

 ### LoRA Optimizations

--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -320,10 +320,8 @@ The input format is a simple JSON input with customizable fields based on the ab
 As IPO is just DPO with a different loss function, all supported dataset formats for [DPO](#dpo) are also supported for IPO.

 ```yaml
-rl: dpo
-dpo_loss_type: ["ipo"]
+rl: ipo
 ```
-*Note:* Passing `rl: ipo` directly is still supported, but will soon be deprecated.

 ### ORPO

@@ -1147,7 +1145,8 @@ datasets:
    type: ebft_strided_structured.transform
    split: train[:1%]

-attn_implementation: flex_attention   # Strided mode uses flex_attention
+flash_attention: false
+flex_attention: true     # Strided mode uses flex_attention
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: true    # Required for flex_attention
--- a/docs/scripts/examples-allowlist.yml
+++ b/docs/scripts/examples-allowlist.yml
@@ -20,8 +20,6 @@ examples:
    title: Arcee AFM

  # MistralAI
-  - name: mistral-medium-3_5
-    title: Mistral Medium 3.5
  - name: ministral3/think
    title: Ministral 3 Thinking
  - name: ministral3/vision
--- a/docs/sequence_parallelism.qmd
+++ b/docs/sequence_parallelism.qmd
@@ -55,7 +55,7 @@ To use sequence parallelism, you need:

 ## Limitations

- Flash attention must be enabled for this to work (`attn_implementation: flash_attention_2` in config YAML)
+- Flash attention must be enabled for this to work (`flash_attention: true` in config YAML)
 - May have a small performance overhead due to communication between GPUs

 ## Example
--- a/docs/training_stability.qmd
+++ b/docs/training_stability.qmd
@@ -245,7 +245,7 @@ For GRPO, also reduce `max_completion_length`. Memory scales quadratically with
 Reduces attention memory from O(n^2) to O(n):

 ```yaml
-attn_implementation: flash_attention_2
+flash_attention: true
 ```

 ### Step 6: Offload with DeepSpeed
--- a/docs/unsloth.qmd
+++ b/docs/unsloth.qmd
@@ -0,0 +1,53 @@
+---
+title: "Unsloth"
+description: "Hyper-optimized QLoRA finetuning for single GPUs"
+---
+
+### Overview
+
+Unsloth provides hand-written optimized kernels for LLM finetuning that slightly improve speed and VRAM over
+standard industry baselines.
+
+::: {.callout-important}
+Due to breaking changes in transformers `v4.48.0`, users will need to downgrade to `<=v4.47.1` to use this patch.
+
+This will later be deprecated in favor of [LoRA Optimizations](lora_optims.qmd).
+:::
+
+
+### Installation
+
+The following will install the correct unsloth and extras from source.
+
+```bash
+python scripts/unsloth_install.py | sh
+```
+
+### Usage
+
+Axolotl exposes a few configuration options to try out unsloth and get most of the performance gains.
+
+Our unsloth integration is currently limited to the following model architectures:
+ - llama
+
+These options are specific to LoRA finetuning and cannot be used for multi-GPU finetuning:
+```yaml
+unsloth_lora_mlp: true
+unsloth_lora_qkv: true
+unsloth_lora_o: true
+```
+
+These options are composable and can be used with multi-GPU finetuning:
+```yaml
+unsloth_cross_entropy_loss: true
+unsloth_rms_norm: true
+unsloth_rope: true
+```
+
+### Limitations
+
+- Single GPU only; i.e. no multi-GPU support
+- No deepspeed or FSDP support (requires multi-gpu)
+- LoRA + QLoRA support only. No full fine tunes or fp8 support.
+- Limited model architecture support. Llama, Phi, Gemma, Mistral only
+- No MoE support.
--- a/examples/LiquidAI/README.md
+++ b/examples/LiquidAI/README.md
@@ -15,7 +15,8 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r
    Here is an example of how to install from pip:
    ```bash
    # Ensure you have a compatible version of Pytorch installed
-    uv pip install --no-build-isolation 'axolotl>=0.16.1'
+    pip3 install packaging setuptools wheel ninja
+    pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
    ```

 2.  Run one of the finetuning examples below.
@@ -34,7 +35,7 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r

    **LFM2-MoE**
    ```bash
-    uv pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
+    pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6

    # LoRA SFT (1x48GB @ 16.2GiB)
    axolotl train examples/LiquidAI/lfm2-8b-a1b-lora.yaml
@@ -44,7 +45,7 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r

 - **Installation Error**: If you encounter `ImportError: ... undefined symbol ...` or `ModuleNotFoundError: No module named 'causal_conv1d_cuda'`, the `causal-conv1d` package may have been installed incorrectly. Try uninstalling it:
  ```bash
-  uv pip uninstall causal-conv1d
+  pip uninstall -y causal-conv1d
  ```

 - **Dataset Loading**: Read more on how to load your own dataset in our [documentation](https://docs.axolotl.ai/docs/dataset_loading.html).
--- a/examples/LiquidAI/lfm2-350m-fft.yaml
+++ b/examples/LiquidAI/lfm2-350m-fft.yaml
@@ -39,7 +39,7 @@ tf32: true
 gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 2
--- a/examples/LiquidAI/lfm2-8b-a1b-lora.yaml
+++ b/examples/LiquidAI/lfm2-8b-a1b-lora.yaml
@@ -48,7 +48,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 2
--- a/examples/LiquidAI/lfm2-vl-lora.yaml
+++ b/examples/LiquidAI/lfm2-vl-lora.yaml
@@ -50,7 +50,8 @@ tf32: true

 gradient_checkpointing: true
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true
+eager_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/alst/llama3-8b-deepspeed-alst.yaml
+++ b/examples/alst/llama3-8b-deepspeed-alst.yaml
@@ -39,7 +39,7 @@ activation_offloading: legacy

 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_steps: 100
 saves_per_epoch: 1
--- a/examples/alst/llama3-8b-fsdp2-alst.yaml
+++ b/examples/alst/llama3-8b-fsdp2-alst.yaml
@@ -39,7 +39,7 @@ activation_offloading: legacy

 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_steps: 100
 saves_per_epoch: 1
--- a/examples/apertus/README.md
+++ b/examples/apertus/README.md
@@ -11,11 +11,12 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
    Here is an example of how to install from main for pip:

 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.9.1 min)
+# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-uv pip install --no-build-isolation -e '.'
+pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install --no-build-isolation -e '.[flash-attn]'

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
@@ -30,7 +31,7 @@ python scripts/cutcrossentropy_install.py | sh
 # For those using our Docker image, use the below path.
 export CUDA_HOME=/usr/local/cuda

-uv pip install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
+pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
 ```

 For any installation errors, see [XIELU Installation Issues](#xielu-installation-issues)
@@ -66,7 +67,7 @@ If those didn't help, please try the below solutions:
 1. Pass env for CMAKE and try install again:

    ```bash
-    Python_EXECUTABLE=$(which python) uv pip install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
+    Python_EXECUTABLE=$(which python) pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
    ```

 2. Git clone the repo and manually hardcode python path:
@@ -91,7 +92,7 @@ If those didn't help, please try the below solutions:
    ```

    ```bash
-    uv pip install . --no-build-isolation --no-deps
+    pip3 install . --no-build-isolation --no-deps
    ```

 ## Optimization Guides
--- a/examples/apertus/apertus-8b-qlora.yaml
+++ b/examples/apertus/apertus-8b-qlora.yaml
@@ -55,7 +55,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/arcee/README.md
+++ b/examples/arcee/README.md
@@ -13,11 +13,12 @@ Thanks to the team at Arcee.ai for using Axolotl in supervised fine-tuning the A
    Here is an example of how to install from main for pip:

 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.9.1 min)
+# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-uv pip install --no-build-isolation -e '.'
+pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install --no-build-isolation -e '.[flash-attn]'

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
--- a/examples/arcee/afm-4.5b-qlora.yaml
+++ b/examples/arcee/afm-4.5b-qlora.yaml
@@ -55,7 +55,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/archived/cerebras/btlm-ft.yml
+++ b/examples/archived/cerebras/btlm-ft.yml
@@ -59,7 +59,8 @@ gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1

-attn_implementation: flash_attention_2
+flash_attention: true
+sdp_attention:
 flash_optimum:

 gptq_groupsize:
--- a/examples/archived/cerebras/qlora.yml
+++ b/examples/archived/cerebras/qlora.yml
@@ -39,7 +39,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/code-llama/13b/lora.yml
+++ b/examples/archived/code-llama/13b/lora.yml
@@ -45,7 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/code-llama/13b/qlora.yml
+++ b/examples/archived/code-llama/13b/qlora.yml
@@ -46,7 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/code-llama/34b/lora.yml
+++ b/examples/archived/code-llama/34b/lora.yml
@@ -45,7 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/code-llama/34b/qlora.yml
+++ b/examples/archived/code-llama/34b/qlora.yml
@@ -46,7 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/code-llama/7b/lora.yml
+++ b/examples/archived/code-llama/7b/lora.yml
@@ -45,7 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/code-llama/7b/qlora.yml
+++ b/examples/archived/code-llama/7b/qlora.yml
@@ -46,7 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/dbrx/16bit-lora.yaml
+++ b/examples/archived/dbrx/16bit-lora.yaml
@@ -52,7 +52,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/archived/dbrx/8bit-lora.yaml
+++ b/examples/archived/dbrx/8bit-lora.yaml
@@ -55,7 +55,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/archived/dbrx/fft-ds-zero3.yaml
+++ b/examples/archived/dbrx/fft-ds-zero3.yaml
@@ -39,7 +39,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml
+++ b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml
@@ -45,7 +45,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/archived/falcon/config-7b-lora.yml
+++ b/examples/archived/falcon/config-7b-lora.yml
@@ -43,7 +43,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/falcon/config-7b-qlora.yml
+++ b/examples/archived/falcon/config-7b-qlora.yml
@@ -73,7 +73,8 @@ early_stopping_patience: 3
 resume_from_checkpoint:
 auto_resume_from_checkpoints: true
 logging_steps: 1
-attn_implementation: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/falcon/config-7b.yml
+++ b/examples/archived/falcon/config-7b.yml
@@ -40,7 +40,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/gemma/qlora.yml
+++ b/examples/archived/gemma/qlora.yml
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/gptj/qlora.yml
+++ b/examples/archived/gptj/qlora.yml
@@ -36,7 +36,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/jeopardy-bot/config.yml
+++ b/examples/archived/jeopardy-bot/config.yml
@@ -37,7 +37,8 @@ bf16: auto
 tf32: true
 resume_from_checkpoint:
 logging_steps: 5
-attn_implementation: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/mpt-7b/config.yml
+++ b/examples/archived/mpt-7b/config.yml
@@ -39,6 +39,7 @@ bf16: auto
 tf32: true
 resume_from_checkpoint:
 logging_steps: 5
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/openllama-3b/config.yml
+++ b/examples/archived/openllama-3b/config.yml
@@ -39,7 +39,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/openllama-3b/lora.yml
+++ b/examples/archived/openllama-3b/lora.yml
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/openllama-3b/qlora.yml
+++ b/examples/archived/openllama-3b/qlora.yml
@@ -40,7 +40,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/qwen/lora.yml
+++ b/examples/archived/qwen/lora.yml
@@ -47,6 +47,7 @@ tf32: false
 gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1
+flash_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/qwen/qlora.yml
+++ b/examples/archived/qwen/qlora.yml
@@ -47,6 +47,7 @@ tf32: false
 gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1
+flash_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/qwen/qwen2-moe-lora.yaml
+++ b/examples/archived/qwen/qwen2-moe-lora.yaml
@@ -43,7 +43,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/qwen/qwen2-moe-qlora.yaml
+++ b/examples/archived/qwen/qwen2-moe-qlora.yaml
@@ -46,7 +46,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/redpajama/config-3b.yml
+++ b/examples/archived/redpajama/config-3b.yml
@@ -40,6 +40,7 @@ bf16: auto
 tf32: true
 resume_from_checkpoint:
 logging_steps: 5
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/replit-3b/config-lora.yml
+++ b/examples/archived/replit-3b/config-lora.yml
@@ -38,6 +38,7 @@ tf32: true
 gradient_checkpointing:
 resume_from_checkpoint:
 logging_steps: 1
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/stablelm-2/1.6b/fft.yml
+++ b/examples/archived/stablelm-2/1.6b/fft.yml
@@ -44,7 +44,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 flash_attn_fuse_mlp: true
--- a/examples/archived/stablelm-2/1.6b/lora.yml
+++ b/examples/archived/stablelm-2/1.6b/lora.yml
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true

--- a/examples/archived/starcoder2/qlora.yml
+++ b/examples/archived/starcoder2/qlora.yml
@@ -46,7 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/tiny-llama/lora-mps.yml
+++ b/examples/archived/tiny-llama/lora-mps.yml
@@ -47,6 +47,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
+flash_attention: false

 warmup_ratio: 0.1
 evals_per_epoch: 0
--- a/examples/archived/tiny-llama/lora.yml
+++ b/examples/archived/tiny-llama/lora.yml
@@ -45,7 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/tiny-llama/pretrain.yml
+++ b/examples/archived/tiny-llama/pretrain.yml
@@ -36,7 +36,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/archived/tiny-llama/qlora.yml
+++ b/examples/archived/tiny-llama/qlora.yml
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml
+++ b/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml
@@ -71,7 +71,8 @@ early_stopping_patience: 3
 resume_from_checkpoint:
 auto_resume_from_checkpoints: true
 logging_steps: 1
-attn_implementation: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
--- a/examples/archived/yi-34B-chat/qlora.yml
+++ b/examples/archived/yi-34B-chat/qlora.yml
@@ -10,7 +10,7 @@ load_in_4bit: true
 sequence_len: 1024
 bf16: auto
 tf32: false
-attn_implementation: flash_attention_2
+flash_attention: true
 special_tokens:
  bos_token: "<|startoftext|>"
  eos_token: "<|endoftext|>"
--- a/examples/cohere/command-r-7b-qlora.yml
+++ b/examples/cohere/command-r-7b-qlora.yml
@@ -48,7 +48,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -36,7 +36,12 @@
    "id": "msOCO4NRmRLa"
   },
   "outputs": [],
-   "source": "%%capture\n# This step can take ~5-10 minutes to install dependencies\n!pip install --no-build-isolation \"axolotl>=0.16.1\"\n!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@fec1a88\""
+   "source": [
+    "%%capture\n",
+    "# This step can take ~5-10 minutes to install dependencies\n",
+    "!pip install --no-build-isolation \"axolotl[flash-attn]>=0.9.1\"\n",
+    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@fec1a88\""
+   ]
  },
  {
   "cell_type": "markdown",
--- a/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
+++ b/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
@@ -45,7 +45,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
+++ b/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
@@ -45,7 +45,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/deepseek-v2/fft-fsdp-16b.yaml
+++ b/examples/deepseek-v2/fft-fsdp-16b.yaml
@@ -35,7 +35,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 2
--- a/examples/deepseek-v2/qlora-fsdp-2_5.yaml
+++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
@@ -59,7 +59,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 2
--- a/examples/devstral/README.md
+++ b/examples/devstral/README.md
@@ -15,8 +15,9 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
    Here is an example of how to install from pip:

 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.9.1 min)
-uv pip install --no-build-isolation 'axolotl>=0.16.1'
+# Ensure you have PyTorch installed (PyTorch 2.6.0 min)
+pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
 ```

 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage
--- a/examples/devstral/devstral-small-qlora.yml
+++ b/examples/devstral/devstral-small-qlora.yml
@@ -26,6 +26,7 @@ lora_model_dir:
 sequence_len: 2048
 sample_packing: true

+
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0
@@ -50,8 +51,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attn_implementation: flash_attention_2
-# scaling_softmax: true  # needs flex_attention
+flash_attention: true
+scaling_softmax: true  # NOTE(review): previously marked as needing flex_attention; confirm it works with flash_attention

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml
+++ b/examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml
@@ -29,7 +29,7 @@ output_dir: ./outputs/ndp-out/

 sequence_len: 2048
 sample_packing: true
-attn_implementation: flash_attention_2
+flash_attention: true

 gradient_accumulation_steps: 1
 micro_batch_size: 1
--- a/examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml
+++ b/examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml
@@ -26,7 +26,7 @@ output_dir: ./outputs/ndp-out/

 sequence_len: 8192
 sample_packing: true
-attn_implementation: flash_attention_2
+flash_attention: true

 gradient_accumulation_steps: 1
 micro_batch_size: 1  # must be 1 when using context parallel
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Wing Lian	cec99c4133	fix test dims	2026-04-20 20:45:19 -04:00
Wing Lian	d248242490	support for vllm 0.19.1	2026-04-19 18:09:46 -04:00