Compare commits
32 Commits
vllm-0191 ... activeblue

| Author | SHA1 | Date |
|---|---|---|
| | c6da9b9e92 | |
| | c7c4885369 | |
| | 981a13e110 | |
| | 74f2263ac7 | |
| | 8693a1f61b | |
| | 71c6a56e7a | |
| | 38adf5cd37 | |
| | 3f29fa017b | |
| | c02a76f132 | |
| | b9ceebfe7e | |
| | e9a3fd483f | |
| | eadd15c960 | |
| | 396ce4a9dd | |
| | b7ec06b8a1 | |
| | e2f01de0e8 | |
| | 5352d41d32 | |
| | c15f6cffe2 | |
| | e4032fc90f | |
| | 6136ae627b | |
| | e662972a29 | |
| | ebbd7fa847 | |
| | ac77da96da | |
| | 798c8fba89 | |
| | 17fc747f99 | |
| | 901f2356bc | |
| | 1bf65c500e | |
| | bcbe049c21 | |
| | 90090fa9e8 | |
| | 7420fd4de6 | |
| | 05113bc91a | |
| | e562e149ce | |
| | 9de5b76336 | |
6 .github/CONTRIBUTING.md vendored

@@ -31,7 +31,11 @@ PRs are **greatly welcome**!
 
 Please run below to setup env
 ```bash
-pip3 install -r requirements-dev.txt -r requirements-tests.txt
+# Install axolotl + dev and test dependencies
+export UV_TORCH_BACKEND=cu128 # or cu130
+uv venv --no-project --relocatable
+source .venv/bin/activate
+uv pip install --no-build-isolation -e '.[deepspeed]' --group dev --group test
 pre-commit install
 
 # test
16 .github/workflows/base.yml vendored

@@ -30,14 +30,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
           - cuda: "128"
             cuda_version: 12.8.1
             cudnn_version: ""
@@ -168,14 +160,6 @@ jobs:
             torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
             dockerfile: "Dockerfile-uv-base"
             platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
           - cuda: "128"
             cuda_version: 12.8.1
             cudnn_version: ""
2 .github/workflows/lint.yml vendored

@@ -6,7 +6,7 @@ on:
     types: [opened, synchronize, reopened, ready_for_review]
     paths:
       - '**.py'
-      - 'requirements.txt'
+      - 'pyproject.toml'
       - '.github/workflows/*.yml'
       - "*.[q]md"
       - "examples/**/*.y[a]?ml"
12 .github/workflows/main.yml vendored

@@ -18,12 +18,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
@@ -180,12 +174,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
35 .github/workflows/multi-gpu-e2e.yml vendored

@@ -3,17 +3,15 @@ name: docker-multigpu-tests-biweekly
 on:
   pull_request:
     paths:
-      - 'tests/e2e/multigpu/**.py'
-      - 'requirements.txt'
-      - 'setup.py'
-      - 'pyproject.toml'
-      - '.github/workflows/multi-gpu-e2e.yml'
-      - 'scripts/cutcrossentropy_install.py'
-      - 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
-      - 'src/axolotl/utils/distributed.py'
+      - "tests/e2e/multigpu/**.py"
+      - "pyproject.toml"
+      - ".github/workflows/multi-gpu-e2e.yml"
+      - "scripts/cutcrossentropy_install.py"
+      - "src/axolotl/core/trainers/mixins/sequence_parallel.py"
+      - "src/axolotl/utils/distributed.py"
   workflow_dispatch:
   schedule:
-    - cron: '0 0 * * 1,4' # Runs at 00:00 UTC every monday & thursday
+    - cron: "0 0 * * 1,4" # Runs at 00:00 UTC every monday & thursday
 
 # Cancel jobs on the same ref if a new one is triggered
 concurrency:
@@ -33,19 +31,19 @@ jobs:
       fail-fast: false
       matrix:
         include:
          # - cuda: 129
          #   cuda_version: 12.9.1
          #   python_version: "3.12"
          #   pytorch: 2.9.1
          #   axolotl_extras: "fbgemm-gpu"
          #   num_gpus: 2
          #   dockerfile: "Dockerfile-uv.jinja"
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
            # axolotl_extras: fbgemm-gpu
            num_gpus: 2
          - cuda: 128
            cuda_version: 12.8.1
@@ -53,7 +51,6 @@ jobs:
             pytorch: 2.10.0
             axolotl_extras: "fbgemm-gpu"
             num_gpus: 2
-            dockerfile: "Dockerfile-uv.jinja"
     runs-on: [self-hosted, modal]
     timeout-minutes: 120
     steps:
@@ -75,7 +72,7 @@ jobs:
           echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
           echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
           echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
       - name: Run tests job on Modal
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
13 .github/workflows/pypi.yml vendored

@@ -8,6 +8,9 @@ on:
 
 permissions: {}
 
+env:
+  UV_SYSTEM_PYTHON: "1"
+
 jobs:
   setup_release:
     name: Create Release
@@ -41,11 +44,15 @@ jobs:
         with:
           python-version: "3.11"
 
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+
       - name: Install dependencies
         run: |
-          pip3 install wheel packaging==26.0
-          pip3 install --no-build-isolation -e .
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+          uv pip install wheel packaging
+          uv pip install --no-build-isolation -e .
+          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
+            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
 
       - name: Extract tag name
         id: tag
55 .github/workflows/tests-nightly.yml vendored

@@ -2,15 +2,18 @@ name: Tests Nightly against upstream main
 on:
   workflow_dispatch:
   schedule:
-    - cron: '0 0 * * *' # Runs at 00:00 UTC every day
+    - cron: "0 0 * * *" # Runs at 00:00 UTC every day
   pull_request:
     types: [opened, synchronize, reopened, ready_for_review]
     paths:
-      - '.github/workflows/tests-nightly.yml'
+      - ".github/workflows/tests-nightly.yml"
 
 permissions:
   contents: read
 
+env:
+  UV_SYSTEM_PYTHON: "1"
+
 jobs:
   pre-commit:
     name: pre-commit
@@ -20,7 +23,7 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: "3.11"
-          cache: 'pip' # caching pip dependencies
+          cache: "pip" # caching pip dependencies
       - uses: pre-commit/action@v3.0.1
         env:
           SKIP: no-commit-to-branch
@@ -43,7 +46,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
         python_version: ["3.12"] # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
         pytorch_version: ["2.9.1", "2.10.0"]
     timeout-minutes: 20
 
@@ -61,36 +64,34 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python_version }}
-          cache: 'pip' # caching pip dependencies
 
-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==26.0 setuptools==78.1.1 wheel
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
 
       - name: Install PyTorch
         run: |
-          pip3 install torch==${{ matrix.pytorch_version }} torchvision
+          uv pip install torch==${{ matrix.pytorch_version }} torchvision
+          uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
 
-      - name: Update requirements.txt
-        run: |
-          sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt
-          sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt
-          sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt
-          sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt
-          sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt
-
       - name: Install dependencies
         run: |
-          pip3 show torch
-          pip3 install --no-build-isolation -U -e .
-          python scripts/unsloth_install.py | sh
-          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+          uv pip install --no-build-isolation -e . --override /tmp/torch-pin.txt
+          python scripts/cutcrossentropy_install.py --uv | sh
+          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
+            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
+
+      - name: Override with nightly HF packages
+        run: |
+          uv pip install --no-deps \
+            "transformers @ git+https://github.com/huggingface/transformers.git@main" \
+            "peft @ git+https://github.com/huggingface/peft.git@main" \
+            "accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
+            "trl @ git+https://github.com/huggingface/trl.git@main" \
+            "datasets @ git+https://github.com/huggingface/datasets.git@main"
 
       - name: Make sure PyTorch version wasn't clobbered
         run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
 
       - name: Ensure axolotl CLI was installed
         run: |
@@ -102,9 +103,6 @@ jobs:
           pytest -v --durations=10 tests/patched/
           pytest -v --durations=10 tests/cli/
 
-      - name: cleanup pip cache
-        run: |
-          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
 
   docker-e2e-tests:
     if: github.repository_owner == 'axolotl-ai-cloud'
@@ -136,7 +134,6 @@ jobs:
           pytorch: 2.9.1
           num_gpus: 1
           axolotl_extras:
-          dockerfile: "Dockerfile-uv.jinja"
           nightly_build: "true"
     steps:
       - name: Checkout
@@ -157,7 +154,7 @@ jobs:
           echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
           echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
           echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
           echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
       - name: Run tests job on Modal
         env:
87 .github/workflows/tests.yml vendored

@@ -6,21 +6,19 @@ on:
     branches:
       - "main"
     paths:
-      - '**.py'
-      - 'requirements.txt'
-      - '.github/workflows/*.yml'
-      - 'requirements-tests.txt'
-      - 'cicd/cicd.sh'
-      - 'cicd/Dockerfile.jinja'
+      - "**.py"
+      - "pyproject.toml"
+      - ".github/workflows/*.yml"
+      - "cicd/cicd.sh"
+      - "cicd/Dockerfile-uv.jinja"
   pull_request:
     types: [opened, synchronize, reopened, ready_for_review]
     paths:
-      - '**.py'
-      - 'requirements.txt'
-      - '.github/workflows/*.yml'
-      - 'requirements-tests.txt'
-      - 'cicd/cicd.sh'
-      - 'cicd/Dockerfile.jinja'
+      - "**.py"
+      - "pyproject.toml"
+      - ".github/workflows/*.yml"
+      - "cicd/cicd.sh"
+      - "cicd/Dockerfile-uv.jinja"
   workflow_dispatch:
 
 # Cancel jobs on the same ref if a new one is triggered
@@ -33,6 +31,7 @@ permissions:
 
 env:
   TRANSFORMERS_IS_CI: "yes"
+  UV_SYSTEM_PYTHON: "1"
 
 jobs:
   pre-commit:
@@ -44,7 +43,7 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: "3.11"
-          cache: 'pip' # caching pip dependencies
+          cache: "pip" # caching pip dependencies
       - uses: pre-commit/action@v3.0.1
         env:
           SKIP: no-commit-to-branch
@@ -73,7 +72,7 @@ jobs:
         exclude:
           - python_version: "3.14"
             pytorch_version: "2.9.1"
-    timeout-minutes: 20
+    timeout-minutes: 25
 
     steps:
       - name: cleanup node
@@ -94,32 +93,25 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python_version }}
-          cache: 'pip' # caching pip dependencies
 
-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==26.0 setuptools==75.8.0 wheel
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
 
       - name: Install PyTorch
         run: |
-          pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision
+          uv pip install torch==${{ matrix.pytorch_version }} torchvision
+          uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
 
       - name: Install dependencies
         run: |
-          pip3 show torch
-          pip3 install --no-cache-dir --no-build-isolation -U -e .
-          python scripts/unsloth_install.py | sh
-          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
-
-      - name: cleanup pip cache
-        run: |
-          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
+          uv pip install --no-build-isolation -e . --override /tmp/torch-pin.txt
+          python scripts/cutcrossentropy_install.py --uv | sh
+          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
+            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
 
       - name: Make sure PyTorch version wasn't clobbered
         run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
 
       - name: Ensure axolotl CLI was installed
         run: |
@@ -188,33 +180,27 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python_version }}
-          cache: 'pip' # caching pip dependencies
 
-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==26.0 setuptools==75.8.0 setuptools_scm build wheel psutil
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
 
       - name: Install PyTorch
         run: |
-          pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision
+          uv pip install torch==${{ matrix.pytorch_version }} torchvision
+          uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
 
       - name: Install dependencies
         run: |
-          pip3 show torch
+          uv pip install packaging setuptools_scm build wheel psutil
           python -m build --no-isolation --sdist
-          pip3 install --no-cache-dir --no-build-isolation dist/axolotl*.tar.gz
-          python scripts/unsloth_install.py | sh
-          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
-
-      - name: cleanup pip cache
-        run: |
-          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
+          uv pip install --no-build-isolation dist/axolotl*.tar.gz --override /tmp/torch-pin.txt
+          python scripts/cutcrossentropy_install.py --uv | sh
+          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
+            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
 
      - name: Make sure PyTorch version wasn't clobbered
         run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
 
       - name: Ensure axolotl CLI was installed
         run: |
@@ -291,7 +277,6 @@ jobs:
           pytorch: 2.9.1
           num_gpus: 1
           axolotl_extras:
-          dockerfile: "Dockerfile-uv.jinja"
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -312,7 +297,7 @@ jobs:
           echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
           echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
           echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
       - name: Run tests job on Modal
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
@@ -374,7 +359,7 @@ jobs:
           echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
           echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
           echo "GPU_TYPE=${{ matrix.gpu_type || 'L40S'}}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
       - name: Run tests job on Modal
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
@@ -26,7 +26,7 @@ axolotl config-schema # Dump config JSON schema
 | Method | Config Key | When to Use |
 |--------|-----------|-------------|
 | SFT | *(default)* | Input-output pairs, instruction tuning |
-| DPO/IPO | `rl: dpo` / `rl: ipo` | Paired preference data (chosen vs rejected) |
+| DPO/IPO | `rl: dpo` / `rl: dpo, dpo_loss_type: ["ipo"]` | Paired preference data (chosen vs rejected) |
 | KTO | `rl: kto` | Unpaired binary preference labels |
 | ORPO | `rl: orpo` | Single-stage alignment, no ref model |
 | GRPO | `rl: grpo` | RL with verifiable reward functions (math, code) |
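For configs that previously used `rl: ipo`, a minimal sketch of the new form implied by the updated row. Only `rl` and `dpo_loss_type` come from the table; the model and dataset entries are placeholders, not part of this change:

```yaml
# Hedged sketch: IPO is now selected as a DPO loss type rather than its own rl mode.
base_model: meta-llama/Llama-3.1-8B   # placeholder, not from this diff
rl: dpo
dpo_loss_type: ["ipo"]
datasets:
  - path: your-org/preference-pairs   # placeholder: paired chosen/rejected data
```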
@@ -1,7 +1,6 @@
-include requirements.txt
 include README.md
 include LICENSE
-include src/setuptools_axolotl_dynamic_dependencies.py
+include VERSION
 include src/axolotl/utils/chat_templates/templates/*.jinja
 include AGENTS.md
 recursive-include docs/agents *.md
29 README.md

@@ -29,6 +29,9 @@
 
 ## 🎉 Latest Updates
 
+- 2026/04:
+  - New model support has been added in Axolotl for [Mistral Medium 3.5](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/mistral-medium-3_5) and [Gemma 4](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/gemma4).
+  - Axolotl is now [uv-first](https://github.com/axolotl-ai-cloud/axolotl/pull/3545) and has [SonicMoE fused LoRA](https://github.com/axolotl-ai-cloud/axolotl/pull/3519) support.
 - 2026/03:
   - New model support has been added in Axolotl for [Mistral Small 4](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/mistral4), [Qwen3.5, Qwen3.5 MoE](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen3.5), [GLM-4.7-Flash](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm47-flash), [GLM-4.6V](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm46v), and [GLM-4.5-Air](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm45).
   - [MoE expert quantization](https://docs.axolotl.ai/docs/expert_quantization.html) support (via `quantize_moe_experts: true`) greatly reduces VRAM when training MoE models (FSDP2 compat).
@@ -95,14 +98,11 @@ Features:
 
 ### Installation
 
-#### Using uv (recommended)
-
 ```bash
-# install uv if you don't already have it installed
+# install uv if you don't already have it installed (restart shell after)
 curl -LsSf https://astral.sh/uv/install.sh | sh
-source $HOME/.local/bin/env
 
-# CUDA 12.8.1 tends to have better package compatibility
+# change depending on system
 export UV_TORCH_BACKEND=cu128
 
 # create a new virtual environment
@@ -112,23 +112,6 @@ source .venv/bin/activate
 uv pip install torch==2.10.0 torchvision
 uv pip install --no-build-isolation axolotl[deepspeed]
 
-# recommended - install cut-cross-entropy
-uv pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@main"
-
-# (optional) - prefetch flash-attn2 and causal-conv1d kernels
-uv run --python 3.12 python -c "from kernels import get_kernel; get_kernel('kernels-community/flash-attn2'); get_kernel('kernels-community/causal-conv1d')"
-
-# Download example axolotl configs, deepspeed configs
-axolotl fetch examples
-axolotl fetch deepspeed_configs # OPTIONAL
-```
-
-#### Using pip
-
-```bash
-pip3 install -U packaging==26.0 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
-
 # Download example axolotl configs, deepspeed configs
 axolotl fetch examples
 axolotl fetch deepspeed_configs # OPTIONAL
@@ -138,7 +121,7 @@ axolotl fetch deepspeed_configs # OPTIONAL
 
 Installing with Docker can be less error prone than installing in your own environment.
 ```bash
-docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
+docker run --gpus '"all"' --ipc=host --rm -it axolotlai/axolotl:main-latest
 ```
 
 Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).
273 SETUP_MIAAI.md Normal file

@@ -0,0 +1,273 @@
# Axolotl Setup — miaai (RTX 5080, CUDA 13.2)

## System Info
- GPU: NVIDIA RTX 5080 (16GB VRAM, sm_120 / Blackwell)
- Driver: 580.126.09 — max CUDA 13.0 shown by nvidia-smi, but nvcc from conda is 13.2
- OS: Ubuntu 25.10 (Python 3.13 system — do NOT use system Python for ML)
- Axolotl repo: `/home/tocmo0nlord/axolotl` (branch: `activeblue/main`)
- Conda env: `axolotl` at `/opt/miniconda3/envs/axolotl`

---
## Starting from Bare Ubuntu 25.10

If rebuilding from scratch, complete these steps first before anything else.

### A. System packages
```bash
sudo apt update && sudo apt upgrade -y
sudo apt install -y \
  build-essential cmake git curl wget \
  python3-dev libssl-dev zlib1g-dev \
  ca-certificates gnupg lsb-release
```

### B. NVIDIA driver (580.xx)
Ubuntu 25.10 is too new for NVIDIA's apt repo. Install via ubuntu-drivers:
```bash
sudo ubuntu-drivers autoinstall
sudo reboot
```

After reboot, verify:
```bash
nvidia-smi
# Must show: NVIDIA GeForce RTX 5080, Driver Version: 580.x
```

If ubuntu-drivers installs the wrong version, force the right one:
```bash
sudo apt install -y nvidia-driver-580
sudo reboot
```

### C. Install Ollama
```bash
curl -fsSL https://ollama.com/install.sh | sh

# Verify it's running
systemctl status ollama
```

### D. HuggingFace CLI
```bash
pip3 install huggingface_hub
huggingface-cli login
# Paste your HF token — required for gated models like meta-llama
```

Once steps A–D are done, continue with the One-time Setup below.

---
## Pre-Training Checklist (every session)

```bash
# 1. Stop Ollama — if it receives a request mid-training it will compete for VRAM
sudo systemctl stop ollama

# 2. Activate conda env
export PATH="/opt/miniconda3/bin:$PATH"
conda activate axolotl

# 3. Set env vars
export CUDA_HOME=$CONDA_PREFIX
export PATH=$CUDA_HOME/bin:$PATH
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# 4. Confirm GPU is clear (should show no processes before training)
nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv

# 5. Go to axolotl directory
cd /home/tocmo0nlord/axolotl
```

## Run Training
```bash
axolotl train ~/human_chat_qlora.yml
```

## After Training
```bash
# Restart Ollama
sudo systemctl start ollama

# Test the adapter interactively
axolotl inference ~/human_chat_qlora.yml \
  --lora-model-dir ~/outputs/llama31-8b-humanchat \
  --prompter chat

# (Optional) Merge adapter into base model for standalone deployment
axolotl merge-lora ~/human_chat_qlora.yml
```

---
## One-time Setup (fresh machine — after bare Ubuntu steps above)

### 1. Install Miniconda
```bash
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
bash miniconda.sh -b -p /opt/miniconda3
/opt/miniconda3/bin/conda init bash
source ~/.bashrc
```

### 2. Create Python 3.11 environment
```bash
conda create -n axolotl python=3.11 -y
conda activate axolotl
```

### 3. Clone axolotl repo
```bash
git clone https://git.activeblue.net/tocmo0nlord/axolotl.git /home/tocmo0nlord/axolotl
cd /home/tocmo0nlord/axolotl
git remote add upstream https://github.com/axolotl-ai-cloud/axolotl.git
git fetch upstream
git rebase upstream/main  # keeps activeblue patches on top
```

### 4. Install CUDA toolkit (needed to compile flash-attn and bitsandbytes)
```bash
conda install -y -c "nvidia/label/cuda-12.8.0" cuda-toolkit
export CUDA_HOME=$CONDA_PREFIX
export PATH=$CUDA_HOME/bin:$PATH
```

> NOTE: Despite installing from the cuda-12.8.0 channel, conda resolves nvcc to **13.2.78**.
> This is fine — use cu132 everywhere to match.

### 5. Install PyTorch — use cu132 (matches nvcc from conda)
```bash
# torchaudio has no cu132 wheel — skip it, not needed for LLM training
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu132
python -c "import torch; print('CUDA:', torch.version.cuda); print('GPU:', torch.cuda.get_device_name(0))"
```

### 6. Install Axolotl
```bash
cd /home/tocmo0nlord/axolotl
pip install -e "."
```

### 7. Install flash-attn
> Compiles CUDA kernels from source — takes 15–25 min on 10 cores of an i7-14700K.
```bash
MAX_JOBS=10 pip install flash-attn --no-build-isolation
```

### 8. Compile bitsandbytes from source for sm_120 (RTX 5080 / Blackwell)

Prebuilt wheels do not include sm_120. CUDA 13.2 also dropped sm_50–53.
Must compile from source with a patched CMakeLists.txt.

```bash
# Clone bitsandbytes v0.49.1
git clone --branch v0.49.1 --depth 1 \
  https://github.com/bitsandbytes-foundation/bitsandbytes.git /tmp/bnb_0491

# Patch CMakeLists.txt: insert sm_120 override before the foreach loop
# (cmake >= 3.23.0 uses its own built-in arch list which does not include sm_120)
sed -i '/ foreach(capability \${CMAKE_CUDA_ARCHITECTURES_ALL})/i\ # RTX 5080 sm_120 patch\n set(CMAKE_CUDA_ARCHITECTURES_ALL 120)' /tmp/bnb_0491/CMakeLists.txt

# Verify patch landed correctly — set() line must appear immediately before foreach
grep -n "ARCHITECTURES_ALL\|foreach" /tmp/bnb_0491/CMakeLists.txt | tail -5

# Configure — must point cmake at conda's nvcc explicitly
cmake \
  -DCMAKE_CUDA_COMPILER=/opt/miniconda3/envs/axolotl/bin/nvcc \
  -DCOMPUTE_BACKEND=cuda \
  -S /tmp/bnb_0491 \
  -B /tmp/bnb_0491/build 2>&1 | grep -E "(Capabilit|CUDA Ver|Error)"
# Must show: CUDA Capabilities Selected: 120

# Build (adjust -j to your CPU core count)
cmake --build /tmp/bnb_0491/build -j10

# Install into conda site-packages
cp -r /tmp/bnb_0491/bitsandbytes \
  /opt/miniconda3/envs/axolotl/lib/python3.11/site-packages/

# Verify CUDA works
python3 -c "
import torch, bitsandbytes as bnb
x = torch.randn(64, 64, device='cuda')
l = bnb.nn.Linear8bitLt(64, 64).cuda()
print('bitsandbytes CUDA OK:', l(x).shape)
"
```

### 9. Copy training config to home
```bash
cp /home/tocmo0nlord/axolotl/human_chat_qlora.yml ~/human_chat_qlora.yml
```

### 10. Verify the full stack
```bash
python3 -c "
import torch, bitsandbytes as bnb, flash_attn, transformers
print('torch :', torch.__version__, '| CUDA:', torch.version.cuda)
print('bitsandbytes:', bnb.__version__)
print('flash_attn :', flash_attn.__version__)
print('transformers:', transformers.__version__)
print('GPU :', torch.cuda.get_device_name(0))
print('VRAM :', round(torch.cuda.get_device_properties(0).total_memory/1e9, 1), 'GB')
"
```

Expected output:
```
torch : 2.x.x | CUDA: 13.2
bitsandbytes: 0.50.0.dev0
flash_attn : 2.x.x
transformers: 5.x.x
GPU : NVIDIA GeForce RTX 5080
VRAM : 16.3 GB
```

---
## Training Config — human_chat_qlora.yml

Key settings tuned for RTX 5080 (16GB):

| Setting | Value | Notes |
|---|---|---|
| `sequence_len` | `2048` | 4096 OOMs during loss computation (logits x 128k vocab) |
| `micro_batch_size` | `1` | Effective batch = micro x grad_accum = 8 |
| `gradient_accumulation_steps` | `8` | Keeps effective batch size at 8 |
| `adapter` | `qlora` | 4-bit via bitsandbytes compiled from source |
| `attn_implementation` | `flash_attention_2` | Not the deprecated `flash_attention: true` |
| `type` (datasets) | `chat_template` | Not the deprecated `sharegpt` |

Expected training metrics (RTX 5080, ~65k samples, 2 epochs):
- VRAM: ~10–11 GB active, ~11 GB allocated
- Training duration: ~3.5 hours
- Initial eval loss: ~0.81, perplexity ~2.25
- Final loss target: ~0.55–0.60

To push VRAM to ~14GB and improve training: set `micro_batch_size: 2` and `gradient_accumulation_steps: 4`.
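A minimal sketch of how those settings might sit together in `human_chat_qlora.yml`, assuming the rest of the file follows Axolotl's usual QLoRA layout. The base model, dataset path, and any field not marked "from table" are illustrative assumptions, not copied from the actual config:

```yaml
# Hedged sketch only; values marked "from table" are documented above.
base_model: meta-llama/Llama-3.1-8B          # assumption (gated meta-llama model mentioned earlier)
load_in_4bit: true                           # QLoRA trains against 4-bit base weights
adapter: qlora                               # from table
sequence_len: 2048                           # from table: 4096 OOMs in the loss computation
micro_batch_size: 1                          # from table
gradient_accumulation_steps: 8               # from table: effective batch size 8
num_epochs: 2                                # matches the "2 epochs" metric above
attn_implementation: flash_attention_2       # from table, not the deprecated flash_attention: true
datasets:
  - path: your-org/human-chat-dataset        # placeholder (~65k chat samples)
    type: chat_template                      # from table, not the deprecated sharegpt
output_dir: ~/outputs/llama31-8b-humanchat   # matches the inference command in "After Training"
```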

---

## Common Pitfalls

| Problem | Cause | Fix |
|---|---|---|
| `externally-managed-environment` | System Python 3.13 blocks pip | Use conda env, never system pip |
| `No module named torch` (flash-attn) | pip builds in isolated env | Use `--no-build-isolation` |
| `CUDA_HOME not set` | CUDA toolkit not installed | `conda install cuda-toolkit` from nvidia channel |
| `CUDA version mismatch 13.2 vs 12.8` | Conda nvcc is 13.2, torch was cu128 | Reinstall torch with `--index-url .../cu132` |
| `torchaudio` not found for cu132 | No cu132 wheel exists | Skip torchaudio — not needed |
| flash-attn compile is slow | Single-threaded by default | Set `MAX_JOBS=<cpu_count>` before pip install |
| `nvcc fatal: Unsupported gpu architecture 'compute_50'` | bitsandbytes CMakeLists.txt hardcodes sm_50; CUDA 13.2 dropped it | Patch CMakeLists.txt (see step 8 above) |
| `CUDA Capabilities Selected: 50;52;...` ignores -D flag | cmake >= 3.23 built-in arch list lacks sm_120; CMakeLists.txt overrides -D | Insert `set(CMAKE_CUDA_ARCHITECTURES_ALL 120)` before foreach loop |
| `BackendUnavailable: scikit_build_core` | pip install of bnb triggers cmake rebuild | Copy .so directly to site-packages instead |
| `torch.OutOfMemoryError` during eval | logits tensor (batch x 4096 x 128k vocab) too large | Set `sequence_len: 2048`, `micro_batch_size: 1` |
| `type: sharegpt` deprecation warning | axolotl removed sharegpt type | Use `type: chat_template` with field mappings |
| `flash_attention: true` deprecation | Old config key removed | Use `attn_implementation: flash_attention_2` |
| Capybara dataset `field_messages null` | Capybara uses input/output format, not conversations | Switch to SlimOrca or OpenHermes-2.5 |
| Ollama loads model mid-training | Ollama is enabled and receives a request | `sudo systemctl stop ollama` before training |
| Training much slower than eval speed | The fast it/s shown on screen is the eval loop (forward pass only) | Normal — training includes the backward pass and optimizer (~3.5h total) |
| ubuntu-drivers installs wrong NVIDIA version | Multiple driver candidates available | Force with `apt install nvidia-driver-580` |
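Two of the rows above are dataset-config migrations, so here is a hedged sketch of a `chat_template` dataset stanza. The key names are recalled from Axolotl's chat_template loader and should be checked against `axolotl config-schema`; the dataset path and role names are illustrative:

```yaml
datasets:
  - path: Open-Orca/SlimOrca          # per the Capybara workaround above
    type: chat_template               # replaces the removed `type: sharegpt`
    field_messages: conversations     # column that holds the message list
    message_property_mappings:        # map ShareGPT-style keys onto role/content (verify key name)
      role: from
      content: value
```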
@@ -134,7 +134,6 @@ quartodoc:
       - monkeypatch.stablelm_attn_hijack_flash
       - monkeypatch.trainer_fsdp_optim
       - monkeypatch.transformers_fa_utils
-      - monkeypatch.unsloth_
       - monkeypatch.data.batch_dataset_fetcher
       - monkeypatch.mixtral
       - monkeypatch.gradient_checkpointing.offload_cpu
@@ -312,6 +311,7 @@ website:
             - docs/dataset_loading.qmd
             - docs/qat.qmd
             - docs/quantize.qmd
+            - docs/1_58bit_finetuning.qmd
             - docs/optimizations.qmd
 
         - section: "Core Concepts"
@@ -327,7 +327,6 @@ website:
         - section: "Advanced Features"
           contents:
             - docs/fsdp_qlora.qmd
-            - docs/unsloth.qmd
             - docs/torchao.qmd
             - docs/custom_integrations.qmd
             - docs/sequence_parallelism.qmd
@@ -22,15 +22,6 @@ WORKDIR /workspace/axolotl
 RUN git fetch origin +$GITHUB_REF && \
     git checkout FETCH_HEAD
 
-# If AXOLOTL_EXTRAS is set, append it in brackets
-RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
-        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
-        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
-        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
-        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
-        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
-    fi
-
 RUN uv pip install packaging==26.0 setuptools==78.1.1
 RUN uv pip install torchvision
 RUN uv pip uninstall causal_conv1d
@@ -40,11 +31,21 @@ RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
         uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
     fi
 
-RUN python scripts/unsloth_install.py --uv | sh
+# Override with nightly HF packages for nightly builds
+RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
+        uv pip install --no-deps \
+            "transformers @ git+https://github.com/huggingface/transformers.git@main" \
+            "peft @ git+https://github.com/huggingface/peft.git@main" \
+            "accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
+            "trl @ git+https://github.com/huggingface/trl.git@main" \
+            "datasets @ git+https://github.com/huggingface/datasets.git@main"; \
+    fi
+
 RUN python scripts/cutcrossentropy_install.py --uv | sh
 
 # So we can test the Docker image
-RUN uv pip install -r requirements-dev.txt -r requirements-tests.txt
+RUN uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
+    codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
 
 # fix so that git fetch/pull from remote works
 RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
@@ -1,54 +0,0 @@
-FROM axolotlai/axolotl-base:{{ BASE_TAG }}
-
-ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
-ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
-ENV CUDA="{{ CUDA }}"
-ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
-ENV GITHUB_REF="{{ GITHUB_REF }}"
-ENV GITHUB_SHA="{{ GITHUB_SHA }}"
-ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
-ENV HF_HOME="{{ HF_HOME }}"
-ENV AXOLOTL_DATASET_NUM_PROC="8"
-
-RUN apt-get update && \
-    apt-get install -y --allow-change-held-packages vim curl nano zstd libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm
-
-WORKDIR /workspace
-
-RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
-
-WORKDIR /workspace/axolotl
-
-RUN git fetch origin +$GITHUB_REF && \
-    git checkout FETCH_HEAD
-
-# If AXOLOTL_EXTRAS is set, append it in brackets
-RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
-        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
-        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
-        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
-        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
-        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
-    fi
-
-RUN pip install packaging==26.0 setuptools==78.1.1 psutil
-RUN pip uninstall -y causal_conv1d
-RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
-    else \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
-    fi
-
-RUN python scripts/unsloth_install.py | sh
-RUN python scripts/cutcrossentropy_install.py | sh
-
-# So we can test the Docker image
-RUN pip install -r requirements-dev.txt -r requirements-tests.txt
-
-# fix so that git fetch/pull from remote works
-RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
-    git config --get remote.origin.fetch
-
-# helper for huggingface-login cli
-RUN git config --global credential.helper store
@@ -1,7 +1,7 @@
 #!/bin/bash
 set -e
 
-python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
+python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__, f'Expected torch $PYTORCH_VERSION but got {torch.__version__}'"
 
 set -o pipefail
 for i in 1 2 3; do
@@ -17,7 +17,7 @@ template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
 template_env = jinja2.Environment(
     loader=template_loader, autoescape=select_autoescape()
 )
-dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
+dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile-uv.jinja")
 df_template = template_env.get_template(dockerfile)
 
 df_args = {
@@ -16,7 +16,7 @@ template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
 template_env = jinja2.Environment(
     loader=template_loader, autoescape=select_autoescape()
 )
-dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
+dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile-uv.jinja")
 df_template = template_env.get_template(dockerfile)
 
 df_args = {
@@ -24,15 +24,15 @@ WORKDIR /workspace/axolotl
 # If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
 RUN pip uninstall -y causal_conv1d
 RUN if [ "$TARGETARCH" = "arm64" ]; then \
-        BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \
+        BASE_EXTRAS="optimizers,ray"; \
     else \
-        BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \
+        BASE_EXTRAS="deepspeed,optimizers,ray"; \
     fi && \
     if [ "$AXOLOTL_EXTRAS" != "" ]; then \
         pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
     else \
         pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
-    fi && \ python scripts/unsloth_install.py | sh && \
+    fi && \
     python scripts/cutcrossentropy_install.py | sh && \
     pip install pytest && \
     pip cache purge
@@ -58,19 +58,3 @@ RUN git lfs install --skip-repo && \
     # The base image ships with `pydantic==1.8.2` which is not working
     pip3 install -U --no-cache-dir pydantic==1.10.10 && \
     pip3 cache purge
-
-# Map Python version (e.g., 3.12 -> cp312)
-RUN PYTHON_CP="cp$(echo $PYTHON_VERSION | tr -d '.')" && \
-    # Map PyTorch version (e.g., 2.9.1 -> torch2.9, 2.10.0 -> torch2.10)
-    TORCH_TAG="torch$(echo $PYTORCH_VERSION | grep -oP '^\d+\.\d+')" && \
-    # Map architecture
-    case "$TARGETARCH" in \
-        amd64) ARCH_TAG="x86_64" ;; \
-        arm64) ARCH_TAG="aarch64" ;; \
-        *) echo "Unsupported architecture: $TARGETARCH"; exit 1 ;; \
-    esac && \
-    WHL_VERSION="v0.7.16" && \
-    WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-linux_${ARCH_TAG}.whl" && \
-    wget -nv "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}" && \
-    pip3 install --no-cache-dir "${WHL_FILE}" && \
-    rm "${WHL_FILE}"
@@ -1,16 +1,15 @@
-ARG CUDA_VERSION="12.8.1"
-ARG CUDNN_VERSION="8"
+ARG CUDA_VERSION="12.8.2"
 ARG UBUNTU_VERSION="22.04"
 ARG MAX_JOBS=4
 
-FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
+FROM nvidia/cuda:12.8.2-devel-ubuntu22.04 AS base-builder
 
-ENV PATH="/root/miniconda3/bin:${PATH}"
+ENV PATH="/root/miniforge3/bin:${PATH}"
 
 ARG PYTHON_VERSION="3.11"
 ARG PYTORCH_VERSION="next"
 ARG CUDA="128"
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0 12.0+PTX"
 
 ENV PYTHON_VERSION=$PYTHON_VERSION
 ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
@@ -18,13 +17,13 @@ ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
 RUN apt-get update \
     && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
     && wget \
-        https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
+        https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh \
     && mkdir /root/.conda \
-    && bash Miniconda3-latest-Linux-x86_64.sh -b \
-    && rm -f Miniconda3-latest-Linux-x86_64.sh \
-    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
+    && bash Miniforge3-Linux-x86_64.sh -b \
+    && rm -f Miniforge3-Linux-x86_64.sh \
+    && /root/miniforge3/bin/conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
 
-ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
+ENV PATH="/root/miniforge3/envs/py${PYTHON_VERSION}/bin:${PATH}"
 
 WORKDIR /workspace
 
@@ -24,9 +24,9 @@ RUN git fetch origin +$GITHUB_REF && \
|
|||||||
|
|
||||||
# If AXOLOTL_EXTRAS is set, append it in brackets
|
# If AXOLOTL_EXTRAS is set, append it in brackets
|
||||||
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||||
pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
pip install --no-build-isolation -e .[deepspeed,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||||
else \
|
else \
|
||||||
pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
|
pip install --no-build-isolation -e .[deepspeed,mamba-ssm] $AXOLOTL_ARGS; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# So we can test the Docker image
|
# So we can test the Docker image
|
||||||
|
|||||||
@@ -24,16 +24,15 @@ WORKDIR /workspace/axolotl
|
|||||||
# If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
|
# If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
|
||||||
RUN uv pip uninstall causal_conv1d
|
RUN uv pip uninstall causal_conv1d
|
||||||
RUN if [ "$TARGETARCH" = "arm64" ]; then \
|
RUN if [ "$TARGETARCH" = "arm64" ]; then \
|
||||||
BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \
|
BASE_EXTRAS="optimizers,ray"; \
|
||||||
else \
|
else \
|
||||||
BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \
|
BASE_EXTRAS="deepspeed,optimizers,ray"; \
|
||||||
fi && \
|
fi && \
|
||||||
if [ "$AXOLOTL_EXTRAS" != "" ]; then \
|
if [ "$AXOLOTL_EXTRAS" != "" ]; then \
|
||||||
uv pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
uv pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||||
else \
|
else \
|
||||||
uv pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
|
uv pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
|
||||||
fi && \
|
fi && \
|
||||||
python scripts/unsloth_install.py --uv | sh && \
|
|
||||||
python scripts/cutcrossentropy_install.py --uv | sh && \
|
python scripts/cutcrossentropy_install.py --uv | sh && \
|
||||||
uv pip install pytest && \
|
uv pip install pytest && \
|
||||||
uv cache clean
|
uv cache clean
|
||||||
|
|||||||
@@ -38,20 +38,3 @@ RUN uv pip install packaging setuptools wheel psutil \
|
|||||||
RUN if [ "$TARGETARCH" = "amd64" ]; then \
|
RUN if [ "$TARGETARCH" = "amd64" ]; then \
|
||||||
MAMBA_SKIP_CUDA_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE uv pip install --no-build-isolation mamba_ssm causal_conv1d; \
|
MAMBA_SKIP_CUDA_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE uv pip install --no-build-isolation mamba_ssm causal_conv1d; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Map Python version (e.g., 3.12 -> cp312)
|
|
||||||
RUN PYTHON_CP="cp$(echo $PYTHON_VERSION | tr -d '.')" && \
|
|
||||||
# Map PyTorch version (e.g., 2.9.1 -> torch2.9, 2.10.0 -> torch2.10)
|
|
||||||
TORCH_TAG="torch$(echo $PYTORCH_VERSION | grep -oP '^\d+\.\d+')" && \
|
|
||||||
LINUX_TAG="manylinux_" && \
|
|
||||||
# Map architecture
|
|
||||||
case "$TARGETARCH" in \
|
|
||||||
amd64) ARCH_TAG="2_24_x86_64.manylinux_2_28_x86_64" ;; \
|
|
||||||
arm64) ARCH_TAG="2_34_aarch64" ;; \
|
|
||||||
*) echo "Unsupported architecture: $TARGETARCH"; exit 1 ;; \
|
|
||||||
esac && \
|
|
||||||
WHL_VERSION="v0.7.16" && \
|
|
||||||
WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-${LINUX_TAG}${ARCH_TAG}.whl" && \
|
|
||||||
wget -nv "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}" && \
|
|
||||||
uv pip install --no-cache-dir "${WHL_FILE}" && \
|
|
||||||
rm "${WHL_FILE}"
|
|
||||||
|
|||||||
70
docs/1_58bit_finetuning.qmd
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
---
|
||||||
|
title: "1.58-bit Finetuning"
|
||||||
|
back-to-top-navigation: true
|
||||||
|
toc: true
|
||||||
|
toc-expand: 2
|
||||||
|
toc-depth: 4
|
||||||
|
---
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
1.58-bit finetuning allows you to finetune BitNet models when their prequantized weights are provided. In theory, any LLM could be fine-tuned in the 1.58-bit format, but the performance degradation would be dramatic.
|
||||||
|
|
||||||
|
Axolotl supports 1.58-bit finetuning via the [`onebitllms`](https://github.com/tiiuae/onebitllms) library, which replaces standard linear layers with BitNet-compatible counterparts ready to use for training.
|
||||||
|
|
||||||
|
::: {.callout-note}
|
||||||
|
LoRA is not supported for BitNet models
|
||||||
|
:::
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
Install the `onebitllms` package before using this feature:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uv pip install onebitllms
|
||||||
|
```
|
||||||
|
|
||||||
|
Or from source:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uv pip install git+https://github.com/tiiuae/onebitllms
|
||||||
|
```
|
||||||
|
|
||||||
|
## Supported models
|
||||||
|
|
||||||
|
For now, only the `Falcon-E` series of models is supported. Make sure to use their `-prequantized` versions:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
tiiuae/Falcon-E-3B-Base-prequantized
|
||||||
|
tiiuae/Falcon-E-1B-Base-prequantized
|
||||||
|
```
|
||||||
|
|
||||||
|
In theory, any other model would 'work', but the performance degradation would be severe. This remains an area of exploration.
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
To enable 1.58-bit finetuning, set the following in your configuration file:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
base_model: tiiuae/Falcon-E-3B-Base-prequantized # A BitNet-compatible model
|
||||||
|
|
||||||
|
use_onebitllms: true
|
||||||
|
```
|
||||||
|
|
||||||
|
::: {.callout-note}
|
||||||
|
For BitNet models, it is recommended to use a higher learning rate than for standard models (typically around 10x higher).
|
||||||
|
:::
|
||||||
|
|
||||||
|
## Considerations after training
|
||||||
|
|
||||||
|
Once your model has been trained with 1.58-bit fine-tuning, you can convert it to ternary format using the `onebitllms` CLI:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
onebitllms quantize_to_1bit INPUT_PATH OUTPUT_PATH
|
||||||
|
```
|
||||||
|
|
||||||
|
After that, you can run the trained model with supported packages such as `llama.cpp` or the Apple MLX package.
|
||||||
|
|
||||||
|
## Example Configuration
|
||||||
|
|
||||||
|
You can find example configurations in `examples/falcon-e`, which contains one configuration for SFT and one for DPO; a minimal sketch follows below.
|
||||||
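A minimal SFT sketch for a prequantized Falcon-E model is shown below. The dataset, learning rate, and batch sizes are illustrative assumptions, not the contents of the maintained `examples/falcon-e` configs, which remain the reference.

```yaml
# Hypothetical minimal 1.58-bit SFT sketch; values are placeholders.
# See examples/falcon-e for the maintained configurations.
base_model: tiiuae/Falcon-E-1B-Base-prequantized
use_onebitllms: true

datasets:
  - path: tatsu-lab/alpaca   # placeholder dataset
    type: alpaca

sequence_len: 2048
micro_batch_size: 2
gradient_accumulation_steps: 4
num_epochs: 1
learning_rate: 1.0e-4   # roughly 10x a typical full-precision LR, per the note above
bf16: auto
gradient_checkpointing: true
output_dir: ./outputs/falcon-e-1bit-sft
```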
@@ -121,11 +121,11 @@ Older models that use `_prepare_4d_causal_attention_mask` (Llama, Mistral, Qwen2
|
|||||||
|
|
||||||
| Backend | Config | head_dim limit | torch_compile | Notes |
|
| Backend | Config | head_dim limit | torch_compile | Notes |
|
||||||
|---------|--------|---------------|---------------|-------|
|
|---------|--------|---------------|---------------|-------|
|
||||||
| FA2 | `flash_attention: true` | 256 | ✅ | Fastest when supported |
|
| FA2 | `attn_implementation: flash_attention_2` | 256 | ✅ | Fastest when supported |
|
||||||
| FA4 | auto with `flash_attention: true` | 256 (SM90+) | ✅ | Auto-detected on H100+ |
|
| FA4 | auto with `attn_implementation: flash_attention_2` | 256 (SM90+) | ✅ | Auto-detected on H100+ |
|
||||||
| SDPA | `sdp_attention: true` | None | ✅ | Universal fallback |
|
| SDPA | `attn_implementation: sdpa` | None | ✅ | Universal fallback |
|
||||||
| flex | `flex_attention: true` | None | ⚠️ Triton OOM for large head_dim | Good for variable head dims |
|
| flex | `attn_implementation: flex_attention` | None | ⚠️ Triton OOM for large head_dim | Good for variable head dims |
|
||||||
| eager | neither set | None | ✅ | Slowest, always works |
|
| eager | `attn_implementation: eager` | None | ✅ | Slowest, always works |
|
||||||
|
|
||||||
**Check model support**: Look at `_supports_flash_attn_2`, `_supports_flex_attn`, `_supports_sdpa` attributes on the model class.
|
**Check model support**: Look at `_supports_flash_attn_2`, `_supports_flex_attn`, `_supports_sdpa` attributes on the model class.
|
||||||
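For instance, a model whose head_dim exceeds 256 rules out FA2/FA4 per the table above; a hedged sketch of the fallback choice:

```yaml
# Sketch: with head_dim > 256, FA2/FA4 are unavailable per the table above.
# SDPA is the universal fallback; flex_attention also works if Triton memory allows.
attn_implementation: sdpa
torch_compile: true   # optional; supported per the table
```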
|
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ No vLLM server needed (unlike GRPO). Offline RL with pre-collected preference da
|
|||||||
|
|
||||||
1. Paired preference data (chosen + rejected)?
|
1. Paired preference data (chosen + rejected)?
|
||||||
- Default → `rl: dpo`
|
- Default → `rl: dpo`
|
||||||
- Overfitting → `rl: ipo`
|
- Overfitting → `rl: dpo, dpo_loss_type: ["ipo"]`
|
||||||
- VRAM-limited → `rl: orpo` (no ref model)
|
- VRAM-limited → `rl: orpo` (no ref model)
|
||||||
- Length-sensitive → `rl: simpo` (no ref model)
|
- Length-sensitive → `rl: simpo` (no ref model)
|
||||||
2. Only binary labels (good/bad)? → `rl: kto`
|
2. Only binary labels (good/bad)? → `rl: kto`
|
||||||
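As a sketch of the default branch of the decision list above (paired chosen/rejected data); the dataset path and type are placeholders:

```yaml
# Sketch: default choice for paired preference data.
rl: dpo
datasets:
  - path: my_org/my_pref_data   # placeholder preference dataset
    type: chat_template.default
# Overfitting? Keep rl: dpo and switch the loss instead:
# dpo_loss_type: ["ipo"]
```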
|
|||||||
@@ -83,7 +83,7 @@ Watch for: loss never decreasing (check `train_on_inputs`, dataset, LR), loss go
|
|||||||
| Issue | Fix |
|
| Issue | Fix |
|
||||||
|-------|-----|
|
|-------|-----|
|
||||||
| OOM during training | Reduce `micro_batch_size`, enable `gradient_checkpointing`, reduce `sequence_len` |
|
| OOM during training | Reduce `micro_batch_size`, enable `gradient_checkpointing`, reduce `sequence_len` |
|
||||||
| `sample_packing` + SDPA + bf16 = 0.0 loss | Use `flash_attention: true` or disable `sample_packing` |
|
| `sample_packing` + SDPA + bf16 = 0.0 loss | Use `attn_implementation: flash_attention_2` or disable `sample_packing` |
|
||||||
| Missing chat template error | Set `chat_template: chatml` explicitly |
|
| Missing chat template error | Set `chat_template: chatml` explicitly |
|
||||||
| Label masking wrong | Run `axolotl preprocess config.yaml --debug` and inspect labels |
|
| Label masking wrong | Run `axolotl preprocess config.yaml --debug` and inspect labels |
|
||||||
| Loss NaN | Use `bf16: auto`, lower LR, check data for empty samples |
|
| Loss NaN | Use `bf16: auto`, lower LR, check data for empty samples |
|
||||||
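The OOM row above maps to three config knobs; a sketch with illustrative values:

```yaml
# Sketch of the OOM mitigations from the table above; values are illustrative.
micro_batch_size: 1             # reduce first
gradient_accumulation_steps: 8  # keep effective batch size roughly constant
gradient_checkpointing: true
sequence_len: 2048              # reduce if activations still do not fit
```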
|
|||||||
@@ -3,28 +3,71 @@ title: Attention
|
|||||||
description: Supported attention modules in Axolotl
|
description: Supported attention modules in Axolotl
|
||||||
---
|
---
|
||||||
|
|
||||||
## SDP Attention
|
Axolotl routes attention via a single config field:
|
||||||
|
|
||||||
This is the default built-in attention in PyTorch.
|
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
sdp_attention: true
|
attn_implementation: <backend>
|
||||||
```
|
```
|
||||||
|
|
||||||
For more details: [PyTorch docs](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
|
`attn_implementation` is passed through to `transformers` verbatim (via
|
||||||
|
`model.config._attn_implementation`). Accepted values are the HF-native
|
||||||
|
backends, axolotl-registered backends, or a hub-kernel path.
|
||||||
|
|
||||||
## Flash Attention
|
## Backends
|
||||||
|
|
||||||
Axolotl supports Flash Attention 2, 3, and 4. The best available version is used automatically
|
| `attn_implementation` | Description |
|
||||||
based on your installed packages and GPU.
|
|---|---|
|
||||||
|
| `eager` | Plain PyTorch attention. No packing support. |
|
||||||
|
| `sdpa` | PyTorch `scaled_dot_product_attention`. No packing support. |
|
||||||
|
| `flash_attention_2` | Dao-AILab Flash Attention 2. |
|
||||||
|
| `flash_attention_3` | Dao-AILab Flash Attention 3 (Hopper+). |
|
||||||
|
| `flex_attention` | Torch Flex Attention (requires torch ≥ 2.6). |
|
||||||
|
| `xformers` | xFormers memory-efficient attention. |
|
||||||
|
| `sage` | SageAttention (QK int8 / PV fp16). |
|
||||||
|
| `s2` | Shifted-Sparse Attention (LLaMA only, FA2 under the hood). |
|
||||||
|
| `fp8` | torchao FP8 low-precision attention (requires SM90+, torch ≥ 2.11). Loaded as SDPA and patched post-load. |
|
||||||
|
| `kernels-community/flash-attn3` | HF hub FA3 kernel. |
|
||||||
|
| `kernels-community/sage-attention` | HF hub SageAttention kernel. |
|
||||||
|
| Other `<org>/<name>` path | Any hub-kernel path supported by `transformers`. |
|
||||||
|
|
||||||
|
Short-form aliases (`flash`, `fa2`, `flex`, `sdp`, etc.) are **not accepted** —
|
||||||
|
set the canonical name above.
|
||||||
|
|
||||||
|
### Capability flags
|
||||||
|
|
||||||
|
Axolotl derives three boolean capability flags from `attn_implementation` and
|
||||||
|
exposes them on the validated config:
|
||||||
|
|
||||||
|
- `cfg.attn_supports_packing` — backend supports varlen sample packing via
|
||||||
|
`position_ids`. Gates multipack patches and `sample_packing_drop_attention_mask`.
|
||||||
|
- `cfg.attn_uses_flash_lib` — backend needs the `flash_attn` (Dao-AILab)
|
||||||
|
monkeypatches (FA4 auto, LLaMA flash hijack, ring-FA).
|
||||||
|
- `cfg.attn_needs_dtype_cast` — backend requires fp16/bf16 embeddings
|
||||||
|
(everything except `eager` and `sdpa`).
|
||||||
|
|
||||||
|
These are **computed** — they cannot be overridden from YAML.
|
||||||
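As an illustration of the packing flag, the combination below is a sketch, not a required pairing:

```yaml
# Sketch: sample packing is only honored when the chosen backend reports
# attn_supports_packing (e.g. flash_attention_2); with attn_implementation: sdpa
# the packing path is not available.
attn_implementation: flash_attention_2
sample_packing: true
```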
|
|
||||||
|
## Per-backend notes
|
||||||
|
|
||||||
|
### SDPA
|
||||||
|
|
||||||
|
Default PyTorch attention. See
|
||||||
|
[PyTorch docs](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html).
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
flash_attention: true
|
attn_implementation: sdpa
|
||||||
```
|
```
|
||||||
|
|
||||||
For more details: [Flash Attention](https://github.com/Dao-AILab/flash-attention/)
|
### Flash Attention
|
||||||
|
|
||||||
### Flash Attention 2
|
Axolotl supports FA2, FA3, and FA4. The best available version is used
|
||||||
|
automatically based on your installed packages and GPU.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
attn_implementation: flash_attention_2 # or flash_attention_3
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Flash Attention 2
|
||||||
|
|
||||||
Requirements: Ampere, Ada, or Hopper GPUs (Turing or lower not supported)
|
Requirements: Ampere, Ada, or Hopper GPUs (Turing or lower not supported)
|
||||||
|
|
||||||
@@ -39,23 +82,25 @@ Alternatively, try reinstall or downgrade a version.
|
|||||||
|
|
||||||
:::
|
:::
|
||||||
|
|
||||||
### Flash Attention 3
|
#### Flash Attention 3
|
||||||
|
|
||||||
Requirements: Hopper only and CUDA 12.8 (recommended)
|
Requirements: Hopper only and CUDA 12.8 (recommended)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/Dao-AILab/flash-attention.git
|
git clone https://github.com/Dao-AILab/flash-attention.git
|
||||||
cd flash-attention/hopper
|
cd flash-attention/hopper
|
||||||
|
|
||||||
python setup.py install
|
python setup.py install
|
||||||
```
|
```
|
||||||
|
|
||||||
### Flash Attention 4
|
#### Flash Attention 4
|
||||||
|
|
||||||
Requirements: Hopper or Blackwell GPUs
|
Requirements: Hopper or Blackwell GPUs. Auto-applied when `attn_uses_flash_lib`
|
||||||
|
is true and FA4 is importable.
|
||||||
|
|
||||||
|
FA4 is still a pre-release on PyPI, so `--pre` is required:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install flash-attn-4
|
pip install --pre flash-attn-4
|
||||||
```
|
```
|
||||||
|
|
||||||
Or from source:
|
Or from source:
|
||||||
@@ -63,7 +108,6 @@ Or from source:
|
|||||||
```bash
|
```bash
|
||||||
git clone https://github.com/Dao-AILab/flash-attention.git
|
git clone https://github.com/Dao-AILab/flash-attention.git
|
||||||
cd flash-attention/flash_attn/cute
|
cd flash-attention/flash_attn/cute
|
||||||
|
|
||||||
pip install -e .
|
pip install -e .
|
||||||
|
|
||||||
# FA2's flash_attn package includes a cute/ stub that shadows FA4.
|
# FA2's flash_attn package includes a cute/ stub that shadows FA4.
|
||||||
@@ -86,93 +130,113 @@ and falls back to FA2/3.
|
|||||||
|
|
||||||
:::
|
:::
|
||||||
|
|
||||||
For more details: [flash-attention/flash_attn/cute](https://github.com/Dao-AILab/flash-attention/tree/main/flash_attn/cute)
|
|
||||||
|
|
||||||
### AMD
|
### AMD
|
||||||
|
|
||||||
Requirements: ROCm 6.0 and above.
|
Requirements: ROCm 6.0 and above. See
|
||||||
|
[Flash Attention AMD docs](https://github.com/Dao-AILab/flash-attention/tree/main?tab=readme-ov-file#amd-rocm-support).
|
||||||
|
|
||||||
See [Flash Attention AMD docs](https://github.com/Dao-AILab/flash-attention/tree/main?tab=readme-ov-file#amd-rocm-support).
|
### Flex Attention
|
||||||
|
|
||||||
## Flex Attention
|
|
||||||
|
|
||||||
A flexible PyTorch API for attention used in combination with `torch.compile`.
|
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
flex_attention: true
|
attn_implementation: flex_attention
|
||||||
|
torch_compile: true # recommended
|
||||||
# recommended
|
|
||||||
torch_compile: true
|
|
||||||
```
|
```
|
||||||
|
|
||||||
::: {.callout-note}
|
Requires torch ≥ 2.6. See [PyTorch docs](https://pytorch.org/blog/flexattention/).
|
||||||
|
|
||||||
We recommend using latest stable version of PyTorch for best performance.
|
### SageAttention
|
||||||
|
|
||||||
:::
|
Requirements: Ampere, Ada, or Hopper GPUs.
|
||||||
|
|
||||||
For more details: [PyTorch docs](https://pytorch.org/blog/flexattention/)
|
|
||||||
|
|
||||||
## SageAttention
|
|
||||||
|
|
||||||
Attention kernels with QK Int8 and PV FP16 accumulator.
|
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
sage_attention: true
|
attn_implementation: sage
|
||||||
```
|
```
|
||||||
|
|
||||||
Requirements: Ampere, Ada, or Hopper GPUs
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install sageattention==2.2.0 --no-build-isolation
|
pip install sageattention==2.2.0 --no-build-isolation
|
||||||
```
|
```
|
||||||
|
|
||||||
::: {.callout-warning}
|
::: {.callout-warning}
|
||||||
|
|
||||||
Only LoRA/QLoRA recommended at the moment. We found loss drop to 0 for full finetuning. See [GitHub Issue](https://github.com/thu-ml/SageAttention/issues/198).
|
Only LoRA/QLoRA recommended. Full finetuning has been observed to drop loss to 0. See
|
||||||
|
[GitHub Issue](https://github.com/thu-ml/SageAttention/issues/198).
|
||||||
|
|
||||||
:::
|
:::
|
||||||
|
|
||||||
For more details: [Sage Attention](https://github.com/thu-ml/SageAttention)
|
For more details: [Sage Attention](https://github.com/thu-ml/SageAttention).
|
||||||
|
|
||||||
::: {.callout-note}
|
### xFormers
|
||||||
|
|
||||||
We do not support SageAttention 3 at the moment. If you are interested on adding this or improving SageAttention implementation, please make an Issue.
|
|
||||||
|
|
||||||
:::
|
|
||||||
|
|
||||||
|
|
||||||
## xFormers
|
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
xformers_attention: true
|
attn_implementation: xformers
|
||||||
```
|
```
|
||||||
|
|
||||||
::: {.callout-tip}
|
::: {.callout-tip}
|
||||||
|
|
||||||
We recommend using with Turing GPUs or below (such as on Colab).
|
Recommended for Turing GPUs or below (e.g. Colab T4).
|
||||||
|
|
||||||
:::
|
:::
|
||||||
|
|
||||||
For more details: [xFormers](https://github.com/facebookresearch/xformers)
|
### Shifted Sparse Attention
|
||||||
|
|
||||||
## Shifted Sparse Attention
|
|
||||||
|
|
||||||
::: {.callout-warning}
|
::: {.callout-warning}
|
||||||
|
|
||||||
We plan to deprecate this! If you use this feature, we recommend switching to methods above.
|
Planned for deprecation. Prefer one of the backends above.
|
||||||
|
|
||||||
:::
|
:::
|
||||||
|
|
||||||
Requirements: LLaMA model architecture
|
Requirements: LLaMA model architecture. Loaded as FA2 under the hood and
|
||||||
|
patched to implement shifted-sparse attention. Does not support sample packing.
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
flash_attention: true
|
attn_implementation: s2
|
||||||
s2_attention: true
|
|
||||||
```
|
```
|
||||||
|
|
||||||
::: {.callout-tip}
|
### FP8
|
||||||
|
|
||||||
No sample packing support!
|
torchao low-precision attention. Loaded as SDPA and patched post-load.
|
||||||
|
|
||||||
|
Requirements: SM90+ (Hopper/Blackwell), PyTorch ≥ 2.11, torchao ≥ 0.17,
|
||||||
|
flash-attn with FA3. KV caching must be disabled.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
attn_implementation: fp8
|
||||||
|
```
|
||||||
|
|
||||||
|
### Hub kernels
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
attn_implementation: kernels-community/flash-attn3
|
||||||
|
```
|
||||||
|
|
||||||
|
Passed through to `transformers`; axolotl does not install the kernel itself.
|
||||||
|
For recognized hub paths the capability flags are set automatically; for
|
||||||
|
arbitrary paths axolotl uses conservative defaults (`attn_supports_packing=False`,
|
||||||
|
`attn_uses_flash_lib=False`).
|
||||||
|
|
||||||
|
## Migrating from legacy boolean flags
|
||||||
|
|
||||||
|
The following legacy config fields are **deprecated** and will be removed in a
|
||||||
|
future release. Each emits a `DeprecationWarning` when set and is stripped from
|
||||||
|
the validated config.
|
||||||
|
|
||||||
|
| Legacy | Canonical |
|
||||||
|
|---|---|
|
||||||
|
| `flash_attention: true` | `attn_implementation: flash_attention_2` |
|
||||||
|
| `sdp_attention: true` | `attn_implementation: sdpa` |
|
||||||
|
| `xformers_attention: true` | `attn_implementation: xformers` |
|
||||||
|
| `flex_attention: true` | `attn_implementation: flex_attention` |
|
||||||
|
| `sage_attention: true` | `attn_implementation: sage` |
|
||||||
|
| `s2_attention: true` | `attn_implementation: s2` |
|
||||||
|
| `eager_attention: true` | `attn_implementation: eager` |
|
||||||
|
|
||||||
|
Combining `attn_implementation` with a legacy flag (e.g. `attn_implementation:
|
||||||
|
flash_attention_2` **and** `flash_attention: true`) raises an error; pick one.
|
||||||
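A before/after sketch of the migration, using the first row of the table:

```yaml
# Before (deprecated, emits DeprecationWarning):
# flash_attention: true

# After (canonical):
attn_implementation: flash_attention_2
```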
|
|
||||||
|
::: {.callout-note}
|
||||||
|
|
||||||
|
Existing example configs under `examples/` still use the legacy flags. They
|
||||||
|
continue to work with a deprecation warning; they will be migrated in a
|
||||||
|
follow-up pass.
|
||||||
|
|
||||||
:::
|
:::
|
||||||
|
|||||||
@@ -76,8 +76,10 @@ datasets:
|
|||||||
Make sure you have an [editable install](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) of Axolotl, which ensures that changes you make to the code are reflected at runtime. Run the following commands from the root of this project:
|
Make sure you have an [editable install](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) of Axolotl, which ensures that changes you make to the code are reflected at runtime. Run the following commands from the root of this project:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip3 install packaging
|
export UV_TORCH_BACKEND=cu128 # or cu130
|
||||||
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
|
uv venv --no-project --relocatable
|
||||||
|
source .venv/bin/activate
|
||||||
|
uv pip install --no-build-isolation -e '.[deepspeed]' --group dev --group test
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Remote Hosts
|
#### Remote Hosts
|
||||||
@@ -208,17 +210,18 @@ cd axolotl
|
|||||||
Next, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:[^2]
|
Next, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:[^2]
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-py3.10-cu118-2.0.1
|
docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl-uv:main-latest
|
||||||
```
|
```
|
||||||
|
|
||||||
>[!Tip]
|
>[!Tip]
|
||||||
> To understand which containers are available, see the [Docker section of the README](../README.md#docker) and the [DockerHub repo](https://hub.docker.com/r/axolotlai/axolotl/tags). For details of how the Docker containers are built, see axolotl's [Docker CI builds](../.github/workflows/main.yml).
|
> To understand which containers are available, see the [Docker section of the README](../README.md#docker) and the [DockerHub repo](https://hub.docker.com/r/axolotlai/axolotl/tags). For details of how the Docker containers are built, see axolotl's [Docker CI builds](../.github/workflows/main.yml).
|
||||||
|
|
||||||
You will now be in the container. Next, perform an editable install of Axolotl:
|
You will now be in the container. Next, install Axolotl with dev dependencies:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip3 install packaging
|
uv venv --no-project --relocatable
|
||||||
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
|
source .venv/bin/activate
|
||||||
|
uv pip install --no-build-isolation -e '.[deepspeed]' --group dev --group test
|
||||||
```
|
```
|
||||||
|
|
||||||
### Attach To Container
|
### Attach To Container
|
||||||
|
|||||||
@@ -6,23 +6,33 @@ format:
|
|||||||
toc-depth: 4
|
toc-depth: 4
|
||||||
---
|
---
|
||||||
|
|
||||||
This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).
|
This section describes the different Docker images that are released by AxolotlAI at
|
||||||
|
[Docker Hub](https://hub.docker.com/u/axolotlai).
|
||||||
|
|
||||||
::: {.callout-important}
|
::: {.callout-important}
|
||||||
For Blackwell GPUs, please use the tags with PyTorch 2.7.1 and CUDA 12.8.
|
### Switch to the `-uv` images
|
||||||
|
|
||||||
|
Each image below ships a **uv variant** that uses [uv](https://docs.astral.sh/uv/) with a relocatable venv
|
||||||
|
(`/workspace/axolotl-venv`) instead of Miniconda + pip. Append `-uv` to the image name
|
||||||
|
(e.g. `axolotlai/axolotl-uv`, `axolotlai/axolotl-base-uv`, `axolotlai/axolotl-cloud-uv`). Tags follow the
|
||||||
|
same format as their non-uv counterparts.
|
||||||
|
|
||||||
|
**We recommend switching to the `-uv` images early.** In the near future we will publish the uv-based
|
||||||
|
build to the non-uv tags as well. The non-uv names will continue to work, but they will start serving
|
||||||
|
the uv image.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
## Base
|
## Base
|
||||||
|
|
||||||
The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.
|
The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image.
|
||||||
|
It includes python, torch, git, git-lfs, awscli, pydantic, and more.
|
||||||
|
|
||||||
#### Image
|
#### Image
|
||||||
|
|
||||||
```
|
| Variant | Image | Docker Hub |
|
||||||
axolotlai/axolotl-base
|
|---------|-------|------------|
|
||||||
```
|
| pip | `axolotlai/axolotl-base` | [Link](https://hub.docker.com/r/axolotlai/axolotl-base) |
|
||||||
|
| uv | `axolotlai/axolotl-base-uv` | [Link](https://hub.docker.com/r/axolotlai/axolotl-base-uv) |
|
||||||
Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-base)
|
|
||||||
|
|
||||||
#### Tags format
|
#### Tags format
|
||||||
|
|
||||||
@@ -32,8 +42,10 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}
|
|||||||
|
|
||||||
Tags examples:
|
Tags examples:
|
||||||
|
|
||||||
- `main-base-py3.11-cu128-2.8.0`
|
|
||||||
- `main-base-py3.11-cu128-2.9.1`
|
- `main-base-py3.11-cu128-2.9.1`
|
||||||
|
- `main-base-py3.12-cu128-2.10.0`
|
||||||
|
- `main-base-py3.12-cu130-2.9.1`
|
||||||
|
- `main-base-py3.12-cu130-2.10.0`
|
||||||
|
|
||||||
## Main
|
## Main
|
||||||
|
|
||||||
@@ -41,11 +53,10 @@ The main image is the image that is used to run Axolotl. It is based on the `axo
|
|||||||
|
|
||||||
#### Image
|
#### Image
|
||||||
|
|
||||||
```
|
| Variant | Image | Docker Hub |
|
||||||
axolotlai/axolotl
|
|---------|-------|------------|
|
||||||
```
|
| pip | `axolotlai/axolotl` | [Link](https://hub.docker.com/r/axolotlai/axolotl) |
|
||||||
|
| uv | `axolotlai/axolotl-uv` | [Link](https://hub.docker.com/r/axolotlai/axolotl-uv) |
|
||||||
Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl)
|
|
||||||
|
|
||||||
#### Tags format {#sec-main-tags}
|
#### Tags format {#sec-main-tags}
|
||||||
|
|
||||||
@@ -53,7 +64,7 @@ Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl)
|
|||||||
# on push to main
|
# on push to main
|
||||||
main-py{python_version}-cu{cuda_version}-{pytorch_version}
|
main-py{python_version}-cu{cuda_version}-{pytorch_version}
|
||||||
|
|
||||||
# latest main (currently torch 2.6.0, python 3.11, cuda 12.4)
|
# latest main (currently torch 2.9.1, python 3.11, cuda 12.8)
|
||||||
main-latest
|
main-latest
|
||||||
|
|
||||||
# nightly build
|
# nightly build
|
||||||
@@ -71,12 +82,13 @@ There may be some extra tags appended to the image, like `-vllm` which installs
|
|||||||
|
|
||||||
Tags examples:
|
Tags examples:
|
||||||
|
|
||||||
- `main-py3.11-cu128-2.8.0`
|
|
||||||
- `main-py3.11-cu128-2.9.1`
|
- `main-py3.11-cu128-2.9.1`
|
||||||
|
- `main-py3.12-cu128-2.10.0`
|
||||||
|
- `main-py3.12-cu130-2.9.1`
|
||||||
|
- `main-py3.12-cu130-2.10.0`
|
||||||
- `main-latest`
|
- `main-latest`
|
||||||
- `main-20250303-py3.11-cu124-2.6.0`
|
- `main-20260315-py3.11-cu128-2.9.1`
|
||||||
- `main-20250303-py3.11-cu126-2.6.0`
|
- `0.16.1`
|
||||||
- `0.12.0`
|
|
||||||
|
|
||||||
## Cloud
|
## Cloud
|
||||||
|
|
||||||
@@ -90,11 +102,10 @@ Jupyter lab is run by default. Set `JUPYTER_DISABLE=1` in the environment variab
|
|||||||
|
|
||||||
#### Image
|
#### Image
|
||||||
|
|
||||||
```
|
| Variant | Image | Docker Hub |
|
||||||
axolotlai/axolotl-cloud
|
|---------|-------|------------|
|
||||||
```
|
| pip | `axolotlai/axolotl-cloud` | [Link](https://hub.docker.com/r/axolotlai/axolotl-cloud) |
|
||||||
|
| uv | `axolotlai/axolotl-cloud-uv` | [Link](https://hub.docker.com/r/axolotlai/axolotl-cloud-uv) |
|
||||||
Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-cloud)
|
|
||||||
|
|
||||||
#### Tags format
|
#### Tags format
|
||||||
|
|
||||||
|
|||||||
@@ -129,7 +129,7 @@ gradient_accumulation_steps: 4
|
|||||||
max_steps: 20
|
max_steps: 20
|
||||||
learning_rate: 5.0e-6
|
learning_rate: 5.0e-6
|
||||||
bf16: auto
|
bf16: auto
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
output_dir: ./outputs/ebft-quickstart
|
output_dir: ./outputs/ebft-quickstart
|
||||||
```
|
```
|
||||||
@@ -304,7 +304,7 @@ lora_alpha: 32
|
|||||||
lora_target_linear: true
|
lora_target_linear: true
|
||||||
|
|
||||||
bf16: auto
|
bf16: auto
|
||||||
flex_attention: true
|
attn_implementation: flex_attention
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: true # Required with flex_attention
|
use_reentrant: true # Required with flex_attention
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ description: Frequently asked questions
|
|||||||
|
|
||||||
**Q: vLLM is not working with Axolotl**
|
**Q: vLLM is not working with Axolotl**
|
||||||
|
|
||||||
> A: We currently recommend torch 2.6.0 for use with `vllm`. Please ensure you use the right version. For Docker, please use the `main-py3.11-cu124-2.6.0` tag.
|
> A: We currently recommend torch 2.10 for use with `vllm`. Please ensure you use the right version. For Docker, please use the `main-py3.12-cu128-2.10.0` tag (note: torch 2.10 images are built with Python 3.12).
|
||||||
|
|
||||||
**Q: FA2 2.8.0 `undefined symbol` runtime error on CUDA 12.4**
|
**Q: FA2 2.8.0 `undefined symbol` runtime error on CUDA 12.4**
|
||||||
|
|
||||||
|
|||||||
@@ -154,7 +154,7 @@ lr_scheduler: cosine
|
|||||||
warmup_steps: 10
|
warmup_steps: 10
|
||||||
|
|
||||||
bf16: true
|
bf16: true
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|||||||
@@ -15,64 +15,30 @@ This guide covers all the ways you can install and set up Axolotl for your envir
|
|||||||
|
|
||||||
- NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
|
- NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
|
||||||
- Python ≥3.11
|
- Python ≥3.11
|
||||||
- PyTorch ≥2.6.0
|
- PyTorch ≥2.9.1
|
||||||
|
|
||||||
## Installation Methods {#sec-installation-methods}
|
## Installation {#sec-installation}
|
||||||
|
|
||||||
::: {.callout-important}
|
|
||||||
Please make sure to have Pytorch installed before installing Axolotl in your local environment.
|
|
||||||
|
|
||||||
Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
|
|
||||||
:::
|
|
||||||
|
|
||||||
::: {.callout-important}
|
::: {.callout-important}
|
||||||
For Blackwell GPUs, please use Pytorch 2.9.1 and CUDA 12.8.
|
For Blackwell GPUs, please use Pytorch 2.9.1 and CUDA 12.8.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
### PyPI Installation (Recommended) {#sec-pypi}
|
### Quick Install {#sec-uv}
|
||||||
|
|
||||||
```{.bash}
|
Axolotl uses [uv](https://docs.astral.sh/uv/) as its package manager. uv is a fast, reliable Python package installer and resolver built in Rust.
|
||||||
pip3 install -U packaging setuptools wheel ninja
|
|
||||||
pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
|
|
||||||
```
|
|
||||||
|
|
||||||
We use `--no-build-isolation` in order to detect the installed PyTorch version (if
|
Install uv if not already installed:
|
||||||
installed) in order not to clobber it, and so that we set the correct version of
|
|
||||||
dependencies that are specific to the PyTorch version or other installed
|
|
||||||
co-dependencies.
|
|
||||||
|
|
||||||
### uv Installation {#sec-uv}
|
|
||||||
|
|
||||||
uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments.
|
|
||||||
|
|
||||||
Install uv if not already installed
|
|
||||||
```{.bash}
|
```{.bash}
|
||||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||||
source $HOME/.local/bin/env
|
source $HOME/.local/bin/env
|
||||||
```
|
```
|
||||||
|
|
||||||
Choose your CUDA version to use with PyTorch; e.g. `cu124`, `cu126`, `cu128`,
|
Choose your CUDA version (e.g. `cu128`, `cu130`), create a venv, and install:
|
||||||
then create the venv and activate
|
|
||||||
```{.bash}
|
```{.bash}
|
||||||
export UV_TORCH_BACKEND=cu126
|
export UV_TORCH_BACKEND=cu128 # or cu130
|
||||||
uv venv --no-project --relocatable
|
uv venv
|
||||||
source .venv/bin/activate
|
source .venv/bin/activate
|
||||||
```
|
uv pip install --no-build-isolation axolotl[deepspeed]
|
||||||
|
|
||||||
Install PyTorch
|
|
||||||
- PyTorch 2.6.0 recommended
|
|
||||||
```{.bash}
|
|
||||||
uv pip install packaging setuptools wheel
|
|
||||||
uv pip install torch==2.6.0
|
|
||||||
uv pip install awscli pydantic
|
|
||||||
```
|
|
||||||
|
|
||||||
Install axolotl from PyPi
|
|
||||||
```{.bash}
|
|
||||||
uv pip install --no-build-isolation axolotl[deepspeed,flash-attn]
|
|
||||||
|
|
||||||
# optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO
|
|
||||||
uv pip install --no-build-isolation axolotl[deepspeed,flash-attn,vllm]
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Edge/Development Build {#sec-edge-build}
|
### Edge/Development Build {#sec-edge-build}
|
||||||
@@ -82,14 +48,16 @@ For the latest features between releases:
|
|||||||
```{.bash}
|
```{.bash}
|
||||||
git clone https://github.com/axolotl-ai-cloud/axolotl.git
|
git clone https://github.com/axolotl-ai-cloud/axolotl.git
|
||||||
cd axolotl
|
cd axolotl
|
||||||
pip3 install -U packaging setuptools wheel ninja
|
export UV_TORCH_BACKEND=cu128 # or cu130
|
||||||
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
|
uv venv
|
||||||
|
source .venv/bin/activate
|
||||||
|
uv pip install --no-build-isolation -e '.[deepspeed]'
|
||||||
```
|
```
|
||||||
|
|
||||||
### Docker {#sec-docker}
|
### Docker {#sec-docker}
|
||||||
|
|
||||||
```{.bash}
|
```{.bash}
|
||||||
docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
|
docker run --gpus '"all"' --rm -it --ipc=host axolotlai/axolotl-uv:main-latest
|
||||||
```
|
```
|
||||||
|
|
||||||
For development with Docker:
|
For development with Docker:
|
||||||
@@ -106,12 +74,12 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
|
|||||||
--ulimit memlock=-1 --ulimit stack=67108864 \
|
--ulimit memlock=-1 --ulimit stack=67108864 \
|
||||||
--mount type=bind,src="${PWD}",target=/workspace/axolotl \
|
--mount type=bind,src="${PWD}",target=/workspace/axolotl \
|
||||||
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
|
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
|
||||||
axolotlai/axolotl:main-latest
|
axolotlai/axolotl-uv:main-latest
|
||||||
```
|
```
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.callout-important}
|
::: {.callout-important}
|
||||||
For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.9.1` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.9.1`.
|
For Blackwell GPUs, please use `axolotlai/axolotl-uv:main-py3.11-cu128-2.9.1` or the cloud variant `axolotlai/axolotl-cloud-uv:main-py3.11-cu128-2.9.1`.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.
|
Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.
|
||||||
@@ -122,7 +90,7 @@ Please refer to the [Docker documentation](docker.qmd) for more information on t
|
|||||||
|
|
||||||
For providers supporting Docker:
|
For providers supporting Docker:
|
||||||
|
|
||||||
- Use `axolotlai/axolotl-cloud:main-latest`
|
- Use `axolotlai/axolotl-cloud-uv:main-latest`
|
||||||
- Available on:
|
- Available on:
|
||||||
- [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
|
- [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
|
||||||
- [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=axolotl&utm_medium=partner&utm_campaign=template_launch_july2025&utm_content=docs_link)
|
- [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=axolotl&utm_medium=partner&utm_campaign=template_launch_july2025&utm_content=docs_link)
|
||||||
@@ -141,7 +109,7 @@ For providers supporting Docker:
|
|||||||
### macOS {#sec-macos}
|
### macOS {#sec-macos}
|
||||||
|
|
||||||
```{.bash}
|
```{.bash}
|
||||||
pip3 install --no-build-isolation -e '.'
|
uv pip install --no-build-isolation -e '.'
|
||||||
```
|
```
|
||||||
|
|
||||||
See @sec-troubleshooting for Mac-specific issues.
|
See @sec-troubleshooting for Mac-specific issues.
|
||||||
@@ -152,21 +120,44 @@ See @sec-troubleshooting for Mac-specific issues.
|
|||||||
We recommend using WSL2 (Windows Subsystem for Linux) or Docker.
|
We recommend using WSL2 (Windows Subsystem for Linux) or Docker.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
## Environment Managers {#sec-env-managers}
|
## Migrating from pip to uv {#sec-migrating}
|
||||||
|
|
||||||
### Conda/Pip venv {#sec-conda}
|
If you have an existing pip-based Axolotl installation, you can migrate to uv:
|
||||||
|
|
||||||
1. Install Python ≥3.11
|
```{.bash}
|
||||||
2. Install PyTorch: https://pytorch.org/get-started/locally/
|
# Install uv
|
||||||
3. Install Axolotl:
|
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||||
```{.bash}
|
source $HOME/.local/bin/env
|
||||||
pip3 install -U packaging setuptools wheel ninja
|
|
||||||
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
|
# Create a fresh venv (recommended for a clean start)
|
||||||
```
|
export UV_TORCH_BACKEND=cu128 # or cu130
|
||||||
4. (Optional) Login to Hugging Face:
|
uv venv
|
||||||
```{.bash}
|
source .venv/bin/activate
|
||||||
hf auth login
|
|
||||||
```
|
# Reinstall axolotl
|
||||||
|
uv pip install --no-build-isolation axolotl[deepspeed]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Using pip (Alternative) {#sec-pip}
|
||||||
|
|
||||||
|
If you are unable to install uv, you can still use pip directly.
|
||||||
|
|
||||||
|
::: {.callout-important}
|
||||||
|
Please make sure to have PyTorch installed before installing Axolotl with pip.
|
||||||
|
|
||||||
|
Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
|
||||||
|
:::
|
||||||
|
|
||||||
|
```{.bash}
|
||||||
|
pip3 install -U packaging setuptools wheel ninja
|
||||||
|
pip3 install --no-build-isolation axolotl[deepspeed]
|
||||||
|
```
|
||||||
|
|
||||||
|
For editable/development installs:
|
||||||
|
```{.bash}
|
||||||
|
pip3 install -U packaging setuptools wheel ninja
|
||||||
|
pip3 install --no-build-isolation -e '.[deepspeed]'
|
||||||
|
```
|
||||||
|
|
||||||
## Troubleshooting {#sec-troubleshooting}
|
## Troubleshooting {#sec-troubleshooting}
|
||||||
|
|
||||||
|
|||||||
84
docs/multimodal_assistant_mask.md
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
# Multimodal assistant-only loss masking
|
||||||
|
|
||||||
|
## Correct placement
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Top-level: only train_on_inputs lives here.
|
||||||
|
train_on_inputs: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: data/train.jsonl
|
||||||
|
type: chat_template
|
||||||
|
roles_to_train: # per-dataset — this is what the MM scanner reads
|
||||||
|
- assistant
|
||||||
|
train_on_eos: turn # per-dataset — same
|
||||||
|
|
||||||
|
test_datasets:
|
||||||
|
- path: data/val.jsonl
|
||||||
|
type: chat_template
|
||||||
|
split: train
|
||||||
|
roles_to_train:
|
||||||
|
- assistant
|
||||||
|
train_on_eos: turn
|
||||||
|
```
|
||||||
|
|
||||||
|
## How to verify at runtime
|
||||||
|
|
||||||
|
`build_collator` logs the resolved knobs at INFO:
|
||||||
|
|
||||||
|
```text
|
||||||
|
MM collator: train_on_inputs=False roles_to_train=['assistant'] train_on_eos=turn role_boundaries_override=none
|
||||||
|
```
|
||||||
|
|
||||||
|
If `roles_to_train` logs as `None`, the YAML knobs are not reaching the
|
||||||
|
scanner — check that they are under `datasets[0]`, not at the root.
|
||||||
|
|
||||||
|
Each verified strategy additionally logs its resolved boundary token ids at
|
||||||
|
strategy init (e.g. `<|turn>model` → `[105, 4368]`, `<turn|>` → `[106]` for
|
||||||
|
Gemma 4). If a strategy emits the "has no built-in role boundaries ... only
|
||||||
|
pad and media tokens are masked" one-shot warning instead, it is on the
|
||||||
|
fallback path — declare per-role markers in YAML via `cfg.role_boundaries`
|
||||||
|
(below) to activate masking. The strategies currently on this path are
|
||||||
|
listed in the audit table above under `fallback + warn`.
|
||||||
|
|
||||||
|
## Config-based override: `cfg.role_boundaries`
|
||||||
|
|
||||||
|
For the "unverified" strategies above, or for custom chat templates that
|
||||||
|
don't match a built-in strategy's markers, users can declare role boundaries
|
||||||
|
directly in YAML without subclassing:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
role_boundaries:
|
||||||
|
- role: assistant
|
||||||
|
start: "<|turn>model"
|
||||||
|
end: "<turn|>"
|
||||||
|
- role: user
|
||||||
|
start: "<|turn>user"
|
||||||
|
end: "<turn|>"
|
||||||
|
# Optional keys:
|
||||||
|
# include_start: false # default False
|
||||||
|
# include_end: true # default True, respects cfg.train_on_eos
|
||||||
|
# end: eos_token # sentinel: resolves to tokenizer.eos_token_id
|
||||||
|
# end: null # span runs to end of sequence
|
||||||
|
```
|
||||||
|
|
||||||
|
Semantics:
|
||||||
|
|
||||||
|
- `start` and `end` are literal strings; axolotl encodes them at strategy
|
||||||
|
init via `tokenizer.encode(..., add_special_tokens=False)` and logs the
|
||||||
|
resolved token-id sequences at INFO level.
|
||||||
|
- The special value `end: eos_token` is the portable way to express
|
||||||
|
"Pixtral-style assistant turns end at EOS" without hard-coding an id.
|
||||||
|
- `role_boundaries` is an **opt-in override**. A non-empty list **replaces**
|
||||||
|
the strategy's built-in declarations wholesale (partial overlays are
|
||||||
|
intentionally unsupported — they're hard to reason about at review time).
|
||||||
|
Leaving the field unset *or* setting it to an empty list (`[]`) both mean
|
||||||
|
"use the strategy's built-ins." Writing `role_boundaries: []` is almost
|
||||||
|
always a typo or leftover — honoring it literally would produce all-masked
|
||||||
|
labels and zero gradient, so it is treated the same as unset.
|
||||||
|
- `cfg.roles_to_train` still governs which declared roles contribute to
|
||||||
|
loss. You can declare `user` and `assistant` boundaries and set
|
||||||
|
`roles_to_train: ["assistant"]` to have the scanner correctly identify
|
||||||
|
user spans as masking boundaries without training on their content.
|
||||||
|
- Invalid specs fail loudly at strategy init (missing `role`/`start`,
|
||||||
|
unencodable markers), not silently at loss-compute time.
|
||||||
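Putting the pieces together, a hedged sketch combining per-dataset `roles_to_train` with a `role_boundaries` override; the marker strings reuse the Gemma-style examples above and are assumptions for other templates:

```yaml
# Sketch: declare both roles so the scanner sees span boundaries,
# but train only on assistant content. Substitute your template's markers.
train_on_inputs: false

datasets:
  - path: data/train.jsonl
    type: chat_template
    roles_to_train:
      - assistant
    train_on_eos: turn

role_boundaries:
  - role: assistant
    start: "<|turn>model"
    end: "<turn|>"
  - role: user
    start: "<|turn>user"
    end: "<turn|>"
```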
@@ -22,12 +22,12 @@ Improves GPU utilization by combining multiple short sequences into a single pac
|
|||||||
|
|
||||||
Using an optimized attention implementation is critical for training speed.
|
Using an optimized attention implementation is critical for training speed.
|
||||||
|
|
||||||
- **[Flash Attention 2](https://github.com/Dao-AILab/flash-attention)**: `flash_attention: true`. **(Recommended)** The industry standard for fast attention on modern GPUs. Requires Ampere or higher. For AMD, check [AMD Support](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#amd-rocm-support).
|
- **[Flash Attention 2](https://github.com/Dao-AILab/flash-attention)**: `attn_implementation: flash_attention_2`. **(Recommended)** The industry standard for fast attention on modern GPUs. Requires Ampere or higher. For AMD, check [AMD Support](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#amd-rocm-support).
|
||||||
- **[Flex Attention](https://pytorch.org/blog/flexattention/)**: `flex_attention: true`.
|
- **[Flex Attention](https://pytorch.org/blog/flexattention/)**: `attn_implementation: flex_attention`.
|
||||||
- **[SDP Attention](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)**: `sdp_attention: true`. PyTorch's native implementation.
|
- **[SDP Attention](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)**: `attn_implementation: sdpa`. PyTorch's native implementation.
|
||||||
- **[Xformers](https://github.com/facebookresearch/xformers)**: `xformers_attention: true`. Works with FP16.
|
- **[Xformers](https://github.com/facebookresearch/xformers)**: `attn_implementation: xformers`. Works with FP16.
|
||||||
|
|
||||||
*Note: You should only enable one attention backend.*
|
See [Attention](attention.qmd) for the full list of backends and the canonical values.
|
||||||
|
|
||||||
### LoRA Optimizations
|
### LoRA Optimizations
|
||||||
|
|
||||||
|
|||||||
@@ -320,8 +320,10 @@ The input format is a simple JSON input with customizable fields based on the ab
|
|||||||
As IPO is just DPO with a different loss function, all supported dataset formats for [DPO](#dpo) are also supported for IPO.
|
As IPO is just DPO with a different loss function, all supported dataset formats for [DPO](#dpo) are also supported for IPO.
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
rl: ipo
|
rl: dpo
|
||||||
|
dpo_loss_type: ["ipo"]
|
||||||
```
|
```
|
||||||
|
*Note:* Passing `rl: ipo` directly is still supported, but will soon be deprecated.
|
||||||
|
|
||||||
### ORPO
|
### ORPO
|
||||||
|
|
||||||
@@ -1145,8 +1147,7 @@ datasets:
|
|||||||
type: ebft_strided_structured.transform
|
type: ebft_strided_structured.transform
|
||||||
split: train[:1%]
|
split: train[:1%]
|
||||||
|
|
||||||
flash_attention: false
|
attn_implementation: flex_attention # Strided mode uses flex_attention
|
||||||
flex_attention: true # Strided mode uses flex_attention
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
gradient_checkpointing_kwargs:
|
gradient_checkpointing_kwargs:
|
||||||
use_reentrant: true # Required for flex_attention
|
use_reentrant: true # Required for flex_attention
|
||||||
|
|||||||
@@ -20,6 +20,8 @@ examples:
|
|||||||
title: Arcee AFM
|
title: Arcee AFM
|
||||||
|
|
||||||
# MistralAI
|
# MistralAI
|
||||||
|
- name: mistral-medium-3_5
|
||||||
|
title: Mistral Medium 3.5
|
||||||
- name: ministral3/think
|
- name: ministral3/think
|
||||||
title: Ministral 3 Thinking
|
title: Ministral 3 Thinking
|
||||||
- name: ministral3/vision
|
- name: ministral3/vision
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ To use sequence parallelism, you need:
|
|||||||
|
|
||||||
## Limitations
|
## Limitations
|
||||||
|
|
||||||
- Flash attention must be enabled for this to work (`flash_attention: true` in config YAML)
|
- Flash attention must be enabled for this to work (`attn_implementation: flash_attention_2` in config YAML)
|
||||||
- May have a small performance overhead due to communication between GPUs
|
- May have a small performance overhead due to communication between GPUs
|
||||||
|
|
||||||
## Example
|
## Example
|
||||||
|
|||||||
@@ -245,7 +245,7 @@ For GRPO, also reduce `max_completion_length`. Memory scales quadratically with
|
|||||||
Reduces attention memory from O(n^2) to O(n):
|
Reduces attention memory from O(n^2) to O(n):
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
```
|
```
|
||||||
|
|
||||||
### Step 6: Offload with DeepSpeed
|
### Step 6: Offload with DeepSpeed
|
||||||
|
|||||||
@@ -1,53 +0,0 @@
|
|||||||
---
|
|
||||||
title: "Unsloth"
|
|
||||||
description: "Hyper-optimized QLoRA finetuning for single GPUs"
|
|
||||||
---
|
|
||||||
|
|
||||||
### Overview
|
|
||||||
|
|
||||||
Unsloth provides hand-written optimized kernels for LLM finetuning that slightly improve speed and VRAM over
|
|
||||||
standard industry baselines.
|
|
||||||
|
|
||||||
::: {.callout-important}
|
|
||||||
Due to breaking changes in transformers `v4.48.0`, users will need to downgrade to `<=v4.47.1` to use this patch.
|
|
||||||
|
|
||||||
This will later be deprecated in favor of [LoRA Optimizations](lora_optims.qmd).
|
|
||||||
:::
|
|
||||||
|
|
||||||
|
|
||||||
### Installation
|
|
||||||
|
|
||||||
The following will install the correct unsloth and extras from source.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python scripts/unsloth_install.py | sh
|
|
||||||
```
|
|
||||||
|
|
||||||
### Usage
|
|
||||||
|
|
||||||
Axolotl exposes a few configuration options to try out unsloth and get most of the performance gains.
|
|
||||||
|
|
||||||
Our unsloth integration is currently limited to the following model architectures:
|
|
||||||
- llama
|
|
||||||
|
|
||||||
These options are specific to LoRA finetuning and cannot be used for multi-GPU finetuning
|
|
||||||
```yaml
|
|
||||||
unsloth_lora_mlp: true
|
|
||||||
unsloth_lora_qkv: true
|
|
||||||
unsloth_lora_o: true
|
|
||||||
```
|
|
||||||
|
|
||||||
These options are composable and can be used with multi-gpu finetuning
|
|
||||||
```yaml
|
|
||||||
unsloth_cross_entropy_loss: true
|
|
||||||
unsloth_rms_norm: true
|
|
||||||
unsloth_rope: true
|
|
||||||
```
|
|
||||||
|
|
||||||
### Limitations
|
|
||||||
|
|
||||||
- Single GPU only; e.g. no multi-gpu support
|
|
||||||
- No deepspeed or FSDP support (requires multi-gpu)
|
|
||||||
- LoRA + QLoRA support only. No full fine tunes or fp8 support.
|
|
||||||
- Limited model architecture support. Llama, Phi, Gemma, Mistral only
|
|
||||||
- No MoE support.
|
|
||||||
@@ -15,8 +15,7 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r
|
|||||||
Here is an example of how to install from pip:
|
Here is an example of how to install from pip:
|
||||||
```bash
|
```bash
|
||||||
# Ensure you have a compatible version of Pytorch installed
|
# Ensure you have a compatible version of Pytorch installed
|
||||||
pip3 install packaging setuptools wheel ninja
|
uv pip install --no-build-isolation 'axolotl>=0.16.1'
|
||||||
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Run one of the finetuning examples below.
|
2. Run one of the finetuning examples below.
|
||||||
@@ -35,7 +34,7 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r
|
|||||||
|
|
||||||
**LFM2-MoE**
|
**LFM2-MoE**
|
||||||
```bash
|
```bash
|
||||||
pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
|
uv pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
|
||||||
|
|
||||||
# LoRA SFT (1x48GB @ 16.2GiB)
|
# LoRA SFT (1x48GB @ 16.2GiB)
|
||||||
axolotl train examples/LiquidAI/lfm2-8b-a1b-lora.yaml
|
axolotl train examples/LiquidAI/lfm2-8b-a1b-lora.yaml
|
||||||
@@ -45,7 +44,7 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r
|
|||||||
|
|
||||||
- **Installation Error**: If you encounter `ImportError: ... undefined symbol ...` or `ModuleNotFoundError: No module named 'causal_conv1d_cuda'`, the `causal-conv1d` package may have been installed incorrectly. Try uninstalling it:
|
- **Installation Error**: If you encounter `ImportError: ... undefined symbol ...` or `ModuleNotFoundError: No module named 'causal_conv1d_cuda'`, the `causal-conv1d` package may have been installed incorrectly. Try uninstalling it:
|
||||||
```bash
|
```bash
|
||||||
pip uninstall -y causal-conv1d
|
uv pip uninstall causal-conv1d
|
||||||
```
|
```
|
||||||
|
|
||||||
- **Dataset Loading**: Read more on how to load your own dataset in our [documentation](https://docs.axolotl.ai/docs/dataset_loading.html).
|
- **Dataset Loading**: Read more on how to load your own dataset in our [documentation](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ tf32: true
|
|||||||
gradient_checkpointing: false
|
gradient_checkpointing: false
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 2
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ tf32: true
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 2
|
||||||
|
|||||||
@@ -50,8 +50,7 @@ tf32: true
|
|||||||
|
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
eager_attention:
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ activation_offloading: legacy
|
|||||||
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_steps: 100
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ activation_offloading: legacy
|
|||||||
|
|
||||||
resume_from_checkpoint:
|
resume_from_checkpoint:
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
attn_implementation: flash_attention_2
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_steps: 100
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|||||||
@@ -11,12 +11,11 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
 Here is an example of how to install from main for pip:

 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
+# Ensure you have Pytorch installed (Pytorch 2.9.1 min)
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn]'
+uv pip install --no-build-isolation -e '.'

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
@@ -31,7 +30,7 @@ python scripts/cutcrossentropy_install.py | sh
 # For those using our Docker image, use the below path.
 export CUDA_HOME=/usr/local/cuda

-pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
+uv pip install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
 ```

 For any installation errors, see [XIELU Installation Issues](#xielu-installation-issues)
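The `CUDA_HOME` export above is the Docker-image path. Outside the image, `CUDA_HOME` has to point at whatever CUDA toolkit the machine actually has; a hedged sketch for deriving it from `nvcc` when the toolkit is already on `PATH`:

```bash
# Assumption: nvcc is on PATH; its parent-of-parent directory is the toolkit root.
export CUDA_HOME="$(dirname "$(dirname "$(which nvcc)")")"
echo "CUDA_HOME=$CUDA_HOME"
```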
@@ -67,7 +66,7 @@ If those didn't help, please try the below solutions:
 1. Pass env for CMAKE and try install again:

 ```bash
-Python_EXECUTABLE=$(which python) pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
+Python_EXECUTABLE=$(which python) uv pip install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
 ```

 2. Git clone the repo and manually hardcode python path:
@@ -92,7 +91,7 @@ If those didn't help, please try the below solutions:
 ```

 ```bash
-pip3 install . --no-build-isolation --no-deps
+uv pip install . --no-build-isolation --no-deps
 ```

 ## Optimization Guides
@@ -55,7 +55,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
@@ -13,12 +13,11 @@ Thanks to the team at Arcee.ai for using Axolotl in supervised fine-tuning the A
 Here is an example of how to install from main for pip:

 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
+# Ensure you have Pytorch installed (Pytorch 2.9.1 min)
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn]'
+uv pip install --no-build-isolation -e '.'

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
@@ -55,7 +55,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
@@ -59,8 +59,7 @@ gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1

-flash_attention: true
-sdp_attention:
+attn_implementation: flash_attention_2
 flash_optimum:

 gptq_groupsize:
@@ -39,8 +39,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-xformers_attention: true
-flash_attention:
+attn_implementation: xformers
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
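The same consolidation covers the xformers path: the paired `xformers_attention` / `flash_attention` flags collapse into the single key. A minimal sketch with the values used in these hunks:

```yaml
# old style
xformers_attention: true
flash_attention:

# new style
attn_implementation: xformers
```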
@@ -45,7 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
@@ -46,7 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
@@ -45,7 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
@@ -46,7 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
@@ -45,7 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
@@ -46,7 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
@@ -52,7 +52,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch:
@@ -55,7 +55,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch:
@@ -39,7 +39,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch:
@@ -45,7 +45,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
@@ -43,8 +43,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-xformers_attention: true
-flash_attention:
+attn_implementation: xformers
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
@@ -73,8 +73,7 @@ early_stopping_patience: 3
 resume_from_checkpoint:
 auto_resume_from_checkpoints: true
 logging_steps: 1
-xformers_attention: true
-flash_attention:
+attn_implementation: xformers
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
@@ -40,8 +40,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-xformers_attention: true
-flash_attention:
+attn_implementation: xformers
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
@@ -36,8 +36,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-xformers_attention: true
-flash_attention:
+attn_implementation: xformers
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
@@ -37,8 +37,7 @@ bf16: auto
 tf32: true
 resume_from_checkpoint:
 logging_steps: 5
-xformers_attention: true
-flash_attention:
+attn_implementation: xformers
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
@@ -39,7 +39,6 @@ bf16: auto
 tf32: true
 resume_from_checkpoint:
 logging_steps: 5
-flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
@@ -39,7 +39,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
@@ -40,7 +40,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
@@ -47,7 +47,6 @@ tf32: false
 gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 4
@@ -47,7 +47,6 @@ tf32: false
 gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 4
@@ -43,7 +43,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
@@ -46,7 +46,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
@@ -40,7 +40,6 @@ bf16: auto
 tf32: true
 resume_from_checkpoint:
 logging_steps: 5
-flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
@@ -38,7 +38,6 @@ tf32: true
 gradient_checkpointing:
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
@@ -44,7 +44,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 flash_attn_fuse_mlp: true
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true

@@ -46,7 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
@@ -47,7 +47,6 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: false

 warmup_ratio: 0.1
 evals_per_epoch: 0
@@ -45,7 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
@@ -36,7 +36,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch:
@@ -47,7 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 4
@@ -71,8 +71,7 @@ early_stopping_patience: 3
 resume_from_checkpoint:
 auto_resume_from_checkpoints: true
 logging_steps: 1
-xformers_attention: true
-flash_attention:
+attn_implementation: xformers
 gptq_groupsize:
 gptq_model_v1:
 warmup_ratio: 0.1
@@ -10,7 +10,7 @@ load_in_4bit: true
 sequence_len: 1024
 bf16: auto
 tf32: false
-flash_attention: true
+attn_implementation: flash_attention_2
 special_tokens:
   bos_token: "<|startoftext|>"
   eos_token: "<|endoftext|>"
@@ -48,7 +48,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch:
@@ -36,12 +36,7 @@
 "id": "msOCO4NRmRLa"
 },
 "outputs": [],
-"source": [
-"%%capture\n",
-"# This step can take ~5-10 minutes to install dependencies\n",
-"!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
-"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@fec1a88\""
-]
+"source": "%%capture\n# This step can take ~5-10 minutes to install dependencies\n!pip install --no-build-isolation \"axolotl>=0.16.1\"\n!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@fec1a88\""
 },
 {
 "cell_type": "markdown",
@@ -45,7 +45,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
@@ -45,7 +45,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 1
@@ -35,7 +35,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 2
@@ -59,7 +59,7 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attn_implementation: flash_attention_2

 warmup_ratio: 0.1
 evals_per_epoch: 2
@@ -15,9 +15,8 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
 Here is an example of how to install from pip:

 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+# Ensure you have Pytorch installed (Pytorch 2.9.1 min)
+uv pip install --no-build-isolation 'axolotl>=0.16.1'
 ```

 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage
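After the pinned install above, a quick sanity check that the expected axolotl version actually landed in the environment; this is our sketch rather than part of the upstream guide, and it assumes the same virtual environment is active:

```bash
# Reads the installed distribution metadata, so it works even without an axolotl.__version__ attribute.
python -c "from importlib.metadata import version; print(version('axolotl'))"
```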
@@ -26,7 +26,6 @@ lora_model_dir:
 sequence_len: 2048
 sample_packing: true
-

 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0
@@ -51,8 +50,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
-scaling_softmax: true
+attn_implementation: flash_attention_2
+# scaling_softmax: true # needs flex_attention

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
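The new comment explains why `scaling_softmax` is switched off here: it needs the flex attention backend rather than flash attention. A hedged sketch of what re-enabling it might look like; treat the `flex_attention` value as an assumption to verify against the installed transformers/axolotl versions:

```yaml
# Assumption: flex attention is supported by the installed torch/transformers stack.
attn_implementation: flex_attention
scaling_softmax: true
```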
@@ -29,7 +29,7 @@ output_dir: ./outputs/ndp-out/

 sequence_len: 2048
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 gradient_accumulation_steps: 1
 micro_batch_size: 1
@@ -26,7 +26,7 @@ output_dir: ./outputs/ndp-out/

 sequence_len: 8192
 sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2

 gradient_accumulation_steps: 1
 micro_batch_size: 1 # must be 1 when using context parallel
Some files were not shown because too many files have changed in this diff.