chore: update docker docs (#3623)

fix: docker build failing (#3622)
* fix: uv leftover docs
* fix: docker build failing
* chore: doc
* fix: remove old pytorch build
* fix: stop recommend flash-attn optional, let transformers pull
* fix: remove ring flash attention from image
* fix: quotes [skip ci]
* chore: naming [skip ci]
2026-04-24 16:03:21 +07:00 · 2026-04-24 14:23:09 +07:00 · 2026-04-23 14:49:52 -04:00 · 2026-04-23 00:26:34 -04:00 · 2026-04-23 00:25:48 -04:00 · 2026-04-23 00:25:28 -04:00
168 changed files with 13190 additions and 1914 deletions
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -31,7 +31,11 @@ PRs are **greatly welcome**!

 Please run below to setup env
 ```bash
-pip3 install -r requirements-dev.txt -r requirements-tests.txt
+# Install axolotl + dev and test dependencies
+export UV_TORCH_BACKEND=cu128  # or cu130
+uv venv --no-project --relocatable
+source .venv/bin/activate
+uv pip install --no-build-isolation -e '.[deepspeed]' --group dev --group test
 pre-commit install

 # test
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -30,14 +30,6 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
@@ -168,14 +160,6 @@ jobs:
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -6,7 +6,7 @@ on:
      types: [opened, synchronize, reopened, ready_for_review]
      paths:
       - '**.py'
-       - 'requirements.txt'
+       - 'pyproject.toml'
       - '.github/workflows/*.yml'
       - "*.[q]md"
       - "examples/**/*.y[a]?ml"
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -18,12 +18,6 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
@@ -180,12 +174,6 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -3,17 +3,15 @@ name: docker-multigpu-tests-biweekly
 on:
  pull_request:
    paths:
-      - 'tests/e2e/multigpu/**.py'
-      - 'requirements.txt'
-      - 'setup.py'
-      - 'pyproject.toml'
-      - '.github/workflows/multi-gpu-e2e.yml'
-      - 'scripts/cutcrossentropy_install.py'
-      - 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
-      - 'src/axolotl/utils/distributed.py'
+      - "tests/e2e/multigpu/**.py"
+      - "pyproject.toml"
+      - ".github/workflows/multi-gpu-e2e.yml"
+      - "scripts/cutcrossentropy_install.py"
+      - "src/axolotl/core/trainers/mixins/sequence_parallel.py"
+      - "src/axolotl/utils/distributed.py"
  workflow_dispatch:
  schedule:
-    - cron: '0 0 * * 1,4'  # Runs at 00:00 UTC every monday & thursday
+    - cron: "0 0 * * 1,4" # Runs at 00:00 UTC every monday & thursday

 # Cancel jobs on the same ref if a new one is triggered
 concurrency:
@@ -33,19 +31,19 @@ jobs:
      fail-fast: false
      matrix:
        include:
-#          - cuda: 129
-#            cuda_version: 12.9.1
-#            python_version: "3.12"
-#            pytorch: 2.9.1
-#            axolotl_extras: "fbgemm-gpu"
-#            num_gpus: 2
-#            dockerfile: "Dockerfile-uv.jinja"
+          #          - cuda: 129
+          #            cuda_version: 12.9.1
+          #            python_version: "3.12"
+          #            pytorch: 2.9.1
+          #            axolotl_extras: "fbgemm-gpu"
+          #            num_gpus: 2
+          #            dockerfile: "Dockerfile-uv.jinja"
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
-#            axolotl_extras: fbgemm-gpu
+            #            axolotl_extras: fbgemm-gpu
            num_gpus: 2
          - cuda: 128
            cuda_version: 12.8.1
@@ -53,7 +51,6 @@ jobs:
            pytorch: 2.10.0
            axolotl_extras: "fbgemm-gpu"
            num_gpus: 2
-            dockerfile: "Dockerfile-uv.jinja"
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
    steps:
@@ -75,7 +72,7 @@ jobs:
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -8,6 +8,9 @@ on:

 permissions: {}

+env:
+  UV_SYSTEM_PYTHON: "1"
+
 jobs:
  setup_release:
    name: Create Release
@@ -41,11 +44,15 @@ jobs:
        with:
          python-version: "3.11"

+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+
      - name: Install dependencies
        run: |
-          pip3 install wheel packaging==26.0
-          pip3 install --no-build-isolation -e .
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+          uv pip install wheel packaging
+          uv pip install --no-build-isolation -e .
+          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
+            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse

      - name: Extract tag name
        id: tag
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -2,15 +2,18 @@ name: Tests Nightly against upstream main
 on:
  workflow_dispatch:
  schedule:
-    - cron: '0 0 * * *'  # Runs at 00:00 UTC every day
+    - cron: "0 0 * * *" # Runs at 00:00 UTC every day
  pull_request:
    types: [opened, synchronize, reopened, ready_for_review]
    paths:
-      - '.github/workflows/tests-nightly.yml'
+      - ".github/workflows/tests-nightly.yml"

 permissions:
  contents: read

+env:
+  UV_SYSTEM_PYTHON: "1"
+
 jobs:
  pre-commit:
    name: pre-commit
@@ -20,7 +23,7 @@ jobs:
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
-          cache: 'pip' # caching pip dependencies
+          cache: "pip" # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
        env:
          SKIP: no-commit-to-branch
@@ -43,7 +46,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python_version: ["3.12"]  # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
+        python_version: ["3.12"] # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
        pytorch_version: ["2.9.1", "2.10.0"]
    timeout-minutes: 20

@@ -61,36 +64,34 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
-          cache: 'pip' # caching pip dependencies

-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==26.0 setuptools==78.1.1 wheel
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7

      - name: Install PyTorch
        run: |
-          pip3 install torch==${{ matrix.pytorch_version }} torchvision
-
-      - name: Update requirements.txt
-        run: |
-          sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt
-          sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt
-          sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt
-          sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt
-          sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt
+          uv pip install torch==${{ matrix.pytorch_version }} torchvision
+          uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt

      - name: Install dependencies
        run: |
-          pip3 show torch
-          pip3 install --no-build-isolation -U -e .
-          python scripts/unsloth_install.py | sh
-          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+          uv pip install --no-build-isolation -e . --override /tmp/torch-pin.txt
+          python scripts/cutcrossentropy_install.py --uv | sh
+          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
+            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
+
+      - name: Override with nightly HF packages
+        run: |
+          uv pip install --no-deps \
+            "transformers @ git+https://github.com/huggingface/transformers.git@main" \
+            "peft @ git+https://github.com/huggingface/peft.git@main" \
+            "accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
+            "trl @ git+https://github.com/huggingface/trl.git@main" \
+            "datasets @ git+https://github.com/huggingface/datasets.git@main"

      - name: Make sure PyTorch version wasn't clobbered
        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"

      - name: Ensure axolotl CLI was installed
        run: |
@@ -102,9 +103,6 @@ jobs:
          pytest -v --durations=10 tests/patched/
          pytest -v --durations=10 tests/cli/

-      - name: cleanup pip cache
-        run: |
-          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

  docker-e2e-tests:
    if: github.repository_owner == 'axolotl-ai-cloud'
@@ -136,7 +134,6 @@ jobs:
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
-            dockerfile: "Dockerfile-uv.jinja"
            nightly_build: "true"
    steps:
      - name: Checkout
@@ -157,7 +154,7 @@ jobs:
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        env:
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -6,21 +6,19 @@ on:
    branches:
      - "main"
    paths:
-      - '**.py'
-      - 'requirements.txt'
-      - '.github/workflows/*.yml'
-      - 'requirements-tests.txt'
-      - 'cicd/cicd.sh'
-      - 'cicd/Dockerfile.jinja'
+      - "**.py"
+      - "pyproject.toml"
+      - ".github/workflows/*.yml"
+      - "cicd/cicd.sh"
+      - "cicd/Dockerfile-uv.jinja"
  pull_request:
-      types: [opened, synchronize, reopened, ready_for_review]
-      paths:
-       - '**.py'
-       - 'requirements.txt'
-       - '.github/workflows/*.yml'
-       - 'requirements-tests.txt'
-       - 'cicd/cicd.sh'
-       - 'cicd/Dockerfile.jinja'
+    types: [opened, synchronize, reopened, ready_for_review]
+    paths:
+      - "**.py"
+      - "pyproject.toml"
+      - ".github/workflows/*.yml"
+      - "cicd/cicd.sh"
+      - "cicd/Dockerfile-uv.jinja"
  workflow_dispatch:

 # Cancel jobs on the same ref if a new one is triggered
@@ -33,6 +31,7 @@ permissions:

 env:
  TRANSFORMERS_IS_CI: "yes"
+  UV_SYSTEM_PYTHON: "1"

 jobs:
  pre-commit:
@@ -44,7 +43,7 @@ jobs:
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
-          cache: 'pip' # caching pip dependencies
+          cache: "pip" # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
        env:
          SKIP: no-commit-to-branch
@@ -94,32 +93,25 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
-          cache: 'pip' # caching pip dependencies

-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==26.0 setuptools==75.8.0 wheel
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7

      - name: Install PyTorch
        run: |
-          pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision
+          uv pip install torch==${{ matrix.pytorch_version }} torchvision
+          uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt

      - name: Install dependencies
        run: |
-          pip3 show torch
-          pip3 install --no-cache-dir --no-build-isolation -U -e .
-          python scripts/unsloth_install.py | sh
-          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
-
-      - name: cleanup pip cache
-        run: |
-          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
+          uv pip install --no-build-isolation -e . --override /tmp/torch-pin.txt
+          python scripts/cutcrossentropy_install.py --uv | sh
+          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
+            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse

      - name: Make sure PyTorch version wasn't clobbered
        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"

      - name: Ensure axolotl CLI was installed
        run: |
@@ -188,38 +180,42 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
-          cache: 'pip' # caching pip dependencies

-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==26.0 setuptools==75.8.0 setuptools_scm build wheel psutil
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7

      - name: Install PyTorch
        run: |
-          pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision
+          uv pip install torch==${{ matrix.pytorch_version }} torchvision
+          uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt

      - name: Install dependencies
        run: |
-          pip3 show torch
+          uv pip install packaging setuptools_scm build wheel psutil
          python -m build --no-isolation --sdist
-          pip3 install --no-cache-dir --no-build-isolation dist/axolotl*.tar.gz
-          python scripts/unsloth_install.py | sh
-          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
-
-      - name: cleanup pip cache
-        run: |
-          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
+          uv pip install --no-build-isolation dist/axolotl*.tar.gz --override /tmp/torch-pin.txt
+          python scripts/cutcrossentropy_install.py --uv | sh
+          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
+            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse

      - name: Make sure PyTorch version wasn't clobbered
        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"

      - name: Ensure axolotl CLI was installed
        run: |
          axolotl --help

+      - name: Verify agent docs are discoverable
+        run: |
+          # Agent docs live in docs/agents/ (source of truth) and are resolved
+          # at runtime from the repo checkout or via `axolotl fetch docs`
+          axolotl agent-docs --list
+          axolotl agent-docs | grep -q "Fine-tuning framework"
+          axolotl agent-docs grpo | grep -q "GRPO"
+          axolotl agent-docs sft | grep -q "SFT"
+          python -c "from axolotl.cli.agent_docs import get_doc, list_topics; assert len(list_topics()) >= 5; assert 'GRPO' in get_doc('grpo')"
+
      - name: Show HF cache
        run: hf cache ls

@@ -281,7 +277,6 @@ jobs:
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
-            dockerfile: "Dockerfile-uv.jinja"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -302,7 +297,7 @@ jobs:
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
@@ -364,7 +359,7 @@ jobs:
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "GPU_TYPE=${{ matrix.gpu_type || 'L40S'}}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -16,6 +16,9 @@ axolotl inference config.yaml          # Interactive inference
 axolotl merge-lora config.yaml         # Merge LoRA adapter into base model
 axolotl vllm-serve config.yaml         # Start vLLM server for GRPO/EBFT training
 axolotl fetch examples                 # Download example configs
+axolotl agent-docs                     # Show agent-optimized docs (bundled with pip package)
+axolotl agent-docs grpo                # Topic-specific agent reference
+axolotl config-schema                  # Dump config JSON schema
 ```

 ## Training Methods
@@ -23,7 +26,7 @@ axolotl fetch examples                 # Download example configs
 | Method | Config Key | When to Use |
 |--------|-----------|-------------|
 | SFT | *(default)* | Input-output pairs, instruction tuning |
-| DPO/IPO | `rl: dpo` / `rl: ipo` | Paired preference data (chosen vs rejected) |
+| DPO/IPO | `rl: dpo` / `rl: dpo, dpo_loss_type: ["ipo"]` | Paired preference data (chosen vs rejected) |
 | KTO | `rl: kto` | Unpaired binary preference labels |
 | ORPO | `rl: orpo` | Single-stage alignment, no ref model |
 | GRPO | `rl: grpo` | RL with verifiable reward functions (math, code) |
@@ -35,6 +38,8 @@ Agent-specific references:
 - [docs/agents/grpo.md](docs/agents/grpo.md) — GRPO online RL with reward functions
 - [docs/agents/reward_modelling.md](docs/agents/reward_modelling.md) — outcome and process reward models
 - [docs/agents/pretraining.md](docs/agents/pretraining.md) — continual pretraining
+- [docs/agents/model_architectures.md](docs/agents/model_architectures.md) — model-specific quirks (Gemma4, Qwen3.5 MoE, etc.)
+- [docs/agents/new_model_support.md](docs/agents/new_model_support.md) — debugging and adding support for new model architectures

 ## Config Pattern

--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,6 +1,7 @@
-include requirements.txt
 include README.md
 include LICENSE
-include src/setuptools_axolotl_dynamic_dependencies.py
+include VERSION
 include src/axolotl/utils/chat_templates/templates/*.jinja
+include AGENTS.md
+recursive-include docs/agents *.md
 recursive-include axolotl *.py
--- a/README.md
+++ b/README.md
@@ -86,7 +86,7 @@ Features:
 **Requirements**:

 - NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
-- Python 3.11
+- Python >=3.11 (3.12 recommended)
 - PyTorch ≥2.9.1

 ### Google Colab
@@ -95,11 +95,19 @@ Features:

 ### Installation

-#### Using pip
-
 ```bash
-pip3 install -U packaging==26.0 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
+# install uv if you don't already have it installed (restart shell after)
+curl -LsSf https://astral.sh/uv/install.sh | sh
+
+# change depending on system
+export UV_TORCH_BACKEND=cu128
+
+# create a new virtual environment
+uv venv --python 3.12
+source .venv/bin/activate
+
+uv pip install torch==2.10.0 torchvision
+uv pip install --no-build-isolation axolotl[deepspeed]

 # Download example axolotl configs, deepspeed configs
 axolotl fetch examples
@@ -110,7 +118,7 @@ axolotl fetch deepspeed_configs  # OPTIONAL

 Installing with Docker can be less error prone than installing in your own environment.
 ```bash
-docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
+docker run --gpus '"all"' --ipc=host --rm -it axolotlai/axolotl:main-latest
 ```

 Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).
@@ -157,6 +165,29 @@ That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/ge
 - [API Reference](https://docs.axolotl.ai/docs/api/) - Auto-generated code documentation
 - [FAQ](https://docs.axolotl.ai/docs/faq.html) - Frequently asked questions

+## AI Agent Support
+
+Axolotl ships with built-in documentation optimized for AI coding agents (Claude Code, Cursor, Copilot, etc.). These docs are bundled with the pip package — no repo clone needed.
+
+```bash
+# Show overview and available training methods
+axolotl agent-docs
+
+# Topic-specific references
+axolotl agent-docs sft                 # supervised fine-tuning
+axolotl agent-docs grpo                # GRPO online RL
+axolotl agent-docs preference_tuning   # DPO, KTO, ORPO, SimPO
+axolotl agent-docs reward_modelling    # outcome and process reward models
+axolotl agent-docs pretraining         # continual pretraining
+axolotl agent-docs --list              # list all topics
+
+# Dump config schema for programmatic use
+axolotl config-schema
+axolotl config-schema --field adapter
+```
+
+If you're working with the source repo, agent docs are also available at `docs/agents/` and the project overview is in `AGENTS.md`.
+
 ## 🤝 Getting Help

 - Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.16.0.dev0
+0.16.2.dev0
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -134,7 +134,6 @@ quartodoc:
        - monkeypatch.stablelm_attn_hijack_flash
        - monkeypatch.trainer_fsdp_optim
        - monkeypatch.transformers_fa_utils
-        - monkeypatch.unsloth_
        - monkeypatch.data.batch_dataset_fetcher
        - monkeypatch.mixtral
        - monkeypatch.gradient_checkpointing.offload_cpu
@@ -327,7 +326,6 @@ website:
        - section: "Advanced Features"
          contents:
            - docs/fsdp_qlora.qmd
-            - docs/unsloth.qmd
            - docs/torchao.qmd
            - docs/custom_integrations.qmd
            - docs/sequence_parallelism.qmd
--- a/cicd/Dockerfile-uv.jinja
+++ b/cicd/Dockerfile-uv.jinja
@@ -22,15 +22,6 @@ WORKDIR /workspace/axolotl
 RUN git fetch origin +$GITHUB_REF && \
    git checkout FETCH_HEAD

-# If AXOLOTL_EXTRAS is set, append it in brackets
-RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
-        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
-        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
-        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
-        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
-        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
-    fi
-
 RUN uv pip install packaging==26.0 setuptools==78.1.1
 RUN uv pip install torchvision
 RUN uv pip uninstall causal_conv1d
@@ -40,11 +31,21 @@ RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
    fi

-RUN python scripts/unsloth_install.py --uv | sh
+# Override with nightly HF packages for nightly builds
+RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
+        uv pip install --no-deps \
+            "transformers @ git+https://github.com/huggingface/transformers.git@main" \
+            "peft @ git+https://github.com/huggingface/peft.git@main" \
+            "accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
+            "trl @ git+https://github.com/huggingface/trl.git@main" \
+            "datasets @ git+https://github.com/huggingface/datasets.git@main"; \
+    fi
+
 RUN python scripts/cutcrossentropy_install.py --uv | sh

 # So we can test the Docker image
-RUN uv pip install -r requirements-dev.txt -r requirements-tests.txt
+RUN uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
+    codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse

 # fix so that git fetch/pull from remote works
 RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -1,54 +0,0 @@
-FROM axolotlai/axolotl-base:{{ BASE_TAG }}
-
-ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
-ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
-ENV CUDA="{{ CUDA }}"
-ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
-ENV GITHUB_REF="{{ GITHUB_REF }}"
-ENV GITHUB_SHA="{{ GITHUB_SHA }}"
-ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
-ENV HF_HOME="{{ HF_HOME }}"
-ENV AXOLOTL_DATASET_NUM_PROC="8"
-
-RUN apt-get update && \
-    apt-get install -y --allow-change-held-packages vim curl nano zstd libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm
-
-WORKDIR /workspace
-
-RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
-
-WORKDIR /workspace/axolotl
-
-RUN git fetch origin +$GITHUB_REF && \
-    git checkout FETCH_HEAD
-
-# If AXOLOTL_EXTRAS is set, append it in brackets
-RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
-        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
-        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
-        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
-        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
-        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
-    fi
-
-RUN pip install packaging==26.0 setuptools==78.1.1 psutil
-RUN pip uninstall -y causal_conv1d
-RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
-    else \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
-    fi
-
-RUN python scripts/unsloth_install.py | sh
-RUN python scripts/cutcrossentropy_install.py | sh
-
-# So we can test the Docker image
-RUN pip install -r requirements-dev.txt -r requirements-tests.txt
-
-# fix so that git fetch/pull from remote works
-RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
-    git config --get remote.origin.fetch
-
-# helper for huggingface-login cli
-RUN git config --global credential.helper store
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 set -e

-python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
+python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__, f'Expected torch $PYTORCH_VERSION but got {torch.__version__}'"

 set -o pipefail
 for i in 1 2 3; do
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -17,7 +17,7 @@ template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
 template_env = jinja2.Environment(
    loader=template_loader, autoescape=select_autoescape()
 )
-dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
+dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile-uv.jinja")
 df_template = template_env.get_template(dockerfile)

 df_args = {
--- a/cicd/single_gpu.py
+++ b/cicd/single_gpu.py
@@ -16,7 +16,7 @@ template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
 template_env = jinja2.Environment(
    loader=template_loader, autoescape=select_autoescape()
 )
-dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
+dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile-uv.jinja")
 df_template = template_env.get_template(dockerfile)

 df_args = {
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -24,15 +24,15 @@ WORKDIR /workspace/axolotl
 # If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
 RUN pip uninstall -y causal_conv1d
 RUN if [ "$TARGETARCH" = "arm64" ]; then \
-        BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \
+        BASE_EXTRAS="optimizers,ray"; \
    else \
-        BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \
+        BASE_EXTRAS="deepspeed,optimizers,ray"; \
    fi && \
    if [ "$AXOLOTL_EXTRAS" != "" ]; then \
        pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
        pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
-    fi && \    python scripts/unsloth_install.py | sh && \
+    fi && \
    python scripts/cutcrossentropy_install.py | sh && \
    pip install pytest && \
    pip cache purge
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -58,19 +58,3 @@ RUN git lfs install --skip-repo && \
    # The base image ships with `pydantic==1.8.2` which is not working
    pip3 install -U --no-cache-dir pydantic==1.10.10 && \
    pip3 cache purge
-
-# Map Python version (e.g., 3.12 -> cp312)
-RUN PYTHON_CP="cp$(echo $PYTHON_VERSION | tr -d '.')" && \
-    # Map PyTorch version (e.g., 2.9.1 -> torch2.9, 2.10.0 -> torch2.10)
-    TORCH_TAG="torch$(echo $PYTORCH_VERSION | grep -oP '^\d+\.\d+')" && \
-    # Map architecture
-    case "$TARGETARCH" in \
-        amd64) ARCH_TAG="x86_64" ;; \
-        arm64) ARCH_TAG="aarch64" ;; \
-        *) echo "Unsupported architecture: $TARGETARCH"; exit 1 ;; \
-    esac && \
-    WHL_VERSION="v0.7.16" && \
-    WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-linux_${ARCH_TAG}.whl" && \
-    wget -nv "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}" && \
-    pip3 install --no-cache-dir "${WHL_FILE}" && \
-    rm "${WHL_FILE}"
--- a/docker/Dockerfile-tests
+++ b/docker/Dockerfile-tests
@@ -24,9 +24,9 @@ RUN git fetch origin +$GITHUB_REF && \

 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install --no-build-isolation -e .[deepspeed,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
+        pip install --no-build-isolation -e .[deepspeed,mamba-ssm] $AXOLOTL_ARGS; \
    fi

 # So we can test the Docker image
--- a/docker/Dockerfile-uv
+++ b/docker/Dockerfile-uv
@@ -24,16 +24,15 @@ WORKDIR /workspace/axolotl
 # If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
 RUN uv pip uninstall causal_conv1d
 RUN if [ "$TARGETARCH" = "arm64" ]; then \
-        BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \
+        BASE_EXTRAS="optimizers,ray"; \
    else \
-        BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \
+        BASE_EXTRAS="deepspeed,optimizers,ray"; \
    fi && \
    if [ "$AXOLOTL_EXTRAS" != "" ]; then \
        uv pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
        uv pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
    fi && \
-    python scripts/unsloth_install.py --uv | sh && \
    python scripts/cutcrossentropy_install.py --uv | sh && \
    uv pip install pytest && \
    uv cache clean
--- a/docker/Dockerfile-uv-base
+++ b/docker/Dockerfile-uv-base
@@ -38,20 +38,3 @@ RUN uv pip install packaging setuptools wheel psutil \
 RUN if [ "$TARGETARCH" = "amd64" ]; then \
        MAMBA_SKIP_CUDA_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE uv pip install --no-build-isolation mamba_ssm causal_conv1d; \
    fi
-
-# Map Python version (e.g., 3.12 -> cp312)
-RUN PYTHON_CP="cp$(echo $PYTHON_VERSION | tr -d '.')" && \
-    # Map PyTorch version (e.g., 2.9.1 -> torch2.9, 2.10.0 -> torch2.10)
-    TORCH_TAG="torch$(echo $PYTORCH_VERSION | grep -oP '^\d+\.\d+')" && \
-    LINUX_TAG="manylinux_" && \
-    # Map architecture
-    case "$TARGETARCH" in \
-        amd64) ARCH_TAG="2_24_x86_64.manylinux_2_28_x86_64" ;; \
-        arm64) ARCH_TAG="2_34_aarch64" ;; \
-        *) echo "Unsupported architecture: $TARGETARCH"; exit 1 ;; \
-    esac && \
-    WHL_VERSION="v0.7.16" && \
-    WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-${LINUX_TAG}${ARCH_TAG}.whl" && \
-    wget -nv "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}" && \
-    uv pip install --no-cache-dir "${WHL_FILE}" && \
-    rm "${WHL_FILE}"
--- a/docs/agents/model_architectures.md
+++ b/docs/agents/model_architectures.md
@@ -0,0 +1,198 @@
+# Model Architectures — Agent Reference
+
+Model-specific quirks, required settings, and known issues. Check this before debugging training failures on specific model families.
+
+## VLM (Vision Language Model) Quick Start
+
+All VLM configs require these four lines:
+```yaml
+processor_type: AutoProcessor
+skip_prepare_dataset: true
+remove_unused_columns: false
+sample_packing: false
+```
+
+Decision tree for VLM config:
+```text
+Is the model multimodal (has vision/audio encoder)?
+  ├─ YES: Add `freeze_mm_modules: true` if training text only
+  │       Add `chat_template: <model_template>` (e.g. gemma4, qwen3_5, gemma3)
+  │       LoRA: use regex `lora_target_modules` to restrict to language model
+  └─ NO: Train as a regular text model
+
+Is the model MoE (e.g. Gemma4 26B-A4B, Qwen3.5 35B-A3B)?
+  ├─ YES: Add `lora_target_parameters` for expert LoRA
+  │       Consider ScatterMoE kernels (see Plugins section)
+  └─ NO: Standard LoRA config
+```
+
+## Plugins & Optimizations
+
+### Cut Cross Entropy (CCE)
+
+Computes loss from hidden states + lm_head weight without materializing the full logits tensor, saving significant VRAM. Install if not already present:
+
+```bash
+uv pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@main"
+```
+
+```yaml
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+```
+
+### ScatterMoE Kernels
+
+Fuses expert + LoRA computation into a single kernel for MoE models. Significant speedup for models with many experts.
+
+```yaml
+plugins:
+  - axolotl.integrations.kernels.KernelsPlugin
+use_kernels: true
+use_scattermoe: true
+experts_implementation: scattermoe
+
+# Expert LoRA targets (3D parameter tensors, not nn.Linear):
+lora_target_parameters:
+  - experts.gate_up_proj
+  - experts.down_proj
+```
+
+Supported: Gemma4 (`gemma4_text`), Mixtral, Qwen MoE variants. The plugin auto-detects model type and routing function. Without ScatterMoE, expert LoRA still works but runs base expert matmul and LoRA as separate operations.
+
+## Gemma 4
+
+**Models**: `google/gemma-4-26B-A4B` (MoE), `google/gemma-4-31B` (dense), `google/gemma-4-E2B`, `google/gemma-4-E4B`
+
+**Architecture**: Multimodal wrapper (`Gemma4ForConditionalGeneration`) over a text backbone (`Gemma4TextModel`), with optional vision/audio encoders. All Gemma4 HF repos have `model_type: "gemma4"` — even text-only variants load as multimodal with a vision tower.
+
+### Required settings
+
+```yaml
+# Always needed for Gemma4:
+freeze_mm_modules: true          # Freeze vision/audio encoders for text-only training
+gradient_checkpointing_kwargs:
+  use_reentrant: false           # Shared per-layer norms cause "marked ready twice" with reentrant
+
+# LoRA target — restrict to language model only (DO NOT use lora_target_linear: true):
+lora_target_modules: 'model.language_model.layers.[\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'
+```
+
+### Auto-detection
+
+Axolotl auto-detects Gemma4 and applies:
+- `use_reentrant: false` for gradient checkpointing
+- `ddp_find_unused_parameters: true` for DDP (skipped when `activation_offloading: true`)
+
+### Multi-GPU
+
+| Strategy | Works? | Notes |
+|----------|--------|-------|
+| DDP | Yes | Auto-sets `ddp_find_unused_parameters=True` |
+| DDP + activation_offloading | Yes | `find_unused_parameters` is skipped (conflicts with checkpoint wrappers) |
+| FSDP1 | No | OOM during dequantization/sharding with QLoRA |
+| FSDP2 | Yes | Use `Gemma4TextDecoderLayer` (not `Gemma4DecoderLayer`) as wrap class |
+| FSDP2 + activation_offloading | Yes | Lowest VRAM (~26 GiB/GPU for 26B-A4B) |
+
+FSDP2 config:
+```yaml
+fsdp:
+  - full_shard
+  - auto_wrap
+fsdp_config:
+  fsdp_version: 2
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_transformer_layer_cls_to_wrap: Gemma4TextDecoderLayer
+```
+
+### MoE (26B-A4B)
+
+- `enable_moe_block: true`, 256 experts, top-k routing
+- No separate `SparseMoeBlock` — MoE is embedded in each decoder layer
+- Expert LoRA targets 3D parameter tensors:
+  ```yaml
+  lora_target_parameters:
+    - experts.gate_up_proj
+    - experts.down_proj
+  ```
+- ScatterMoE kernel acceleration:
+  ```yaml
+  plugins:
+    - axolotl.integrations.kernels.KernelsPlugin
+  use_kernels: true
+  use_scattermoe: true
+  experts_implementation: scattermoe
+  ```
+
+### VLM (Vision) Training
+
+All Gemma4 models load as `Gemma4ForConditionalGeneration` with a vision tower. No custom `ProcessingStrategy` needed — the base class auto-detects the image token.
+
+```yaml
+base_model: google/gemma-4-E2B-it   # or E4B-it, 26B-A4B
+processor_type: AutoProcessor
+freeze_mm_modules: true
+chat_template: gemma4
+
+skip_prepare_dataset: true
+remove_unused_columns: false
+sample_packing: false
+```
+
+A starting VLM loss of ~8-15 is typical. In most runs, loss converges below 1.0 within ~30-50 steps, though results may vary across configurations.
+
+For the 26B-A4B MoE variant with ScatterMoE + expert LoRA + CCE, add:
+```yaml
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+  - axolotl.integrations.kernels.KernelsPlugin
+use_kernels: true
+use_scattermoe: true
+experts_implementation: scattermoe
+lora_target_parameters:
+  - experts.gate_up_proj
+  - experts.down_proj
+```
+
+### Common issues
+
+| Symptom | Cause | Fix |
+|---------|-------|-----|
+| `mm_token_type_ids is required` in DDP | `model.config` not accessible through DDP wrapper | Already fixed — `unwrap_model()` in `compute_loss` and `prediction_step` |
+| `marked a variable ready twice` in DDP | `ddp_find_unused_parameters=True` + activation_offloading checkpoint wrappers | Auto-handled — `find_unused_parameters` is skipped when `activation_offloading: true` |
+| Loss ~12 instead of ~0.5 | Using `lora_target_linear: true` (applies LoRA to vision/audio modules) | Use the regex `lora_target_modules` pattern instead |
+| FSDP2 `Could not find Gemma4AudioLayer` | Auto-wrap detects `_no_split_modules` including audio layers that don't exist | Explicitly set `fsdp_transformer_layer_cls_to_wrap: Gemma4TextDecoderLayer` |
+| `Gemma4ClippableLinear not supported` by PEFT | Vision tower uses a non-standard linear wrapper | Axolotl patches this automatically via `_patch_peft_clippable_linear()` |
+
+### E2B/E4B dense models
+
+These have `hidden_size_per_layer_input: 256` (per-layer input embeddings) and `attention_k_eq_v: False`. Known issue: loss starts higher than expected (~12 vs ~0.5 for 26B). Root cause under investigation — may be related to the per-layer input mechanism or the `Gemma4ForConditionalGeneration` loss computation.
+
+## Gemma 3
+
+**Models**: `google/gemma-3-*`
+
+- `ddp_find_unused_parameters: true` needed (multimodal unused params)
+- `use_reentrant: false` recommended
+- Attention mask must be dropped for sample packing (handled automatically)
+- Multi-GPU test currently skipped (`tests/e2e/multigpu/test_gemma3.py`)
+
+## Qwen 3.5 MoE
+
+**Models**: `Qwen/Qwen3.5-35B-A3B`
+
+- Hybrid architecture: DeltaNet linear attention (30 layers) + full attention (10 layers)
+- 256 experts, 8 active per token
+- Known weight scale drift in late DeltaNet layers (36-38) due to AdamW + rare expert interaction
+- Fix: `normalize_weight_scales` config to detect and rescale outliers:
+  ```yaml
+  normalize_weight_scales:
+    - name_pattern: 'linear_attn\.conv1d\.weight'
+      threshold: 1.3
+  ```
+
+## General MoE Notes
+
+- `lora_target_linear: true` with multimodal MoE models will apply LoRA to ALL linear modules including vision/audio encoders — use regex `lora_target_modules` to restrict to language model only
+- Rare experts get a larger effective learning rate from AdamW (small second-moment estimates) — this can cause weight drift in recurrent/SSM components. Use `normalize_weight_scales` with `dry_run: true` to detect it.
+- For ScatterMoE kernel support, set `experts_implementation: scattermoe` and add the KernelsPlugin
--- a/docs/agents/new_model_support.md
+++ b/docs/agents/new_model_support.md
@@ -0,0 +1,181 @@
+# New Model Support — Agent Reference
+
+Guide for debugging and adding support for new model architectures in axolotl. Based on lessons learned from Gemma4, Gemma3, Qwen2-VL, and other multimodal/MoE models.
+
+## Quick Validation Checklist
+
+When testing a new model, run through these checks in order:
+
+1. **Does the model load?** `axolotl preprocess config.yaml` — catches config schema errors
+2. **Does LoRA apply?** Check for "Unsupported layer type" warnings from PEFT
+3. **Is the initial loss sane?** First-step loss for a pretrained model should be 0.5–2.0 for SFT
+4. **Does sample packing work?** Compare loss with `sample_packing: true` vs `false` — should be similar
+5. **Is CCE active?** Check for "Applying Cut Cross Entropy" log and verify peak VRAM is lower
+
+## Loss Debugging
+
+### Expected initial loss
+A pretrained model doing SFT should start with loss roughly in the 0.5–2.0 range. If loss starts above 3.0, something is wrong. If it's near `log(vocab_size)` (≈ 12 for 262K vocab), the model is predicting at random — attention masking or model weights are broken.
+
+### Direct comparison technique
+The fastest way to isolate a loss issue — bypass the trainer entirely:
+
+```python
+# Load model via axolotl's pipeline (applies all patches)
+from axolotl.cli.config import load_cfg
+from axolotl.utils.config import normalize_config, prepare_plugins
+from axolotl.loaders.tokenizer import load_tokenizer
+from axolotl.loaders.model import ModelLoader
+
+cfg = load_cfg("your_config.yaml")
+normalize_config(cfg)
+prepare_plugins(cfg)
+tokenizer = load_tokenizer(cfg)
+model, _ = ModelLoader(cfg, tokenizer).load()
+
+# Forward pass on preprocessed data
+model.train()
+out = model(input_ids, labels=labels)
+print(f"Direct loss: {out.loss.item()}")  # Compare to trainer's reported loss
+```
+
+If direct loss is correct (~1.0) but trainer reports 3–4x higher, check `model_accepts_loss_kwargs` (see below).
+
+### `model_accepts_loss_kwargs` inflation
+HF Trainer checks if the model's `forward()` has `**kwargs` and sets `model_accepts_loss_kwargs=True`. This changes loss normalization: the trainer does NOT divide loss by `gradient_accumulation_steps` before logging. The gradient is correct — only the logged loss is inflated.
+
+**Symptom**: Logged loss ≈ actual_loss × gradient_accumulation_steps.
+
+**Which models are affected**: Any model with `**kwargs` in forward (common in multimodal models for extra inputs like `mm_token_type_ids`, `pixel_values`, etc.).
+
+**Fix location**: `src/axolotl/core/trainers/base.py` `__init__()` — after `super().__init__()`, check if the unwrapped model actually has `num_items_in_batch` in its forward signature. If not, set `self.model_accepts_loss_kwargs = False`.
+
+## Multimodal Models (ForConditionalGeneration)
+
+Many recent models use `ForConditionalGeneration` as the top-level class, not `ForCausalLM`:
+- Gemma3 → `Gemma3ForConditionalGeneration`
+- Gemma4 → `Gemma4ForConditionalGeneration`
+- Qwen2-VL → `Qwen2VLForConditionalGeneration`
+- LLaVA → `LlavaForConditionalGeneration`
+
+### Why this matters
+
+| Component | Targets `ForCausalLM` | Needs `ForConditionalGeneration` |
+|-----------|----------------------|--------------------------------|
+| CCE patches | ✅ (default) | ❌ silently inactive if not patched |
+| PEFT LoRA | ✅ | May fail on custom layer types |
+| HF Trainer label handling | ✅ | May need extra inputs |
+
+### Required extra inputs
+Multimodal models require special inputs during training even for text-only data:
+
+| Model | Required Input | Value for Text-Only |
+|-------|---------------|-------------------|
+| Gemma4 | `mm_token_type_ids` | `torch.zeros_like(input_ids)` |
+| Gemma3 | `token_type_ids` | `torch.zeros_like(input_ids)` |
+
+Auto-inject in `compute_loss()` when not provided by the data collator. See `core/trainers/base.py`.
+
+### Custom layer types and PEFT
+Vision towers often use custom module wrappers that PEFT doesn't support:
+
+| Model | Custom Layer | Wraps | Fix |
+|-------|-------------|-------|-----|
+| Gemma4 | `Gemma4ClippableLinear` | `nn.Linear` | Redirect to `.linear` child |
+
+Fix location: `src/axolotl/loaders/adapter.py` `_patch_peft_clippable_linear()`.
+
+## Sample Packing
+
+### How packed sequence detection works (transformers ≥ 5.x)
+`transformers.masking_utils._preprocess_mask_arguments()` detects packed sequences from `position_ids` resets, but **only when `attention_mask is None`**:
+
+```python
+# From masking_utils.py:
+if position_ids is not None and attention_mask is None and past_key_values is None:
+    packed_sequence_mask = find_packed_sequence_indices(position_ids)
+```
+
+If the collator provides an all-ones `attention_mask`, packing detection is **skipped** and the model builds a single causal mask spanning all packed sequences → cross-sequence attention leakage → very high loss.
+
+### Fix for models using `create_causal_mask_mapping`
+For Gemma3, Gemma4, and similar models that use the new transformers masking system, remove `attention_mask` from inputs when sample packing is active:
+
+```python
+# In compute_loss():
+if (
+    self.args.sample_packing
+    and model_type in ("gemma4", "gemma3")
+    and "attention_mask" in inputs
+    and "position_ids" in inputs
+):
+    del inputs["attention_mask"]
+```
+
+Fix location: `src/axolotl/core/trainers/base.py` `compute_loss()`.
+
+### Models that DON'T need this fix
+Older models that use `_prepare_4d_causal_attention_mask` (Llama, Mistral, Qwen2, etc.) handle sample packing via axolotl's multipack attention monkeypatch instead. Only models using the new `create_causal_mask_mapping` / `create_causal_mask` masking system need the `attention_mask` removal.
+
+## Attention Backend Selection
+
+| Backend | Config | head_dim limit | torch_compile | Notes |
+|---------|--------|---------------|---------------|-------|
+| FA2 | `flash_attention: true` | 256 | ✅ | Fastest when supported |
+| FA4 | auto with `flash_attention: true` | 256 (SM90+) | ✅ | Auto-detected on H100+ |
+| SDPA | `sdp_attention: true` | None | ✅ | Universal fallback |
+| flex | `flex_attention: true` | None | ⚠️ Triton OOM for large head_dim | Good for variable head dims |
+| eager | none of the above set | None | ✅ | Slowest, always works |
+
+**Check model support**: Look at `_supports_flash_attn_2`, `_supports_flex_attn`, `_supports_sdpa` attributes on the model class.
+
+**head_dim gotcha**: The 256 limit is specific to flash-attn CUDA kernels, NOT PyTorch-level. SDPA and flex_attention both handle arbitrary head_dim. Models with `global_head_dim > 256` (Gemma4: 512) must use SDPA or flex.
+
+**flex + compile gotcha**: `torch_compile` with flex_attention can hit Triton shared memory OOM for large head_dim. Falls back to eager per-function (not a crash, but slower). Unsloth disables flex for Gemma4 for this reason.
+
+## Cut Cross Entropy (CCE)
+
+### How CCE patches work
+CCE replaces the model's `forward()` with a fused version that computes loss from hidden states + lm_head weight without materializing the full logits tensor. This saves ~`batch × seq_len × vocab_size × dtype_bytes` of VRAM.
+
+### Adding CCE for a new model
+1. Check if the model type is in `cut_cross_entropy.transformers.patch.PATCH_FNS`
+2. If not, axolotl's generic fallback (`integrations/cut_cross_entropy/__init__.py` `patch_llama_like()`) patches `{Prefix}ForCausalLM.forward` with `cce_forward`
+3. For multimodal models (`ForConditionalGeneration`), a model-specific patch is needed in the `ml-cross-entropy` repo
+4. The multimodal `cce_forward` must accept all extra kwargs (pixel_values, mm_token_type_ids, etc.) and pop any that would conflict before calling `self.model()`
+
+### Common CCE pitfall
+If CCE appears active (log says "Applying Cut Cross Entropy") but peak VRAM doesn't decrease, check which class was patched. If the model loads as `ForConditionalGeneration` but CCE patched `ForCausalLM`, the patch is silently inactive.
+
+## MoE Models
+
+### Dense MLP vs MoE experts
+Some MoE models (e.g., Gemma4) have BOTH dense MLP layers and MoE expert layers at every decoder layer:
+- `gate_proj/up_proj/down_proj` → targets the **dense MLP** (`Gemma4TextMLP`)
+- `experts.gate_up_proj/experts.down_proj` → targets the **MoE experts** (`Gemma4TextExperts`)
+
+LoRA on the dense MLP works normally. Expert LoRA via `lora_target_parameters` requires PEFT support for the specific expert module type (may warn "Unsupported layer type").
+
+### ScatterMoE kernels
+`use_scattermoe: true` with `experts_implementation: scattermoe` registers fused expert kernels via transformers' `ExpertsInterface`. Significant speedup for MoE models. Requires the kernels plugin:
+```yaml
+plugins:
+  - axolotl.integrations.kernels.KernelsPlugin
+use_kernels: true
+use_scattermoe: true
+experts_implementation: scattermoe
+```
+
+## Where to Add Model-Specific Fixes
+
+| What | Where | Example |
+|------|-------|---------|
+| Missing forward inputs | `core/trainers/base.py` `compute_loss()` | mm_token_type_ids injection |
+| Attention mask fixes | `core/trainers/base.py` `compute_loss()` | Sample packing mask removal |
+| Loss logging fixes | `core/trainers/base.py` `__init__()` | model_accepts_loss_kwargs override |
+| PEFT/LoRA patches | `loaders/adapter.py` | ClippableLinear redirect |
+| Attention patches | `monkeypatch/attention/` | FA4 tuple fix |
+| Model-specific patches | `loaders/patch_manager.py` `_apply_model_specific_patches()` | Llama4, Kimi, NemotronH |
+| CCE patches | `ml-cross-entropy` repo `transformers/` | Per-model cce_forward |
+| Example configs | `examples/<model>/` | Validated YAML |
+| Config validation | `utils/schemas/validation.py` | Compatibility checks |
--- a/docs/agents/preference_tuning.md
+++ b/docs/agents/preference_tuning.md
@@ -38,7 +38,7 @@ No vLLM server needed (unlike GRPO). Offline RL with pre-collected preference da

 1. Paired preference data (chosen + rejected)?
   - Default → `rl: dpo`
-   - Overfitting → `rl: ipo`
+   - Overfitting → `rl: dpo, dpo_loss_type: ["ipo"]`
   - VRAM-limited → `rl: orpo` (no ref model)
   - Length-sensitive → `rl: simpo` (no ref model)
 2. Only binary labels (good/bad)? → `rl: kto`
--- a/docs/agents/sft.md
+++ b/docs/agents/sft.md
@@ -91,6 +91,30 @@ Watch for: loss never decreasing (check `train_on_inputs`, dataset, LR), loss go
 | FSDP save hangs | Use `fsdp_state_dict_type: FULL_STATE_DICT` |
 | DeepSpeed CheckpointError | Set `use_reentrant: true` in `gradient_checkpointing_kwargs` |

+## Profiling
+
+To profile training and identify optimization opportunities:
+
+```yaml
+# Profile steps 3-7 (after warmup/autotuning settles)
+profiler_steps_start: 3
+profiler_steps: 5
+```
+
+This produces `profiler_trace.json` (Chrome trace) and `snapshot.pickle` (memory snapshot) in `output_dir`.
+View the Chrome trace at `chrome://tracing`.
+
+To programmatically inspect the trace:
+```bash
+python scripts/analyze_profile.py output_dir/
+```
+
+The trace shows per-kernel CUDA times, memory allocations, and operator-level breakdown. Look for:
+- **Large matmul kernels**: candidates for fusion or quantization
+- **Memory copies (H2D/D2H)**: unnecessary data movement
+- **Small frequent kernels**: candidates for kernel fusion
+- **Gaps between kernels**: pipeline bubbles from CPU overhead
+
 Full troubleshooting: [training_stability.qmd](../training_stability.qmd), [debugging.qmd](../debugging.qmd)

 ## File Map
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -108,6 +108,14 @@ datasets:
    type: chat_template
 ```

+::: {.callout-tip}
+`chat_template_jinja` also accepts a file path to a `.jinja2` file instead of an inline string:
+
+```yaml
+chat_template_jinja: ./path/to/my_template.jinja2
+```
+:::
+
 ::: {.callout-important}
 Please make sure that your `tokenizer.eos_token` is same as EOS (End-of-Sequence) token in template. Otherwise, set `eos_token` under `special_tokens: `.
 :::
@@ -294,6 +302,113 @@ datasets:
 It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
 :::

+#### Content parts with per-part training control
+
+Instead of using character offsets with `train_detail`, you can split a message's content into a list of parts, each with its own training flag. This is useful when you want to mask specific sections of a response (e.g., mask reasoning but train on the answer).
+
+```{.json filename="data.jsonl"}
+{
+  "messages": [
+    {"role": "user", "content": [{"type": "text", "text": "What is 2+2?"}]},
+    {
+      "role": "assistant",
+      "content": [
+        {"type": "text", "text": "Let me think step by step...", "train": false},
+        {"type": "text", "text": " The answer is 4.", "train": true}
+      ]
+    }
+  ]
+}
+```
+
+The configuration is the same as standard `chat_template` — no extra fields needed:
+
+```yaml
+datasets:
+  - path: ...
+    type: chat_template
+    roles_to_train: ["assistant"]
+```
+
+Each content part supports:
+
+- `type`: `"text"` (required)
+- `text`: the text value (also accepts `content` or `value` as the key)
+- `train`: `true`/`false` (optional) — whether to train on this part
+- `weight`: `0`/`1` (optional) — alternative to `train`
+
+If a part has no `train` or `weight` flag, it inherits the turn-level training decision (from `roles_to_train`, `message_field_training`, or `train_on_inputs`).
+
+::: {.callout-warning title="Whitespace at part boundaries"}
+BPE tokenizers (used by Llama, Qwen, Mistral, GPT, etc.) prepend spaces to word tokens. For example, `" answer"` is a single token — the space is part of it. This means **where you place whitespace between content parts matters**:
+
+**Split BEFORE spaces** (space goes with the next part):
+
+```json
+[
+  {"type": "text", "text": "Let me think...", "train": false},
+  {"type": "text", "text": " The answer is 4.", "train": true}
+]
+```
+
+**DON'T put trailing spaces** on a part (the space merges with the next word into one token that straddles the boundary, and straddling tokens are masked):
+
+```json
+[
+  {"type": "text", "text": "Let me think... ", "train": false},
+  {"type": "text", "text": "The answer is 4.", "train": true}
+]
+```
+
+In the bad example, `" The"` becomes a single token that spans both parts. Because it straddles the boundary, it is conservatively **masked** (not trained) — even though the second part has `train: true`.
+
+**Newlines** typically merge with preceding punctuation (e.g., `":\n"` is one token). Keep newlines with the preceding part:
+
+```json
+[
+  {"type": "text", "text": "Thinking:\n", "train": false},
+  {"type": "text", "text": "The answer is 4.", "train": true}
+]
+```
+
+Axolotl will log a warning if it detects trailing whitespace at a boundary between parts with different training flags.
+:::
+
+::: {.callout-note}
+When all content parts in a message are strings, they are concatenated before being passed to the chat template. This means content parts work with **any** Jinja template — the template sees a plain string, and the per-part training flags are applied during tokenization.
+:::
+
+##### Per-part training on reasoning_content
+
+For templates that support a separate `reasoning_content` field (e.g., `qwen3`), the same content-parts format works on `reasoning_content`. This is useful for masking incorrect reasoning steps while training on self-corrections:
+
+```{.json filename="data.jsonl"}
+{
+  "messages": [
+    {"role": "user", "content": [{"type": "text", "text": "What is 2+2?"}]},
+    {
+      "role": "assistant",
+      "reasoning_content": [
+        {"type": "text", "text": "Hmm maybe 2+2=5.", "train": false},
+        {"type": "text", "text": " Wait no, 2+2=4.", "train": true}
+      ],
+      "content": [
+        {"type": "text", "text": "The answer is 4.", "train": true}
+      ]
+    }
+  ]
+}
+```
+
+The `reasoning_content` and `content` fields are handled independently — each has its own token boundaries and per-part masking. No additional configuration is needed beyond what the template already requires.
+
+::: {.callout-tip}
+When `reasoning_content` is provided as a separate field, `split_thinking` is not needed — the reasoning is already separated from the content in the data.
+:::
+
+The same whitespace rules apply to `reasoning_content` parts as to `content` parts — split before spaces, keep newlines with the preceding part.
+
+
 #### Reasoning split

 (For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
--- a/docs/debugging.qmd
+++ b/docs/debugging.qmd
@@ -76,8 +76,10 @@ datasets:
 Make sure you have an [editable install](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) of Axolotl, which ensures that changes you make to the code are reflected at runtime.  Run the following commands from the root of this project:

 ```bash
-pip3 install packaging
-pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
+export UV_TORCH_BACKEND=cu128  # or cu130
+uv venv --no-project --relocatable
+source .venv/bin/activate
+uv pip install --no-build-isolation -e '.[deepspeed]' --group dev --group test
 ```

 #### Remote Hosts
@@ -208,17 +210,18 @@ cd axolotl
 Next, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:[^2]

 ```bash
-docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-py3.10-cu118-2.0.1
+docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl-uv:main-latest
 ```

 >[!Tip]
 > To understand which containers are available, see the [Docker section of the README](../README.md#docker) and the [DockerHub repo](https://hub.docker.com/r/axolotlai/axolotl/tags).  For details of how the Docker containers are built, see axolotl's [Docker CI builds](../.github/workflows/main.yml).

-You will now be in the container.  Next, perform an editable install of Axolotl:
+You will now be in the container.  Next, install Axolotl with dev dependencies:

 ```bash
-pip3 install packaging
-pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
+uv venv --no-project --relocatable
+source .venv/bin/activate
+uv pip install --no-build-isolation -e '.[deepspeed]' --group dev --group test
 ```

 ### Attach To Container
--- a/docs/docker.qmd
+++ b/docs/docker.qmd
@@ -6,23 +6,33 @@ format:
    toc-depth: 4
 ---

-This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).
+This section describes the different Docker images that are released by AxolotlAI at
+[Docker Hub](https://hub.docker.com/u/axolotlai).

 ::: {.callout-important}
-For Blackwell GPUs, please use the tags with PyTorch 2.7.1 and CUDA 12.8.
+### Switch to the `-uv` images
+
+Each image below ships a **uv variant** that uses [uv](https://docs.astral.sh/uv/) with a relocatable venv
+(`/workspace/axolotl-venv`) instead of Miniconda + pip. Append `-uv` to the image name
+(e.g. `axolotlai/axolotl-uv`, `axolotlai/axolotl-base-uv`, `axolotlai/axolotl-cloud-uv`). Tags follow the
+same format as their non-uv counterparts.
+
+**We recommend switching to the `-uv` images early.** In the near future we will publish the uv-based
+build to the non-uv tags as well. The non-uv names will continue to work, but they will start serving
+the uv image.
 :::

 ## Base

-The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.
+The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image.
+It includes python, torch, git, git-lfs, awscli, pydantic, and more.

 #### Image

-```
-axolotlai/axolotl-base
-```
-
-Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-base)
+| Variant | Image | Docker Hub |
+|---------|-------|------------|
+| pip | `axolotlai/axolotl-base` | [Link](https://hub.docker.com/r/axolotlai/axolotl-base) |
+| uv | `axolotlai/axolotl-base-uv` | [Link](https://hub.docker.com/r/axolotlai/axolotl-base-uv) |

 #### Tags format

@@ -32,8 +42,10 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}

 Tags examples:

- `main-base-py3.11-cu128-2.8.0`
 - `main-base-py3.11-cu128-2.9.1`
+- `main-base-py3.12-cu128-2.10.0`
+- `main-base-py3.12-cu130-2.9.1`
+- `main-base-py3.12-cu130-2.10.0`

 ## Main

@@ -41,11 +53,10 @@ The main image is the image that is used to run Axolotl. It is based on the `axo

 #### Image

-```
-axolotlai/axolotl
-```
-
-Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl)
+| Variant | Image | Docker Hub |
+|---------|-------|------------|
+| pip | `axolotlai/axolotl` | [Link](https://hub.docker.com/r/axolotlai/axolotl) |
+| uv | `axolotlai/axolotl-uv` | [Link](https://hub.docker.com/r/axolotlai/axolotl-uv) |

 #### Tags format {#sec-main-tags}

@@ -53,7 +64,7 @@ Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl)
 # on push to main
 main-py{python_version}-cu{cuda_version}-{pytorch_version}

-# latest main (currently torch 2.6.0, python 3.11, cuda 12.4)
+# latest main (currently torch 2.9.1, python 3.11, cuda 12.8)
 main-latest

 # nightly build
@@ -71,12 +82,13 @@ There may be some extra tags appended to the image, like `-vllm` which installs

 Tags examples:

- `main-py3.11-cu128-2.8.0`
 - `main-py3.11-cu128-2.9.1`
+- `main-py3.12-cu128-2.10.0`
+- `main-py3.12-cu130-2.9.1`
+- `main-py3.12-cu130-2.10.0`
 - `main-latest`
- `main-20250303-py3.11-cu124-2.6.0`
- `main-20250303-py3.11-cu126-2.6.0`
- `0.12.0`
+- `main-20260315-py3.11-cu128-2.9.1`
+- `0.16.1`

 ## Cloud

@@ -90,11 +102,10 @@ Jupyter lab is run by default. Set `JUPYTER_DISABLE=1` in the environment variab

 #### Image

-```
-axolotlai/axolotl-cloud
-```
-
-Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-cloud)
+| Variant | Image | Docker Hub |
+|---------|-------|------------|
+| pip | `axolotlai/axolotl-cloud` | [Link](https://hub.docker.com/r/axolotlai/axolotl-cloud) |
+| uv | `axolotlai/axolotl-cloud-uv` | [Link](https://hub.docker.com/r/axolotlai/axolotl-cloud-uv) |

 #### Tags format

--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -57,7 +57,7 @@ description: Frequently asked questions

 **Q: vLLM is not working with Axolotl**

-> A: We currently recommend torch 2.6.0 for use with `vllm`. Please ensure you use the right version. For Docker, please use the `main-py3.11-cu124-2.6.0` tag.
+> A: We currently recommend torch 2.10 for use with `vllm`. Please ensure you use the right version. For Docker, please use the `main-py3.12-cu128-2.10.0` tag (note: torch 2.10 images are built with Python 3.12).

 **Q: FA2 2.8.0 `undefined symbol` runtime error on CUDA 12.4**

--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -15,64 +15,30 @@ This guide covers all the ways you can install and set up Axolotl for your envir

 - NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
 - Python ≥3.11
- PyTorch ≥2.6.0
+- PyTorch ≥2.9.1

-## Installation Methods {#sec-installation-methods}
-
-::: {.callout-important}
-Please make sure to have Pytorch installed before installing Axolotl in your local environment.
-
-Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
-:::
+## Installation {#sec-installation}

 ::: {.callout-important}
 For Blackwell GPUs, please use Pytorch 2.9.1 and CUDA 12.8.
 :::

-### PyPI Installation (Recommended) {#sec-pypi}
+### Quick Install {#sec-uv}

-```{.bash}
-pip3 install -U packaging setuptools wheel ninja
-pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
-```
+Axolotl uses [uv](https://docs.astral.sh/uv/) as its package manager. uv is a fast, reliable Python package installer and resolver built in Rust.

-We use `--no-build-isolation` in order to detect the installed PyTorch version (if
-installed) in order not to clobber it, and so that we set the correct version of
-dependencies that are specific to the PyTorch version or other installed
-co-dependencies.
-
-### uv Installation {#sec-uv}
-
-uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments.
-
-Install uv if not already installed
+Install uv if not already installed:
 ```{.bash}
 curl -LsSf https://astral.sh/uv/install.sh | sh
 source $HOME/.local/bin/env
 ```

-Choose your CUDA version to use with PyTorch; e.g. `cu124`, `cu126`, `cu128`,
-then create the venv and activate
+Choose your CUDA version (e.g. `cu128`, `cu130`), create a venv, and install:
 ```{.bash}
-export UV_TORCH_BACKEND=cu126
-uv venv --no-project --relocatable
+export UV_TORCH_BACKEND=cu128  # or cu130
+uv venv
 source .venv/bin/activate
-```
-
-Install PyTorch
- PyTorch 2.6.0 recommended
-```{.bash}
-uv pip install packaging setuptools wheel
-uv pip install torch==2.6.0
-uv pip install awscli pydantic
-```
-
-Install axolotl from PyPi
-```{.bash}
-uv pip install --no-build-isolation axolotl[deepspeed,flash-attn]
-
-# optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO
-uv pip install --no-build-isolation axolotl[deepspeed,flash-attn,vllm]
+uv pip install --no-build-isolation axolotl[deepspeed]
 ```

 ### Edge/Development Build {#sec-edge-build}
@@ -82,14 +48,16 @@ For the latest features between releases:
 ```{.bash}
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
-pip3 install -U packaging setuptools wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
+export UV_TORCH_BACKEND=cu128  # or cu130
+uv venv
+source .venv/bin/activate
+uv pip install --no-build-isolation -e '.[deepspeed]'
 ```

 ### Docker {#sec-docker}

 ```{.bash}
-docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
+docker run --gpus '"all"' --rm -it --ipc=host axolotlai/axolotl-uv:main-latest
 ```

 For development with Docker:
@@ -106,12 +74,12 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
  --ulimit memlock=-1 --ulimit stack=67108864 \
  --mount type=bind,src="${PWD}",target=/workspace/axolotl \
  -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-  axolotlai/axolotl:main-latest
+  axolotlai/axolotl-uv:main-latest
 ```
 :::

 ::: {.callout-important}
-For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.9.1` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.9.1`.
+For Blackwell GPUs, please use `axolotlai/axolotl-uv:main-py3.11-cu128-2.9.1` or the cloud variant `axolotlai/axolotl-cloud-uv:main-py3.11-cu128-2.9.1`.
 :::

 Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.
@@ -122,7 +90,7 @@ Please refer to the [Docker documentation](docker.qmd) for more information on t

 For providers supporting Docker:

- Use `axolotlai/axolotl-cloud:main-latest`
+- Use `axolotlai/axolotl-cloud-uv:main-latest`
 - Available on:
    - [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
    - [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=axolotl&utm_medium=partner&utm_campaign=template_launch_july2025&utm_content=docs_link)
@@ -141,7 +109,7 @@ For providers supporting Docker:
 ### macOS {#sec-macos}

 ```{.bash}
-pip3 install --no-build-isolation -e '.'
+uv pip install --no-build-isolation -e '.'
 ```

 See @sec-troubleshooting for Mac-specific issues.
@@ -152,21 +120,44 @@ See @sec-troubleshooting for Mac-specific issues.
 We recommend using WSL2 (Windows Subsystem for Linux) or Docker.
 :::

-## Environment Managers {#sec-env-managers}
+## Migrating from pip to uv {#sec-migrating}

-### Conda/Pip venv {#sec-conda}
+If you have an existing pip-based Axolotl installation, you can migrate to uv:

-1. Install Python ≥3.11
-2. Install PyTorch: https://pytorch.org/get-started/locally/
-3. Install Axolotl:
-   ```{.bash}
-   pip3 install -U packaging setuptools wheel ninja
-   pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
-   ```
-4. (Optional) Login to Hugging Face:
-   ```{.bash}
-   hf auth login
-   ```
+```{.bash}
+# Install uv
+curl -LsSf https://astral.sh/uv/install.sh | sh
+source $HOME/.local/bin/env
+
+# Create a fresh venv (recommended for a clean start)
+export UV_TORCH_BACKEND=cu128  # or cu130
+uv venv
+source .venv/bin/activate
+
+# Reinstall axolotl
+uv pip install --no-build-isolation axolotl[deepspeed]
+```
+
+## Using pip (Alternative) {#sec-pip}
+
+If you are unable to install uv, you can still use pip directly.
+
+::: {.callout-important}
+Please make sure to have PyTorch installed before installing Axolotl with pip.
+
+Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
+:::
+
+```{.bash}
+pip3 install -U packaging setuptools wheel ninja
+pip3 install --no-build-isolation axolotl[deepspeed]
+```
+
+For editable/development installs:
+```{.bash}
+pip3 install -U packaging setuptools wheel ninja
+pip3 install --no-build-isolation -e '.[deepspeed]'
+```

 ## Troubleshooting {#sec-troubleshooting}

--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -8,6 +8,7 @@ format:

 ## Supported Models

+- [Gemma-4](#sec-gemma-4) *(NEW)*
 - [Mllama](#sec-mllama)
 - [Llama4](#sec-llama4)
 - [Pixtral](#sec-pixtral)
@@ -138,6 +139,40 @@ base_model: mistralai/Voxtral-Mini-3B-2507
 processor_type: VoxtralProcessor
 ```

+### Gemma-4 {#sec-gemma-4}
+
+All Gemma 4 variants (E2B, E4B, 26B-A4B, 31B) load as multimodal models even for text-only training.
+
+```yaml
+base_model: google/gemma-4-E2B-it  # or E4B-it, 26B-A4B, 31B
+
+chat_template: gemma4
+freeze_mm_modules: true  # freeze vision/audio encoders for text-only or vision LoRA
+
+# For the 26B-A4B MoE model, enable ScatterMoE and expert LoRA:
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+  - axolotl.integrations.kernels.KernelsPlugin
+use_kernels: true
+use_scattermoe: true
+experts_implementation: scattermoe
+
+lora_target_modules: 'model.language_model.layers.[\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'
+
+# MoE expert LoRA (3D tensors, not nn.Linear) — only for 26B-A4B:
+lora_target_parameters:
+  - experts.gate_up_proj
+  - experts.down_proj
+```
+
+::: {.callout-warning}
+Gemma 4 VLM training starts with high loss (~8-15). This is expected — see the [training stability guide](training_stability.qmd) for details.
+:::
+
+::: {.callout-tip}
+For DDP training, axolotl auto-detects Gemma4 and sets `use_reentrant=False` and `ddp_find_unused_parameters=True`. However, when `activation_offloading: true`, `ddp_find_unused_parameters` is skipped (checkpoint wrappers conflict with it); use `freeze_mm_modules: true` instead to handle unused vision/audio params. For FSDP2, use `fsdp_transformer_layer_cls_to_wrap: Gemma4TextDecoderLayer`.
+:::
+
 ### Gemma-3 {#sec-gemma-3}

 ::: {.callout-tip}
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -320,8 +320,10 @@ The input format is a simple JSON input with customizable fields based on the ab
 As IPO is just DPO with a different loss function, all supported dataset formats for [DPO](#dpo) are also supported for IPO.

 ```yaml
-rl: ipo
+rl: dpo
+dpo_loss_type: ["ipo"]
 ```
+*Note:* Passing `rl: ipo` directly is still supported, but will soon be deprecated.

 ### ORPO

--- a/docs/unsloth.qmd
+++ b/docs/unsloth.qmd
@@ -1,53 +0,0 @@
---
-title: "Unsloth"
-description: "Hyper-optimized QLoRA finetuning for single GPUs"
---
-
-### Overview
-
-Unsloth provides hand-written optimized kernels for LLM finetuning that slightly improve speed and VRAM over
-standard industry baselines.
-
-::: {.callout-important}
-Due to breaking changes in transformers `v4.48.0`, users will need to downgrade to `<=v4.47.1` to use this patch.
-
-This will later be deprecated in favor of [LoRA Optimizations](lora_optims.qmd).
-:::
-
-
-### Installation
-
-The following will install the correct unsloth and extras from source.
-
-```bash
-python scripts/unsloth_install.py | sh
-```
-
-### Usage
-
-Axolotl exposes a few configuration options to try out unsloth and get most of the performance gains.
-
-Our unsloth integration is currently limited to the following model architectures:
- - llama
-
-These options are specific to LoRA finetuning and cannot be used for multi-GPU finetuning
-```yaml
-unsloth_lora_mlp: true
-unsloth_lora_qkv: true
-unsloth_lora_o: true
-```
-
-These options are composable and can be used with multi-gpu finetuning
-```yaml
-unsloth_cross_entropy_loss: true
-unsloth_rms_norm: true
-unsloth_rope: true
-```
-
-### Limitations
-
- Single GPU only; e.g. no multi-gpu support
- No deepspeed or FSDP support (requires multi-gpu)
- LoRA + QLoRA support only. No full fine tunes or fp8 support.
- Limited model architecture support. Llama, Phi, Gemma, Mistral only
- No MoE support.
--- a/examples/LiquidAI/README.md
+++ b/examples/LiquidAI/README.md
@@ -15,8 +15,7 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r
    Here is an example of how to install from pip:
    ```bash
    # Ensure you have a compatible version of Pytorch installed
-    pip3 install packaging setuptools wheel ninja
-    pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+    uv pip install --no-build-isolation 'axolotl>=0.16.1'
    ```

 2.  Run one of the finetuning examples below.
@@ -35,7 +34,7 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r

    **LFM2-MoE**
    ```bash
-    pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
+    uv pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6

    # LoRA SFT (1x48GB @ 16.2GiB)
    axolotl train examples/LiquidAI/lfm2-8b-a1b-lora.yaml
@@ -45,7 +44,7 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r

 - **Installation Error**: If you encounter `ImportError: ... undefined symbol ...` or `ModuleNotFoundError: No module named 'causal_conv1d_cuda'`, the `causal-conv1d` package may have been installed incorrectly. Try uninstalling it:
  ```bash
-  pip uninstall -y causal-conv1d
+  uv pip uninstall causal-conv1d
  ```

 - **Dataset Loading**: Read more on how to load your own dataset in our [documentation](https://docs.axolotl.ai/docs/dataset_loading.html).
--- a/examples/apertus/README.md
+++ b/examples/apertus/README.md
@@ -11,12 +11,11 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
    Here is an example of how to install from main for pip:

 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
+# Ensure you have Pytorch installed (Pytorch 2.9.1 min)
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn]'
+uv pip install --no-build-isolation -e '.'

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
@@ -31,7 +30,7 @@ python scripts/cutcrossentropy_install.py | sh
 # For those using our Docker image, use the below path.
 export CUDA_HOME=/usr/local/cuda

-pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
+uv pip install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
 ```

 For any installation errors, see [XIELU Installation Issues](#xielu-installation-issues)
@@ -67,7 +66,7 @@ If those didn't help, please try the below solutions:
 1. Pass env for CMAKE and try install again:

    ```bash
-    Python_EXECUTABLE=$(which python) pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
+    Python_EXECUTABLE=$(which python) uv pip install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
    ```

 2. Git clone the repo and manually hardcode python path:
@@ -92,7 +91,7 @@ If those didn't help, please try the below solutions:
    ```

    ```bash
-    pip3 install . --no-build-isolation --no-deps
+    uv pip install . --no-build-isolation --no-deps
    ```

 ## Optimization Guides
--- a/examples/arcee/README.md
+++ b/examples/arcee/README.md
@@ -13,12 +13,11 @@ Thanks to the team at Arcee.ai for using Axolotl in supervised fine-tuning the A
    Here is an example of how to install from main for pip:

 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
+# Ensure you have Pytorch installed (Pytorch 2.9.1 min)
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn]'
+uv pip install --no-build-isolation -e '.'

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -36,12 +36,7 @@
    "id": "msOCO4NRmRLa"
   },
   "outputs": [],
-   "source": [
-    "%%capture\n",
-    "# This step can take ~5-10 minutes to install dependencies\n",
-    "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
-    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@63b15e6\""
-   ]
+   "source": "%%capture\n# This step can take ~5-10 minutes to install dependencies\n!pip install --no-build-isolation \"axolotl>=0.16.1\"\n!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@fec1a88\""
  },
  {
   "cell_type": "markdown",
--- a/examples/devstral/README.md
+++ b/examples/devstral/README.md
@@ -15,9 +15,8 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
    Here is an example of how to install from pip:

 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+# Ensure you have Pytorch installed (Pytorch 2.9.1 min)
+uv pip install --no-build-isolation 'axolotl>=0.16.1'
 ```

 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage
--- a/examples/gemma3/gemma-3-1b-qlora.yml
+++ b/examples/gemma3/gemma-3-1b-qlora.yml
@@ -26,8 +26,8 @@ output_dir: ./outputs/out

 # Freeze vision tower
 unfrozen_parameters:
-  - ^model\.language_model\..*
-  - ^lm_head\..*
+  - ^model.language_model.*
+  - ^lm_head.*

 adapter: qlora
 lora_r: 32
--- a/examples/gemma3/gemma-3-270m-qlora.yml
+++ b/examples/gemma3/gemma-3-270m-qlora.yml
@@ -26,8 +26,8 @@ output_dir: ./outputs/out

 # Freeze vision tower
 unfrozen_parameters:
-  - ^model\.language_model\..*
-  - ^lm_head\..*
+  - ^model.language_model.*
+  - ^lm_head.*

 adapter: qlora
 lora_r: 32
--- a/examples/gemma3/gemma-3-4b-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-qlora.yml
@@ -22,8 +22,8 @@ output_dir: ./outputs/out

 # Freeze vision tower
 unfrozen_parameters:
-  - ^model\.language_model\..*
-  - ^lm_head\..*
+  - ^model.language_model.*
+  - ^lm_head.*

 adapter: qlora
 lora_model_dir:
--- a/examples/gemma3n/README.md
+++ b/examples/gemma3n/README.md
@@ -9,18 +9,17 @@ Gemma-3n is a family of multimodal models from Google found on [HuggingFace](htt
    Here is an example of how to install from pip:

 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+# Ensure you have Pytorch installed (Pytorch 2.9.1 min)
+uv pip install --no-build-isolation 'axolotl>=0.16.1'
 ```

 2. In addition to Axolotl's requirements, Gemma-3n requires:

 ```bash
-pip3 install timm==1.0.17
+uv pip install timm==1.0.17

 # for loading audio data
-pip3 install librosa==0.11.0
+uv pip install librosa==0.11.0
 ```

 3. Download sample dataset files
--- a/examples/gemma4/26b-a4b-moe-qlora.yaml
+++ b/examples/gemma4/26b-a4b-moe-qlora.yaml
@@ -0,0 +1,93 @@
+# Gemma 4 26B-A4B MoE QLoRA with ScatterMoE kernels
+#
+# Validated: 50 steps on FineTome-100k, loss 8.8 -> 1.8, single RTX 5090 (32GB)
+# torch_compile=true: 21 GiB peak VRAM, ~230 tok/s, 336s total
+#
+# Key notes:
+# - Max sequence length on 32GB GPU: 2048 (micro_batch_size=1, SDP attention).
+#   4096 seq_len OOMs due to head_dim=512 math SDP materializing full score matrix.
+#   Use 48GB+ GPUs for longer sequences or multi-GPU with FSDP.
+
+base_model: google/gemma-4-26B-A4B
+
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+  - axolotl.integrations.kernels.KernelsPlugin
+  - axolotl.integrations.liger.LigerPlugin
+use_kernels: true
+use_scattermoe: true
+experts_implementation: scattermoe
+torch_compile: true
+liger_layer_norm: true
+liger_rope: true
+liger_rms_norm: true
+liger_glu_activation: true
+liger_rms_norm_gated: true
+strict: false
+
+chat_template: gemma4
+datasets:
+  - path: mlabonne/FineTome-100k
+    type: chat_template
+    split: train[:10%]
+    field_messages: conversations
+    message_property_mappings:
+      role: from
+      content: value
+val_set_size: 0.05
+output_dir: ./outputs/gemma4-26b-a4b-qlora
+
+sequence_len: 2048
+sample_packing: true
+
+load_in_4bit: true
+quantize_moe_experts: true
+adapter: qlora
+lora_r: 16
+lora_alpha: 32
+lora_dropout: 0
+
+# Restrict LoRA to text backbone only (skip vision/audio encoders)
+# using a regex that matches only the text decoder's attention and MLP projections.
+lora_target_modules: 'model.language_model.layers.[\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'
+
+# MoE expert LoRA (3D Parameter tensors, not nn.Linear)
+lora_target_parameters:
+  - experts.gate_up_proj
+  - experts.down_proj
+
+lora_mlp_kernel: false
+lora_qkv_kernel: false
+lora_o_kernel: false
+
+bnb_config_kwargs:
+  bnb_4bit_use_double_quant: true
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_torch_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: true
+
+gradient_checkpointing: true
+activation_offloading: true
+logging_steps: 1
+
+# FA2 not supported
+sdp_attention: true
+
+warmup_ratio: 0.1
+evals_per_epoch: 4
+saves_per_epoch: 1
+weight_decay: 0.0
+special_tokens:
--- a/examples/gemma4/31b-qlora-flex.yaml
+++ b/examples/gemma4/31b-qlora-flex.yaml
@@ -0,0 +1,71 @@
+base_model: google/gemma-4-31B
+
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+  - axolotl.integrations.liger.LigerPlugin
+torch_compile: true
+liger_layer_norm: true
+liger_rope: true
+liger_rms_norm: true
+liger_glu_activation: true
+liger_rms_norm_gated: true
+strict: false
+
+chat_template: gemma4
+datasets:
+  - path: mlabonne/FineTome-100k
+    type: chat_template
+    split: train[:10%]
+    field_messages: conversations
+    message_property_mappings:
+      role: from
+      content: value
+val_set_size: 0.05
+output_dir: ./outputs/gemma4-31b-qlora-flex
+
+sequence_len: 2048
+sample_packing: true
+
+load_in_4bit: true
+adapter: qlora
+lora_r: 16
+lora_alpha: 32
+lora_dropout: 0
+
+# Restrict LoRA to text backbone only (skip vision/audio encoders)
+lora_target_modules: 'model.language_model.layers.[\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'
+
+lora_mlp_kernel: false
+lora_qkv_kernel: false
+lora_o_kernel: false
+
+bnb_config_kwargs:
+  bnb_4bit_use_double_quant: true
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+optimizer: adamw_torch_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: true
+
+gradient_checkpointing: true
+activation_offloading: true
+logging_steps: 1
+
+# FA not supported
+flex_attention: true
+
+warmup_ratio: 0.1
+evals_per_epoch: 4
+saves_per_epoch: 1
+weight_decay: 0.0
+special_tokens:
--- a/examples/gemma4/31b-qlora.yaml
+++ b/examples/gemma4/31b-qlora.yaml
@@ -0,0 +1,69 @@
+base_model: google/gemma-4-31B
+
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+  - axolotl.integrations.liger.LigerPlugin
+torch_compile: false
+liger_layer_norm: true
+liger_rope: true
+liger_rms_norm: true
+liger_glu_activation: true
+liger_rms_norm_gated: true
+strict: false
+
+chat_template: gemma4
+datasets:
+  - path: mlabonne/FineTome-100k
+    type: chat_template
+    split: train[:10%]
+    field_messages: conversations
+    message_property_mappings:
+      role: from
+      content: value
+val_set_size: 0.05
+output_dir: ./outputs/gemma4-31b-qlora
+
+sequence_len: 2048
+sample_packing: true
+
+load_in_4bit: true
+adapter: qlora
+lora_r: 16
+lora_alpha: 32
+lora_dropout: 0
+
+# Restrict LoRA to text backbone only (skip vision/audio encoders)
+# using a regex that matches only the text decoder's attention and MLP projections.
+lora_target_modules: 'model.language_model.layers.[\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'
+
+bnb_config_kwargs:
+  bnb_4bit_use_double_quant: true
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_torch_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: true
+
+gradient_checkpointing: true
+activation_offloading: true
+logging_steps: 1
+
+# FA not supported
+sdp_attention: true
+
+warmup_ratio: 0.1
+evals_per_epoch: 4
+saves_per_epoch: 1
+weight_decay: 0.0
+special_tokens:
--- a/examples/gemma4/README.md
+++ b/examples/gemma4/README.md
@@ -0,0 +1,60 @@
+# Finetune Google's Gemma 4 with Axolotl
+
+[Gemma 4](https://huggingface.co/collections/google/gemma-4) is a family of multimodal models from Google. This guide covers how to train them with Axolotl.
+
+## Getting started
+
+1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
+
+2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
+
+3. Run one of the finetuning examples:
+
+```bash
+# 26B MoE QLoRA (1x80GB @ ~50 GiB)
+axolotl train examples/gemma4/26b-a4b-moe-qlora.yaml
+
+# 31B Dense QLoRA (1x80GB @ ~44 GiB)
+axolotl train examples/gemma4/31b-qlora.yaml
+
+# 31B Dense QLoRA Flex Attn (1x80GB @ ~26 GiB)
+axolotl train examples/gemma4/31b-qlora-flex.yaml
+```
+
+### MoE Expert Quantization & Expert LoRA (26B-A4B only)
+
+The 26B-A4B config uses ScatterMoE kernels via the transformers `ExpertsInterface` and quantizes expert weights on load. To learn about expert quantization, expert LoRA targeting, and related limitations, see the [MoE Expert Quantization](https://docs.axolotl.ai/docs/expert_quantization.html) docs.
+
+## Flex Attention
+
+Reduce VRAM usage by ~40% (at the cost of up to half the throughput) by setting the options below (shown in `examples/gemma4/31b-qlora-flex.yaml`):
+
+```yaml
+torch_compile: true
+flex_attention: true
+```
+
+This works for both the MoE and Dense models.
+
+## Limitations
+
+- **Flash Attention**: FA2 (max head_dim=256) and FA4 (max head_dim=128) cannot support Gemma 4's `global_head_dim=512`. Use SDP or flex attention instead.
+- **LoRA kernels**: Not supported due to KV-sharing layers.
+- **lora_target_linear**: Incompatible with multimodal models — use `lora_target_modules` with a regex to restrict LoRA to the text backbone.
+
+### TIPS
+
+- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
+- You can run full finetuning by removing `adapter: qlora`, `load_in_4bit: true`, and `quantize_moe_experts: true` from the config. This is heavy and has not been tested.
+
+## Optimization Guides
+
+Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
+
+## Related Resources
+
+- [Gemma 4 Blog](https://huggingface.co/blog/gemma4)
+- [Axolotl Docs](https://docs.axolotl.ai)
+- [Axolotl Website](https://axolotl.ai)
+- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
+- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
--- a/examples/gemma4/e2b-vision-lora.yaml
+++ b/examples/gemma4/e2b-vision-lora.yaml
@@ -0,0 +1,62 @@
+# Gemma 4 E2B Vision LoRA
+#
+# Fine-tuning LM LoRA adapters on multimodal Gemma4 with vision/multimodal modules frozen.
+# Uses the base ProcessingStrategy (auto-detects image_token from processor).
+
+base_model: google/gemma-4-E2B-it
+processor_type: AutoProcessor
+freeze_mm_modules: true
+
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+strict: false
+
+# Required for vision/multimodal training
+skip_prepare_dataset: true
+remove_unused_columns: false
+sample_packing: false
+
+chat_template: gemma4
+datasets:
+  - path: HuggingFaceH4/llava-instruct-mix-vsft
+    type: chat_template
+    split: train[:100]
+
+val_set_size: 0
+output_dir: ./outputs/gemma4-e2b-vision-lora
+
+adapter: lora
+sequence_len: 2048
+pad_to_sequence_len: false
+
+lora_r: 16
+lora_alpha: 32
+lora_dropout: 0
+# Target language model only — vision encoder is frozen via freeze_mm_modules
+lora_target_modules: 'model.language_model.layers.[\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'
+
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+num_epochs: 1
+max_steps: 10
+optimizer: adamw_torch_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: true
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+logging_steps: 1
+sdp_attention: true
+
+warmup_ratio: 0.1
+weight_decay: 0.0
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
--- a/examples/gpt-oss/README.md
+++ b/examples/gpt-oss/README.md
@@ -13,9 +13,8 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
    Here is an example of how to install from pip:

 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+# Ensure you have Pytorch installed (Pytorch 2.9.1 min)
+uv pip install --no-build-isolation 'axolotl>=0.16.1'
 ```

 2. Choose one of the following configs below for training the 20B model. (for 120B, see [below](#training-120b))
@@ -87,7 +86,7 @@ for more information about using a special vllm-openai docker image for inferenc
 Optionally, vLLM can be installed from nightly:

 ```bash
-pip install --no-build-isolation --pre -U vllm --extra-index-url https://wheels.vllm.ai/nightly
+uv pip install --no-build-isolation --pre -U vllm --extra-index-url https://wheels.vllm.ai/nightly
 ```
 and the vLLM server can be started with the following command (modify `--tensor-parallel-size 8` to match your environment):
 ```bash
--- a/examples/granite4/README.md
+++ b/examples/granite4/README.md
@@ -11,12 +11,11 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
    Here is an example of how to install from main for pip:

 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.7.1 min)
+# Ensure you have Pytorch installed (Pytorch 2.9.1 min)
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn]'
+uv pip install --no-build-isolation -e '.'

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
--- a/examples/hunyuan/README.md
+++ b/examples/hunyuan/README.md
@@ -9,12 +9,11 @@ Tencent released a family of opensource models called HunYuan with varying param
    Here is an example of how to install from main for pip:

 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
+# Ensure you have Pytorch installed (Pytorch 2.9.1 min)
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn]'
+uv pip install --no-build-isolation -e '.'

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
--- a/examples/internvl3_5/README.md
+++ b/examples/internvl3_5/README.md
@@ -11,7 +11,7 @@ This guide shows how to fine-tune it with Axolotl.
 2. Install `timm` for vision model support:

    ```bash
-    pip install timm==1.0.19
+    uv pip install timm==1.0.19
    ```

 3. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
--- a/examples/magistral/README.md
+++ b/examples/magistral/README.md
@@ -13,9 +13,8 @@ Thanks to the team at MistralAI for giving us early access to prepare for these
    Here is an example of how to install from pip:

 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.7.0 min)
-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+# Ensure you have Pytorch installed (Pytorch 2.9.1 min)
+uv pip install --no-build-isolation 'axolotl>=0.16.1'
 ```

 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage
--- a/examples/magistral/vision/README.md
+++ b/examples/magistral/vision/README.md
@@ -12,7 +12,7 @@ Before starting, ensure you have:

 1. Install the required vision lib:
    ```bash
-    pip install 'mistral-common[opencv]==1.8.5'
+    uv pip install 'mistral-common[opencv]==1.8.5'
    ```

 2. Download the example dataset image:
--- a/examples/ministral3/README.md
+++ b/examples/ministral3/README.md
@@ -23,7 +23,7 @@ Note: This is still experimental given it is based on transformers v5 RC.
    git checkout transformers-v5

    # Install packages for transformers v5
-    pip install -e .
+    uv pip install -e .
    ```

 4. Run the fine-tuning:
--- a/examples/ministral3/vision/README.md
+++ b/examples/ministral3/vision/README.md
@@ -12,7 +12,7 @@ Before starting, ensure you have:

 1. Install the required vision lib:
    ```bash
-    pip install 'mistral-common[opencv]==1.8.6'
+    uv pip install 'mistral-common[opencv]==1.8.6'
    ```

 2. Download the example dataset image:
--- a/examples/mistral-small/README.md
+++ b/examples/mistral-small/README.md
@@ -12,7 +12,7 @@ Before starting, ensure you have:

 1. Install the required vision lib:
    ```bash
-    pip install 'mistral-common[opencv]==1.8.5'
+    uv pip install 'mistral-common[opencv]==1.8.5'
    ```

 2. Download the example dataset image:
--- a/examples/mistral4/README.md
+++ b/examples/mistral4/README.md
@@ -13,7 +13,7 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
 3. Install transformers from main

  ```bash
-  pip install git+https://github.com/huggingface/transformers.git
+  uv pip install git+https://github.com/huggingface/transformers.git
  ```

 4. Run one of the example configs:
--- a/examples/nemotron-h/120b-a12b-qlora.yaml
+++ b/examples/nemotron-h/120b-a12b-qlora.yaml
@@ -1,5 +1,15 @@
 base_model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16

+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+  - axolotl.integrations.liger.LigerPlugin
+
+liger_layer_norm: true
+liger_rope: true
+liger_rms_norm: true
+liger_glu_activation: true
+liger_rms_norm_gated: true
+
 # LoRA kernel patches are incompatible with this architecture — see README.
 lora_mlp_kernel: false
 lora_qkv_kernel: false
@@ -22,8 +32,6 @@ dataset_prepared_path: last_run_prepared
 sequence_len: 4096
 sample_packing: true

-use_cut_cross_entropy: true
-
 load_in_4bit: true
 quantize_moe_experts: true
 adapter: qlora
@@ -31,16 +39,16 @@ lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.0
 lora_target_modules:
-  # Attention projection layers (present in ~12 attention layers out of 88)
  - q_proj
  - k_proj
  - v_proj
  - o_proj
-  # To also train MoE expert weights, add them via lora_target_parameters
-  # (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
-  #   lora_target_parameters:
-  #     - up_proj
-  #     - down_proj
+
+# To also train MoE expert weights, add them via lora_target_parameters
+# (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
+# lora_target_parameters:
+#   - up_proj
+#   - down_proj

 wandb_project:
 wandb_entity:
--- a/examples/nemotron-h/nano-30b-a3b-qlora.yaml
+++ b/examples/nemotron-h/nano-30b-a3b-qlora.yaml
@@ -1,6 +1,16 @@
 # See examples/nemotron-h/README.md for architecture notes and requirements.
 base_model: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16

+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+  - axolotl.integrations.liger.LigerPlugin
+
+liger_layer_norm: true
+liger_rope: true
+liger_rms_norm: true
+liger_glu_activation: true
+liger_rms_norm_gated: true
+
 # LoRA kernel patches are incompatible with this architecture — see README.
 lora_mlp_kernel: false
 lora_qkv_kernel: false
@@ -23,8 +33,6 @@ dataset_prepared_path: last_run_prepared
 sequence_len: 4096
 sample_packing: true

-use_cut_cross_entropy: true
-
 load_in_4bit: true
 quantize_moe_experts: true
 adapter: qlora
@@ -36,11 +44,12 @@ lora_target_modules:
  - k_proj
  - v_proj
  - o_proj
-  # To also train MoE expert weights, add them via lora_target_parameters
-  # (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
-  #   lora_target_parameters:
-  #     - up_proj
-  #     - down_proj
+
+# To also train MoE expert weights, add them via lora_target_parameters
+# (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
+# lora_target_parameters:
+#   - up_proj
+#   - down_proj

 wandb_project:
 wandb_entity:
--- a/examples/qwen3-next/README.md
+++ b/examples/qwen3-next/README.md
@@ -12,7 +12,7 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations

 3. Install FLA for improved performance
 ```bash
-pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1
+uv pip uninstall causal-conv1d && uv pip install flash-linear-attention==0.4.1
 ```

 4. Run the finetuning example:
--- a/examples/qwen3.5/27b-fft.yaml
+++ b/examples/qwen3.5/27b-fft.yaml
@@ -26,8 +26,8 @@ sample_packing: true

 # Freeze vision encoder
 unfrozen_parameters:
-  - model\.language_model\..*
-  - lm_head\..*
+  - model.language_model.*
+  - lm_head.*

 wandb_project:
 wandb_entity:
--- a/examples/qwen3.5/35b-a3b-moe-vision-lora.yaml
+++ b/examples/qwen3.5/35b-a3b-moe-vision-lora.yaml
@@ -0,0 +1,62 @@
+# Qwen 3.5 35B-A3B MoE Vision LoRA
+#
+# Vision fine-tuning of the hybrid DeltaNet + Attention MoE model.
+# 256 experts, 8 active per token, with early-fusion vision support.
+
+base_model: Qwen/Qwen3.5-35B-A3B
+processor_type: AutoProcessor
+
+# Required for vision/multimodal training
+skip_prepare_dataset: true
+remove_unused_columns: false
+sample_packing: false
+
+chat_template: qwen3_5
+datasets:
+  - path: HuggingFaceH4/llava-instruct-mix-vsft
+    type: chat_template
+    split: train[:100]
+
+val_set_size: 0
+output_dir: ./outputs/qwen35-35b-a3b-vision-lora
+
+adapter: lora
+sequence_len: 4096
+pad_to_sequence_len: false
+
+lora_r: 16
+lora_alpha: 32
+lora_dropout: 0
+lora_target_modules:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  - down_proj
+  - up_proj
+
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+num_epochs: 1
+max_steps: 10
+optimizer: adamw_torch_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: true
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+logging_steps: 1
+flash_attention: true
+
+warmup_ratio: 0.1
+weight_decay: 0.0
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
--- a/examples/qwen3.5/README.md
+++ b/examples/qwen3.5/README.md
@@ -10,7 +10,7 @@

 3. Install FLA for sample packing support with the Gated DeltaNet linear attention layers:
  ```bash
-  pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1
+  uv pip uninstall causal-conv1d && uv pip install flash-linear-attention==0.4.1
  ```
  > FLA is required when `sample_packing: true`. Without it, training raises a `RuntimeError` on packed sequences. Vision configs use `sample_packing: false` so FLA is optional there.

--- a/examples/seed-oss/README.md
+++ b/examples/seed-oss/README.md
@@ -11,8 +11,7 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
    Here is an example of how to install from pip:
    ```bash
    # Ensure you have a compatible version of Pytorch installed
-    pip3 install packaging setuptools wheel ninja
-    pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+    uv pip install --no-build-isolation 'axolotl>=0.16.1'

    # Install Cut Cross Entropy
    python scripts/cutcrossentropy_install.py | sh
--- a/examples/smolvlm2/README.md
+++ b/examples/smolvlm2/README.md
@@ -13,14 +13,13 @@ This guide shows how to fine-tune SmolVLM2 models with Axolotl.
    Here is an example of how to install from pip:
    ```bash
    # Ensure you have a compatible version of Pytorch installed
-    pip3 install packaging setuptools wheel ninja
-    pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+    uv pip install --no-build-isolation 'axolotl>=0.16.1'
    ```

 2. Install an extra dependency:

    ```bash
-    pip3 install num2words==0.5.14
+    uv pip install num2words==0.5.14
    ```

 3.  Run the finetuning example:
--- a/examples/voxtral/README.md
+++ b/examples/voxtral/README.md
@@ -11,17 +11,16 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
    Here is an example of how to install from pip:

 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+# Ensure you have Pytorch installed (Pytorch 2.9.1 min)
+uv pip install --no-build-isolation 'axolotl>=0.16.1'
 ```

 2. Please install the below.

 ```bash
 # audio
-pip3 install librosa==0.11.0
-pip3 install 'mistral_common[audio]==1.8.3'
+uv pip install librosa==0.11.0
+uv pip install 'mistral_common[audio]==1.8.3'

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,15 +1,165 @@
 [build-system]
-requires = ["setuptools>=64", "wheel", "setuptools_scm>=8", "packaging==26.0"]
+requires = ["setuptools>=64", "wheel", "setuptools_scm>=8"]
 build-backend = "setuptools.build_meta"

 [project]
 name = "axolotl"
-dynamic = ["version", "dependencies", "optional-dependencies"]
+dynamic = ["version"]
 description = "LLM Trainer"
 readme = "README.md"
 requires-python = ">=3.10"
 # license = "Apache-2.0"

+dependencies = [
+    # Core ML stack
+    "torch>=2.9.1",
+    "packaging==26.0",
+    "huggingface_hub>=1.1.7",
+    "peft>=0.19.1,<0.20.0",
+    "tokenizers>=0.22.1",
+    "transformers==5.5.4",
+    "accelerate==1.13.0",
+    "datasets>=4.8.4,<4.9.0",
+    "trl==1.1.0",
+    "hf_xet==1.4.3",
+    "kernels==0.13.0",
+    "trackio>=0.16.1",
+    "typing-extensions>=4.15.0",
+    "optimum==1.16.2",
+    "hf_transfer",
+    "sentencepiece",
+    "gradio>=6.2.0,<7.0",
+    "modal==1.3.0.post1",
+    "pydantic>=2.10.6",
+    "addict",
+    "fire",
+    "PyYAML>=6.0",
+    "requests",
+    "wandb",
+    "einops",
+    "colorama",
+    "numba>=0.61.2",
+    "numpy>=2.2.6",
+
+    # Evaluation & metrics
+    "evaluate==0.4.1",
+    "scipy",
+    "nvidia-ml-py==12.560.30",
+    "art",
+    "tensorboard",
+    "python-dotenv==1.0.1",
+
+    # Remote filesystems
+    "s3fs>=2024.5.0",
+    "gcsfs>=2025.3.0",
+    "adlfs>=2024.5.0",
+    "ocifs==1.3.2",
+
+    "zstandard==0.22.0",
+    "fastcore",
+
+    # lm eval harness
+    "lm_eval==0.4.11",
+    "langdetect==1.0.9",
+    "immutabledict==4.2.0",
+    "antlr4-python3-runtime==4.13.2",
+
+    "schedulefree==1.4.1",
+    "openenv-core==0.1.0",
+
+    # Axolotl contribs
+    "axolotl-contribs-lgpl==0.0.7",
+    "axolotl-contribs-mit==0.0.6",
+
+    # Telemetry
+    "posthog==6.7.11",
+
+    "mistral-common==1.11.0",
+
+    # Platform-specific (skipped on macOS; some entries also skip aarch64)
+    "bitsandbytes==0.49.1 ; sys_platform != 'darwin'",
+    "triton>=3.4.0 ; sys_platform != 'darwin'",
+    "xformers>=0.0.33.post2 ; sys_platform != 'darwin' and platform_machine != 'aarch64'",
+    "liger-kernel==0.7.0 ; sys_platform != 'darwin'",
+    "torchao==0.17.0 ; sys_platform != 'darwin' and platform_machine != 'aarch64'",
+
+    # Architecture-specific
+    "fla-core==0.4.1 ; platform_machine != 'aarch64'",
+    "flash-linear-attention==0.4.1 ; platform_machine != 'aarch64'",
+]
+
+[project.optional-dependencies]
+flash-attn = ["flash-attn==2.8.3"]
+ring-flash-attn = [
+    "flash-attn==2.8.3",
+    "ring-flash-attn>=0.1.7",
+]
+deepspeed = [
+    "deepspeed>=0.18.6,<0.19.0",
+    "deepspeed-kernels",
+]
+mamba-ssm = [
+    "mamba-ssm==1.2.0.post1",
+    "causal_conv1d",
+]
+auto-gptq = [
+    "auto-gptq==0.5.1",
+]
+mlflow = [
+    "mlflow",
+]
+galore = [
+    "galore_torch",
+]
+apollo = [
+    "apollo-torch",
+]
+optimizers = [
+    "galore_torch",
+    "apollo-torch",
+    "lomo-optim==0.1.1",
+    "torch-optimi==0.2.1",
+    "came_pytorch==0.1.3",
+]
+ray = [
+    "ray[train]>=2.52.1",
+]
+vllm = [
+    "vllm>=0.15.0",
+]
+llmcompressor = [
+    "llmcompressor>=0.10.0",
+]
+fbgemm-gpu = ["fbgemm-gpu-genai>=1.3.0"]
+opentelemetry = [
+    "opentelemetry-api",
+    "opentelemetry-sdk",
+    "opentelemetry-exporter-prometheus",
+    "prometheus-client",
+]
+
+[dependency-groups]
+dev = [
+    "black",
+    "mypy",
+    "pre-commit",
+    "types-requests",
+    "quartodoc",
+    "jupyter",
+    "blobfile",
+    "tiktoken",
+]
+test = [
+    "codecov",
+    "codecov-cli",
+    "pytest",
+    "pytest-cov",
+    "pytest-retry",
+    "pytest-sugar",
+    "pytest-xdist",
+    "tbparse",
+]
+
 [project.scripts]
 axolotl = "axolotl.cli.main:main"

@@ -18,18 +168,15 @@ Homepage = "https://axolotl.ai/"
 Documentation = "https://docs.axolotl.ai/"
 Repository = "https://github.com/axolotl-ai-cloud/axolotl.git"

-[tool.setuptools_scm]
-
 [tool.setuptools]
-py-modules = ["setuptools_axolotl_dynamic_dependencies"]
 include-package-data = true

+[tool.setuptools.packages.find]
+where = ["src"]
+
 [tool.setuptools.dynamic]
 version = { file = "VERSION" }

-[tool.setuptools.cmdclass]
-build_py = "setuptools_axolotl_dynamic_dependencies.BuildPyCommand"
-
 [tool.ruff]
 line-length = 88
 target-version = "py310"
@@ -67,5 +214,43 @@ markers = [
    "slow: marks tests as slow",
 ]

+# UV specific configuration
+[tool.uv]
+prerelease = "allow"
+conflicts = [
+    [
+        { package = "axolotl" },
+        { extra = "vllm" },
+    ],
+    [
+        { package = "axolotl" },
+        { extra = "flash-attn" },
+    ],
+    [
+        { package = "axolotl" },
+        { extra = "ring-flash-attn" },
+    ],
+    [
+        { package = "axolotl" },
+        { extra = "mamba-ssm" },
+    ],
+    [
+        { package = "axolotl" },
+        { extra = "auto-gptq" },
+    ],
+    [
+        { package = "axolotl" },
+        { extra = "fbgemm-gpu" },
+    ],
+    [
+        { package = "axolotl" },
+        { extra = "llmcompressor" },
+    ],
+]
+
 [tool.uv.extra-build-dependencies]
-axolotl = ["huggingface_hub"]
+mamba-ssm = [{ requirement = "torch", match-runtime = true }]
+causal-conv1d = [{ requirement = "torch", match-runtime = true }]
+flash-attn = [{ requirement = "torch", match-runtime = true }]
+deepspeed = [{ requirement = "torch", match-runtime = true }]
+auto-gptq = [{ requirement = "torch", match-runtime = true }]
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,8 +0,0 @@
-black
-mypy
-pre-commit
-types-requests
-quartodoc
-jupyter
-blobfile
-tiktoken
--- a/requirements-tests.txt
+++ b/requirements-tests.txt
@@ -1,8 +0,0 @@
-codecov
-codecov-cli
-pytest
-pytest-cov
-pytest-retry
-pytest-sugar
-pytest-xdist
-tbparse
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,78 +0,0 @@
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
-
-# START section of dependencies that don't install on Darwin/MacOS
-bitsandbytes==0.49.1
-triton>=3.4.0
-mamba-ssm==1.2.0.post1
-xformers>=0.0.23.post1
-liger-kernel==0.7.0
-# END section
-
-packaging==26.0
-huggingface_hub>=1.1.7
-peft>=0.18.1
-tokenizers>=0.22.1
-transformers==5.4.0
-accelerate==1.13.0
-datasets==4.5.0
-deepspeed>=0.18.6,<0.19.0
-trl==0.29.0
-hf_xet==1.3.2
-kernels==0.12.2
-
-fla-core==0.4.1
-flash-linear-attention==0.4.1
-
-trackio>=0.16.1
-typing-extensions>=4.15.0
-
-optimum==1.16.2
-hf_transfer
-sentencepiece
-gradio>=6.2.0,<7.0
-
-modal==1.3.0.post1
-pydantic>=2.10.6
-addict
-fire
-PyYAML>=6.0
-requests
-wandb
-einops
-colorama
-numba>=0.61.2
-numpy>=2.2.6
-
-# qlora things
-evaluate==0.4.1
-scipy
-nvidia-ml-py==12.560.30
-art
-tensorboard
-python-dotenv==1.0.1
-
-# remote filesystems
-s3fs>=2024.5.0
-gcsfs>=2025.3.0
-adlfs>=2024.5.0
-ocifs==1.3.2
-
-zstandard==0.22.0
-fastcore
-
-# lm eval harness
-lm_eval==0.4.11
-langdetect==1.0.9
-immutabledict==4.2.0
-antlr4-python3-runtime==4.13.2
-
-torchao==0.17.0
-openenv-core==0.1.0
-schedulefree==1.4.1
-
-axolotl-contribs-lgpl==0.0.7
-axolotl-contribs-mit==0.0.6
-# telemetry
-posthog==6.7.11
-
-mistral-common==1.11.0
--- a/scripts/analyze_profile.py
+++ b/scripts/analyze_profile.py
--- a/scripts/cutcrossentropy_install.py
+++ b/scripts/cutcrossentropy_install.py
@@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else ""

 print(
    UNINSTALL_PREFIX
-    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@63b15e6"'
+    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@fec1a88"'
 )
--- a/scripts/unsloth_install.py
+++ b/scripts/unsloth_install.py
@@ -1,40 +0,0 @@
-# noqa
-import sys
-
-try:
-    import torch
-except ImportError as error:
-    raise ImportError("Install torch via `pip install torch`") from error
-from packaging.version import Version as V
-
-use_uv = "--uv" in sys.argv[1:]
-
-v = V(torch.__version__)
-cuda = str(torch.version.cuda)
-try:
-    is_ampere = torch.cuda.get_device_capability()[0] >= 8
-except RuntimeError:
-    is_ampere = False
-if cuda != "12.1" and cuda != "11.8" and cuda != "12.4":
-    raise RuntimeError(f"CUDA = {cuda} not supported!")
-if v <= V("2.1.0"):
-    raise RuntimeError(f"Torch = {v} too old!")
-elif v <= V("2.1.1"):
-    x = "cu{}{}-torch211"
-elif v <= V("2.1.2"):
-    x = "cu{}{}-torch212"
-elif v < V("2.3.0"):
-    x = "cu{}{}-torch220"
-elif v < V("2.4.0"):
-    x = "cu{}{}-torch230"
-elif v < V("2.5.0"):
-    x = "cu{}{}-torch240"
-elif v < V("2.6.0"):
-    x = "cu{}{}-torch250"
-else:
-    raise RuntimeError(f"Torch = {v} too new!")
-x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
-uv_prefix = "uv " if use_uv else ""
-print(
-    f'{uv_prefix}pip install unsloth-zoo==2024.12.1 && {uv_prefix}pip install --no-deps "unsloth[{x}]==2024.12.4"'
-)
--- a/setup.py
+++ b/setup.py
@@ -1,230 +0,0 @@
-"""setup.py for axolotl"""
-
-import os
-import platform
-import re
-from importlib.metadata import PackageNotFoundError, version
-from pathlib import Path
-
-from setuptools import find_packages, setup
-
-
-def parse_requirements(extras_require_map):
-    _install_requires = []
-    _dependency_links = []
-    with open("./requirements.txt", encoding="utf-8") as requirements_file:
-        lines = [r.strip() for r in requirements_file.readlines()]
-        for line in lines:
-            is_extras = "deepspeed" in line or "mamba-ssm" in line
-            if line.startswith("--extra-index-url"):
-                # Handle custom index URLs
-                _, url = line.split()
-                _dependency_links.append(url)
-            elif not is_extras and line and line[0] != "#":
-                # Handle standard packages
-                _install_requires.append(line)
-    try:
-        xformers_version = [req for req in _install_requires if "xformers" in req][0]
-        install_xformers = platform.machine() != "aarch64"
-        if platform.machine() == "aarch64":
-            # skip on ARM64
-            skip_packages = [
-                "torchao",
-                "fla-core",
-                "flash-linear-attention",
-            ]
-            _install_requires = [
-                req
-                for req in _install_requires
-                if re.split(r"[>=<]", req)[0].strip() not in skip_packages
-            ]
-        if "Darwin" in platform.system():
-            # skip packages not compatible with OSX
-            skip_packages = [
-                "bitsandbytes",
-                "triton",
-                "mamba-ssm",
-                "xformers",
-                "liger-kernel",
-            ]
-            _install_requires = [
-                req
-                for req in _install_requires
-                if re.split(r"[>=<]", req)[0].strip() not in skip_packages
-            ]
-            print(
-                _install_requires, [req in skip_packages for req in _install_requires]
-            )
-        else:
-            # detect the version of torch already installed
-            # and set it so dependencies don't clobber the torch version
-            try:
-                torch_version = version("torch")
-            except PackageNotFoundError:
-                torch_version = "2.8.0"  # default to torch 2.8.0
-            _install_requires.append(f"torch=={torch_version}")
-
-            version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version)
-            if version_match:
-                major, minor, patch = version_match.groups()
-                major, minor = int(major), int(minor)
-                patch = (
-                    int(patch) if patch is not None else 0
-                )  # Default patch to 0 if not present
-            else:
-                raise ValueError("Invalid version format")
-
-            torch_parts = torch_version.split("+")
-            if len(torch_parts) == 2:
-                torch_cuda_version = torch_parts[1]
-                _dependency_links.append(
-                    f"https://download.pytorch.org/whl/{torch_cuda_version}"
-                )
-
-            if (major, minor) >= (2, 10):
-                extras_require_map.pop("fbgemm-gpu")
-                extras_require_map["fbgemm-gpu"] = [
-                    "fbgemm-gpu==1.5.0",
-                    "fbgemm-gpu-genai==1.5.0",
-                ]
-                if not install_xformers:
-                    _install_requires.pop(_install_requires.index(xformers_version))
-                extras_require_map["vllm"] = ["vllm>=0.17.1"]
-            elif (major, minor) >= (2, 9):
-                extras_require_map.pop("fbgemm-gpu")
-                extras_require_map["fbgemm-gpu"] = [
-                    "fbgemm-gpu==1.4.0",
-                    "fbgemm-gpu-genai==1.4.2",
-                ]
-                if not install_xformers:
-                    _install_requires.pop(_install_requires.index(xformers_version))
-                if patch == 0:
-                    extras_require_map["vllm"] = ["vllm==0.13.0"]
-                else:
-                    extras_require_map["vllm"] = ["vllm==0.14.0"]
-            elif (major, minor) >= (2, 8):
-                extras_require_map.pop("fbgemm-gpu")
-                extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.3.0"]
-                extras_require_map["vllm"] = ["vllm==0.11.0"]
-                if not install_xformers:
-                    _install_requires.pop(_install_requires.index(xformers_version))
-            elif (major, minor) >= (2, 7):
-                _install_requires.pop(_install_requires.index(xformers_version))
-                if patch == 0:
-                    if install_xformers:
-                        _install_requires.append("xformers==0.0.30")
-                    # vllm 0.9.x is incompatible with latest transformers
-                    extras_require_map.pop("vllm")
-                else:
-                    if install_xformers:
-                        _install_requires.append("xformers==0.0.31")
-                    extras_require_map["vllm"] = ["vllm==0.10.1"]
-            elif (major, minor) >= (2, 6):
-                _install_requires.pop(_install_requires.index(xformers_version))
-                if install_xformers:
-                    _install_requires.append("xformers==0.0.29.post3")
-                # since we only support 2.6.0+cu126
-                _dependency_links.append("https://download.pytorch.org/whl/cu126")
-                extras_require_map.pop("vllm")
-            elif (major, minor) >= (2, 5):
-                _install_requires.pop(_install_requires.index(xformers_version))
-                if install_xformers:
-                    if patch == 0:
-                        _install_requires.append("xformers==0.0.28.post2")
-                    else:
-                        _install_requires.append("xformers>=0.0.28.post3")
-                extras_require_map.pop("vllm")
-            elif (major, minor) >= (2, 4):
-                extras_require_map.pop("vllm")
-                if install_xformers:
-                    if patch == 0:
-                        _install_requires.pop(_install_requires.index(xformers_version))
-                        _install_requires.append("xformers>=0.0.27")
-                    else:
-                        _install_requires.pop(_install_requires.index(xformers_version))
-                        _install_requires.append("xformers==0.0.28.post1")
-            else:
-                raise ValueError("axolotl requires torch>=2.4")
-
-    except PackageNotFoundError:
-        pass
-    return _install_requires, _dependency_links, extras_require_map
-
-
-def get_package_version():
-    with open(
-        Path(os.path.dirname(os.path.abspath(__file__))) / "VERSION",
-        "r",
-        encoding="utf-8",
-    ) as fin:
-        version_ = fin.read().strip()
-    return version_
-
-
-extras_require = {
-    "flash-attn": ["flash-attn==2.8.3"],
-    "ring-flash-attn": [
-        "flash-attn==2.8.3",
-        "ring-flash-attn>=0.1.7",
-    ],
-    "deepspeed": [
-        "deepspeed==0.18.2",
-        "deepspeed-kernels",
-    ],
-    "mamba-ssm": [
-        "mamba-ssm==1.2.0.post1",
-        "causal_conv1d",
-    ],
-    "auto-gptq": [
-        "auto-gptq==0.5.1",
-    ],
-    "mlflow": [
-        "mlflow",
-    ],
-    "galore": [
-        "galore_torch",
-    ],
-    "apollo": [
-        "apollo-torch",
-    ],
-    "optimizers": [
-        "galore_torch",
-        "apollo-torch",
-        "lomo-optim==0.1.1",
-        "torch-optimi==0.2.1",
-        "came_pytorch==0.1.3",
-    ],
-    "ray": [
-        "ray[train]>=2.52.1",
-    ],
-    "vllm": [
-        "vllm==0.10.0",
-    ],
-    "llmcompressor": [
-        "llmcompressor==0.5.1",
-    ],
-    "fbgemm-gpu": ["fbgemm-gpu-genai==1.3.0"],
-    "opentelemetry": [
-        "opentelemetry-api",
-        "opentelemetry-sdk",
-        "opentelemetry-exporter-prometheus",
-        "prometheus-client",
-    ],
-}
-install_requires, dependency_links, extras_require_build = parse_requirements(
-    extras_require
-)
-
-setup(
-    version=get_package_version(),
-    package_dir={"": "src"},
-    packages=find_packages("src"),
-    install_requires=install_requires,
-    dependency_links=dependency_links,
-    entry_points={
-        "console_scripts": [
-            "axolotl=axolotl.cli.main:main",
-        ],
-    },
-    extras_require=extras_require_build,
-)
--- a/src/axolotl/cli/agent_docs/init.py
+++ b/src/axolotl/cli/agent_docs/init.py
@@ -0,0 +1,108 @@
+"""Agent documentation lookup for axolotl.
+
+These docs are optimized for consumption by AI coding agents.
+The source of truth is docs/agents/*.md and AGENTS.md in the repo root.
+This module resolves those paths at runtime — no files are duplicated
+into the package.
+
+For pip-only installs (no repo checkout), run `axolotl fetch docs` first
+to download the docs locally.
+"""
+
+from pathlib import Path
+
+# Topic name -> doc path relative to the repo root (or to a fetched-docs root)
+TOPICS = {
+    "overview": "AGENTS.md",
+    "sft": "docs/agents/sft.md",
+    "grpo": "docs/agents/grpo.md",
+    "preference_tuning": "docs/agents/preference_tuning.md",
+    "reward_modelling": "docs/agents/reward_modelling.md",
+    "pretraining": "docs/agents/pretraining.md",
+    "model_architectures": "docs/agents/model_architectures.md",
+    "new_model_support": "docs/agents/new_model_support.md",
+}
+
+
+def _find_repo_root() -> Path | None:
+    """Return the nearest ancestor dir holding AGENTS.md and docs/agents/, or None."""
+    # In an editable install or repo checkout this file lives under
+    # src/axolotl/cli/agent_docs/, so the repo root is one of its ancestors.
+    current = Path(__file__).resolve().parent
+    while current != current.parent:  # stops before testing the filesystem root
+        if (current / "AGENTS.md").exists() and (current / "docs" / "agents").is_dir():
+            return current
+        current = current.parent
+    return None
+
+
+def _find_docs_dir() -> Path | None:
+    """Return the cwd if it contains docs/agents/ (e.g. fetched docs), else None."""
+    # `axolotl fetch docs` is expected to write to ./docs/ in cwd when --dest is omitted
+    cwd_docs = Path.cwd() / "docs" / "agents"
+    if cwd_docs.is_dir():
+        return Path.cwd()  # the root that holds docs/, not docs/agents itself
+    return None
+
+
+def _resolve_path(topic: str) -> Path:
+    """Resolve a topic to an existing doc file, or raise FileNotFoundError."""
+    if topic not in TOPICS:
+        available = ", ".join(sorted(TOPICS.keys()))
+        raise FileNotFoundError(f"Unknown topic: {topic!r}. Available: {available}")

+    relative_path = TOPICS[topic]

+    # 1) Repo root found by walking up from this file (editable install / checkout)
+    repo_root = _find_repo_root()
+    if repo_root:
+        candidate = repo_root / relative_path
+        if candidate.exists():
+            return candidate

+    # 2) Current directory, when it holds docs/agents/ (e.g. `axolotl fetch docs`)
+    docs_root = _find_docs_dir()
+    if docs_root:
+        candidate = docs_root / relative_path
+        if candidate.exists():
+            return candidate

+    # 3) Overview only: AGENTS.md directly in cwd, even without docs/agents/
+    if topic == "overview":
+        cwd_agents = Path.cwd() / "AGENTS.md"
+        if cwd_agents.exists():
+            return cwd_agents

+    raise FileNotFoundError(
+        f"Could not find {relative_path!r}. "
+        f"If you installed axolotl via pip, run `axolotl fetch docs` first "
+        f"to download the documentation."
+    )
+
+
+def get_doc(topic: str = "overview") -> str:
+    """Return the content of an agent doc by topic name.
+
+    Args:
+        topic: One of the keys in TOPICS; defaults to "overview".
+
+    Returns:
+        The markdown content of the doc, decoded as UTF-8.
+
+    Raises:
+        FileNotFoundError: If the topic is unknown or its file can't be found.
+    """
+    return _resolve_path(topic).read_text(encoding="utf-8")
+
+
+def list_topics() -> dict[str, str]:
+    """Return a dict of topic name -> first line (title, UTF-8 decoded) of each doc."""
+    result = {}
+    for topic in sorted(TOPICS.keys()):
+        try:
+            path = _resolve_path(topic)
+            text = path.read_text(encoding="utf-8")  # explicit: don't rely on locale
+            result[topic] = text.split("\n", 1)[0].lstrip("# ").strip()
+        except FileNotFoundError:
+            result[topic] = "(not found — run `axolotl fetch docs`)"
+    return result
--- a/src/axolotl/cli/main.py
+++ b/src/axolotl/cli/main.py
@@ -294,7 +294,9 @@ def merge_lora(config: str, **kwargs):


@cli.command()
-@click.argument("directory", type=click.Choice(["examples", "deepspeed_configs"]))
+@click.argument(
+    "directory", type=click.Choice(["examples", "deepspeed_configs", "docs"])
+)
@click.option("--dest", help="Destination directory")
 def fetch(directory: str, dest: Optional[str]):
    """
@@ -303,9 +305,10 @@ def fetch(directory: str, dest: Optional[str]):
    Available directories:
    - examples: Example configuration files
    - deepspeed_configs: DeepSpeed configuration files
+    - docs: Full documentation (Quarto markdown files)

    Args:
-        directory: One of `examples`, `deepspeed_configs`.
+        directory: One of `examples`, `deepspeed_configs`, `docs`.
        dest: Optional destination directory.
    """
    fetch_from_github(f"{directory}/", dest)
@@ -340,6 +343,112 @@ def delinearize_llama4(model: str, output: str):
    do_delinearize_llama4(model, output)


+@cli.command("agent-docs")
+@click.argument("topic", required=False, default=None)
+@click.option("--list", "list_topics", is_flag=True, help="List available topics")
+def agent_docs(topic: Optional[str], list_topics: bool):
+    """Show agent-optimized documentation.
+
+    Prints reference docs designed for AI coding agents. They are read from
+    the local repo checkout, or from a copy fetched via `axolotl fetch docs`.
+
+    \b
+    Examples:
+        axolotl agent-docs              # overview (start here)
+        axolotl agent-docs grpo         # GRPO reference
+        axolotl agent-docs sft          # SFT reference
+        axolotl agent-docs --list       # list all topics
+    """
+    from axolotl.cli.agent_docs import get_doc, list_topics as _list_topics
+
+    if list_topics:
+        for name, title in _list_topics().items():
+            click.echo(f"  {name:25s} {title}")
+        return
+
+    if topic is None:
+        topic = "overview"
+
+    try:
+        click.echo(get_doc(topic))
+    except FileNotFoundError as exc:
+        raise click.BadParameter(str(exc)) from exc
+
+
+@cli.command("config-schema")
+@click.option(
+    "--format",
+    "output_format",
+    type=click.Choice(["json", "yaml"]),
+    default="json",
+    help="Output format (default: json)",
+)
+@click.option("--field", help="Show schema for a specific field only")
+def config_schema(output_format: str, field: Optional[str]):
+    """Dump the full config JSON schema.
+
+    Useful for AI agents and tooling to discover all available config options,
+    their types, defaults, and descriptions.
+
+    \b
+    Examples:
+        axolotl config-schema                    # full JSON schema
+        axolotl config-schema --format yaml      # YAML format
+        axolotl config-schema --field adapter     # single field
+    """
+    import json
+
+    try:
+        schema = AxolotlInputConfig.model_json_schema()
+    except (TypeError, ValueError, AttributeError) as exc:
+        # Fallback: dump field names, types, and defaults when full schema
+        # generation fails (e.g. torch.dtype not JSON-serializable)
+        LOG.warning(
+            "Full JSON schema generation failed, using simplified fallback: %s", exc
+        )
+        schema = {
+            "properties": {},
+            "_note": "simplified schema (full generation failed)",
+        }
+        for name, field_info in AxolotlInputConfig.model_fields.items():
+            entry = {}
+            if field_info.description:
+                entry["description"] = field_info.description
+            # `.default` is pydantic's undefined sentinel for required/factory fields.
+            defaulted = not field_info.is_required() and not field_info.default_factory
+            if defaulted and field_info.default is not None:
+                try:
+                    json.dumps(field_info.default)
+                    entry["default"] = field_info.default
+                except (TypeError, ValueError):
+                    entry["default"] = str(field_info.default)
+            if field_info.annotation is not None:
+                entry["type"] = str(field_info.annotation)
+            schema["properties"][name] = entry
+
+    if field:
+        props = schema.get("properties", {})
+        if field not in props:
+            # Try case-insensitive match
+            matches = [k for k in props if k.lower() == field.lower()]
+            if matches:
+                field = matches[0]
+            else:
+                raise click.BadParameter(
+                    f"Unknown field: {field!r}. "
+                    f"Omit --field to dump the full schema, "
+                    f"or pipe to jq: axolotl config-schema | jq '.properties | keys'"
+                )
+        schema = {field: props[field]}
+
+    if output_format == "yaml":
+        import yaml  # pylint: disable=import-outside-toplevel
+
+        click.echo(yaml.dump(schema, default_flow_style=False, sort_keys=False))
+    else:
+        click.echo(json.dumps(schema, indent=2))
+
+
 cli.add_command(lm_eval)


--- a/src/axolotl/cli/merge_lora.py
+++ b/src/axolotl/cli/merge_lora.py
@@ -115,6 +115,7 @@ def _do_merge_lora_efficient(*, cfg: DictDefault) -> None:
        simulate_nf4_experts=simulate_nf4_experts,
        nf4_blocksize=nf4_blocksize,
        nf4_double_quant=nf4_double_quant,
+        trust_remote_code=bool(getattr(cfg, "trust_remote_code", False)),
    )

    LOG.debug("Memory-efficient LoRA merge completed successfully!")
--- a/src/axolotl/cli/utils/lora_merge.py
+++ b/src/axolotl/cli/utils/lora_merge.py
@@ -17,6 +17,93 @@ from axolotl.utils.logging import get_logger
 LOG = get_logger(__name__)


+def _build_layer_type_map(
+    base_model_path: Path, trust_remote_code: bool = False
+) -> dict[str, str]:
+    """Build a map of module_name -> layer_type using a meta-device model.
+
+    Instantiates the model architecture on the meta device (zero memory)
+    to inspect which modules are Linear vs Conv1d/Conv2d/Conv3d.
+    This avoids relying on weight tensor ndim heuristics.
+
+    Best-effort: returns {} if the config or a meta model cannot be loaded.
+    """
+    import json as _json
+
+    import torch.nn as nn
+    from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
+
+    config_path = base_model_path / "config.json"
+    if not config_path.exists():
+        return {}
+
+    try:
+        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
+        with open(config_path, encoding="utf-8") as f:
+            model_config = _json.load(f)
+    except (OSError, ValueError) as exc:
+        LOG.debug("Could not parse %s: %s", config_path, exc)
+        return {}
+
+    # Without a JSON object that declares architectures there is nothing to build.
+    if not isinstance(model_config, dict) or not model_config.get("architectures"):
+        return {}
+
+    try:
+        config = AutoConfig.from_pretrained(
+            str(base_model_path), trust_remote_code=trust_remote_code
+        )
+    except Exception as exc:  # noqa: BLE001
+        LOG.debug("Could not load config for layer type introspection: %s", exc)
+        return {}
+
+    # Candidate Auto classes, tried in order until one builds the architecture.
+    auto_classes = [AutoModelForCausalLM, AutoModel]
+    try:
+        from transformers import AutoModelForImageTextToText
+
+        auto_classes.insert(0, AutoModelForImageTextToText)
+    except ImportError:
+        pass
+
+    model = None
+    for auto_cls in auto_classes:
+        try:
+            with torch.device("meta"):
+                model = auto_cls.from_config(
+                    config, trust_remote_code=trust_remote_code
+                )
+            break
+        except Exception as exc:  # noqa: BLE001
+            LOG.debug(
+                "Could not instantiate meta model with %s, trying next: %s",
+                auto_cls.__name__,
+                exc,
+            )
+
+    if model is None:
+        LOG.debug("Could not instantiate meta model for layer type introspection")
+        return {}
+
+    layer_types = {}
+    for name, module in model.named_modules():
+        if isinstance(module, nn.Conv3d):
+            layer_types[name] = "Conv3d"
+        elif isinstance(module, nn.Conv2d):
+            layer_types[name] = "Conv2d"
+        elif isinstance(module, nn.Conv1d):
+            layer_types[name] = "Conv1d"
+        elif isinstance(module, nn.Linear):
+            layer_types[name] = "Linear"
+
+    del model
+    LOG.debug(
+        f"Layer type map: {len(layer_types)} modules "
+        f"({sum(1 for v in layer_types.values() if 'Conv' in v)} conv layers)"
+    )
+    return layer_types
+
+
 def _simulate_nf4_roundtrip(
    tensor: torch.Tensor,
    blocksize: Optional[int] = None,
@@ -191,6 +278,7 @@ def _build_peft_layer_and_get_delta(
    adapter_name: str = "default",
    is_param_wrapper: bool = False,
    magnitude: Optional[torch.Tensor] = None,
+    layer_type: Optional[str] = None,
 ) -> torch.Tensor:
    """
    Use PEFT's own layer classes to compute the LoRA delta weight.
@@ -211,7 +299,7 @@ def _build_peft_layer_and_get_delta(
    out_features = lora_b.shape[0]
    lora_alpha = lora_config_dict.get("lora_alpha", lora_config_dict.get("r", 1))
    use_rslora = bool(lora_config_dict.get("use_rslora", False))
-    use_dora = bool(lora_config_dict.get("use_dora", False)) and magnitude is not None
+    use_dora = bool(lora_config_dict.get("use_dora", False))

    if is_param_wrapper:
        from peft.tuners.lora.layer import ParamWrapper
@@ -227,18 +315,110 @@ def _build_peft_layer_and_get_delta(
            "weight", nn.Parameter(base_tensor.clone(), requires_grad=False)
        )

+        # ParamWrapper rejects dropout/fan_in_fan_out/lora_bias/use_dora, so
+        # build a minimal config with only the fields it accepts.
+        pw_config = LoraConfig(
+            r=r,
+            lora_alpha=lora_alpha,
+            lora_dropout=0.0,
+            fan_in_fan_out=False,
+            use_rslora=use_rslora,
+            use_dora=False,
+            lora_bias=False,
+        )
+
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            layer = ParamWrapper(
                fake,
                adapter_name=adapter_name,
                parameter_name="weight",
+                config=pw_config,
                r=r,
                lora_alpha=lora_alpha,
-                use_rslora=use_rslora,
            )
        layer.lora_A[adapter_name].weight.data = lora_a
        layer.lora_B[adapter_name].weight.data = lora_b
+        delta = layer.get_delta_weight(adapter_name)
+        # peft >=0.19.1 may return delta with transposed dims for 3D params
+        if delta.shape != base_tensor.shape and delta.ndim == 3:
+            delta = delta.transpose(1, 2).contiguous()
+        return delta
+    elif (
+        layer_type and "Conv" in layer_type or (layer_type is None and lora_a.ndim > 2)
+    ):
+        # Conv layer detected via model introspection (or ndim fallback)
+
+        from peft.tuners.lora import layer as peft_lora_layer
+
+        # Determine conv type from layer_type map or fall back to ndim
+        if layer_type and "Conv" in layer_type:
+            conv_type: str = layer_type
+        else:
+            ndim = lora_a.ndim
+            _conv_map = {3: "Conv1d", 4: "Conv2d", 5: "Conv3d"}
+            if ndim not in _conv_map:
+                raise ValueError(
+                    f"Unsupported LoRA weight dimensionality {ndim} for conv layer"
+                )
+            conv_type = _conv_map[ndim]
+            LOG.warning(
+                f"Using ndim-based fallback for conv detection (ndim={ndim}). "
+                f"Consider providing layer_type from meta-device introspection."
+            )
+
+        conv_cls_map = {"Conv1d": nn.Conv1d, "Conv2d": nn.Conv2d, "Conv3d": nn.Conv3d}
+        ConvCls = conv_cls_map[conv_type]
+        PeftConvCls = getattr(peft_lora_layer, conv_type)
+
+        # Reconstruct conv parameters from base tensor and lora_a shapes
+        # base_tensor: [out_channels, in_channels/groups, *kernel_size]
+        # lora_a:      [r, in_channels/groups, *kernel_size]
+        # lora_b:      [out_channels, r, *ones]
+        out_channels = base_tensor.shape[0]
+        in_channels = base_tensor.shape[1]
+        kernel_size = tuple(base_tensor.shape[2:])
+        stride = (1,) * (base_tensor.ndim - 2)
+        padding = (0,) * (base_tensor.ndim - 2)
+
+        base_layer = ConvCls(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            bias=False,
+        )
+        base_layer.weight.data = base_tensor.clone()
+
+        conv_config = LoraConfig(
+            r=r_total,
+            lora_alpha=lora_alpha,
+            use_rslora=use_rslora,
+            use_dora=use_dora,
+        )
+        layer = PeftConvCls(
+            base_layer,
+            adapter_name=adapter_name,
+            config=conv_config,
+            r=r_total,
+            lora_alpha=lora_alpha,
+        )
+        layer.lora_A[adapter_name].weight.data = lora_a
+        layer.lora_B[adapter_name].weight.data = lora_b
+
+        if use_dora:
+            if magnitude is None:
+                raise ValueError(
+                    f"DoRA merge requires a magnitude vector but none was found "
+                    f"for conv layer (adapter={adapter_name}). Check that the "
+                    f"adapter checkpoint contains lora_magnitude_vector weights."
+                )
+            mag_layer = layer.lora_magnitude_vector[adapter_name]
+            mag_layer.weight = nn.Parameter(magnitude)
+            layer.merge(adapter_names=[adapter_name])
+            return base_layer.weight.data - base_tensor
+
        return layer.get_delta_weight(adapter_name)
    else:
        from peft.tuners.lora.layer import Linear as LoraLinear
@@ -251,15 +431,20 @@ def _build_peft_layer_and_get_delta(
            or lora_config_dict.get("lora_fan_in_fan_out", False)
        )

-        layer = LoraLinear(
-            base_layer,
-            adapter_name=adapter_name,
+        linear_config = LoraConfig(
            r=r_total,
            lora_alpha=lora_alpha,
            fan_in_fan_out=fan_in_fan_out,
            use_rslora=use_rslora,
            use_dora=use_dora,
        )
+        layer = LoraLinear(
+            base_layer,
+            adapter_name=adapter_name,
+            config=linear_config,
+            r=r_total,
+            lora_alpha=lora_alpha,
+        )
        layer.lora_A[adapter_name].weight.data = lora_a
        layer.lora_B[adapter_name].weight.data = lora_b

@@ -267,6 +452,12 @@ def _build_peft_layer_and_get_delta(
            # DoRA merges magnitude normalization into the weight directly.
            # Use PEFT's merge() which handles DoRA internally, then
            # compute the delta as merged_weight - original_weight.
+            if magnitude is None:
+                raise ValueError(
+                    f"DoRA merge requires a magnitude vector but none was found "
+                    f"for linear layer (adapter={adapter_name}). Check that the "
+                    f"adapter checkpoint contains lora_magnitude_vector weights."
+                )
            mag_layer = layer.lora_magnitude_vector[adapter_name]
            mag_layer.weight = nn.Parameter(magnitude)
            layer.merge(adapter_names=[adapter_name])
@@ -382,6 +573,7 @@ def _merge_tensor_with_lora(
    nf4_double_quant: bool = True,
    use_dora: bool = False,
    weight_renamings: Optional[Dict[str, str]] = None,
+    layer_type_map: Optional[Dict[str, str]] = None,
 ) -> tuple[torch.Tensor, bool]:
    """
    Helper function to merge a single tensor with its corresponding LoRA weights.
@@ -426,12 +618,30 @@ def _merge_tensor_with_lora(
            if use_dora
            else None
        )
+
+        # Look up layer type from meta-device model introspection
+        _layer_type = None
+        if layer_type_map:
+            mod_path = key.rsplit(".weight", 1)[0] if key.endswith(".weight") else key
+            _layer_type = layer_type_map.get(mod_path)
+            # Try common prefix variations (e.g. with/without "model." prefix)
+            if _layer_type is None:
+                for prefix in [
+                    "model.",
+                    "model.language_model.",
+                    "model.language_model.model.",
+                ]:
+                    _layer_type = layer_type_map.get(prefix + mod_path)
+                    if _layer_type:
+                        break
+
        delta = _build_peft_layer_and_get_delta(
            lora_a.to(device),
            lora_b.to(device),
            lora_config_dict,
            tensor.to(device),
            magnitude=magnitude.to(device) if magnitude is not None else None,
+            layer_type=_layer_type,
        )
        merged_tensor = (
            (tensor.to(device).to(torch.float32) + delta.to(torch.float32))
@@ -556,6 +766,7 @@ def _fuse_and_unfuse_with_merge(
    nf4_double_quant: bool = True,
    use_dora: bool = False,
    weight_renamings: Optional[Dict[str, str]] = None,
+    layer_type_map: Optional[Dict[str, str]] = None,
 ) -> tuple[Dict[str, torch.Tensor], int, set]:
    """
    For tensors matching WeightConverter patterns (MoE expert weights):
@@ -696,12 +907,32 @@ def _fuse_and_unfuse_with_merge(
                    if use_dora
                    else None
                )
+                # Look up layer type for the fused key
+                _layer_type = None
+                if layer_type_map:
+                    mod_path = (
+                        fused_key.rsplit(".weight", 1)[0]
+                        if fused_key.endswith(".weight")
+                        else fused_key
+                    )
+                    _layer_type = layer_type_map.get(mod_path)
+                    if _layer_type is None:
+                        for prefix in [
+                            "model.",
+                            "model.language_model.",
+                            "model.language_model.model.",
+                        ]:
+                            _layer_type = layer_type_map.get(prefix + mod_path)
+                            if _layer_type:
+                                break
+
                delta = _build_peft_layer_and_get_delta(
                    lora_a.to(device),
                    lora_b.to(device),
                    lora_config_dict,
                    fused_tensor.to(device),
                    magnitude=magnitude.to(device) if magnitude is not None else None,
+                    layer_type=_layer_type,
                )
                fused_tensor = (
                    (
@@ -740,6 +971,7 @@ def merge_lora_sharded_efficient(
    simulate_nf4_experts: bool = False,
    nf4_blocksize: Optional[int] = None,
    nf4_double_quant: bool = True,
+    trust_remote_code: bool = False,
 ) -> None:
    """
    Memory-efficient LoRA merging that processes shards individually
@@ -750,6 +982,8 @@ def merge_lora_sharded_efficient(
        simulate_nf4_experts: Apply NF4 roundtrip only to MoE expert tensors
            (for quantize_moe_experts). Expert tensors are identified by having
            "expert" in the key name and ndim >= 3.
+        trust_remote_code: Whether to trust remote code when loading model
+            config for layer-type introspection. Defaults to False for safety.
    """
    base_model_path = Path(base_model_path)
    lora_adapter_path = Path(lora_adapter_path)
@@ -780,6 +1014,10 @@ def merge_lora_sharded_efficient(

    use_dora = bool(lora_config_dict.get("use_dora", False))

+    # Build layer type map via meta-device model introspection
+    layer_type_map = _build_layer_type_map(
+        base_model_path, trust_remote_code=trust_remote_code
+    )
    unsupported_methods = []

    # Check for AdaLoRA (Adaptive LoRA)
@@ -904,6 +1142,7 @@ def merge_lora_sharded_efficient(
                nf4_double_quant=nf4_double_quant,
                use_dora=use_dora,
                weight_renamings=weight_renamings,
+                layer_type_map=layer_type_map,
            )
            merged_count += fused_merged

@@ -926,6 +1165,7 @@ def merge_lora_sharded_efficient(
                nf4_double_quant=nf4_double_quant,
                use_dora=use_dora,
                weight_renamings=weight_renamings,
+                layer_type_map=layer_type_map,
            )
            merged_tensors[key] = merged_tensor
            if was_merged:
--- a/src/axolotl/core/builders/base.py
+++ b/src/axolotl/core/builders/base.py
@@ -41,6 +41,7 @@ from axolotl.utils.callbacks import (
    GCCallback,
    SaveAxolotlConfigtoWandBCallback,
    SaveModelOnFirstStepCallback,
+    SkipEvalOnResumeCallback,
 )
 from axolotl.utils.callbacks.profiler import PytorchProfilerCallback
 from axolotl.utils.distributed import build_parallelism_config
@@ -118,6 +119,9 @@ class TrainerBuilderBase(abc.ABC):
            plugin_manager.add_callbacks_pre_trainer(cfg=self.cfg, model=self.model)
        )

+        if self.cfg.resume_from_checkpoint:
+            callbacks.append(SkipEvalOnResumeCallback())
+
        if self.cfg.gc_steps:
            callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))

--- a/src/axolotl/core/builders/causal.py
+++ b/src/axolotl/core/builders/causal.py
@@ -370,7 +370,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        data_collator_kwargs = {
            "padding": True,  # True/"longest" is the default
        }
-        multiple = 64
+        multiple = getattr(self.cfg, "pad_to_multiple_of", None) or 64
        if self.cfg.pad_to_sequence_len:
            data_collator_kwargs["pad_to_multiple_of"] = multiple * math.ceil(
                self.cfg.sequence_len / multiple
--- a/src/axolotl/core/builders/rl.py
+++ b/src/axolotl/core/builders/rl.py
@@ -228,9 +228,47 @@ class HFRLTrainerBuilder(TrainerBuilderBase):

        return training_args, trainer_kwargs

+    def build_collator(self, **kwargs):
+        """Create the padding collator for DPO-style preference trainers.
+
+        An ``AxolotlDPODataCollatorWithPadding`` is only built when the RL type
+        is DPO, IPO, ORPO or SIMPO *and* ``pad_to_multiple_of`` is configured.
+        In every other case (other RL types such as GRPO/KTO, or no padding
+        multiple) None is returned so the trainer falls back to its default
+        collator. Extra ``kwargs`` are forwarded to the collator constructor.
+        """
+        # Only the DPO-style RL types are given this collator.
+        dpo_family = (RLType.DPO, RLType.IPO, RLType.ORPO, RLType.SIMPO)
+        multiple = (
+            getattr(self.cfg, "pad_to_multiple_of", None)
+            if self.cfg.rl in dpo_family
+            else None
+        )
+        # Unset or zero multiple: signal the caller to keep the trainer default.
+        if not multiple:
+            return None
+
+        from axolotl.utils.collators.dpo import AxolotlDPODataCollatorWithPadding
+
+        LOG.info(
+            f"Using AxolotlDPODataCollatorWithPadding with pad_to_multiple_of="
+            f"{multiple}"
+        )
+
+        encoder_decoder = getattr(self.model.config, "is_encoder_decoder", False)
+        return AxolotlDPODataCollatorWithPadding(
+            pad_token_id=self.tokenizer.pad_token_id,
+            is_encoder_decoder=encoder_decoder,
+            pad_to_multiple_of=multiple,
+            **kwargs,
+        )
+
    def build(self, total_num_steps):
        training_args, trainer_kwargs = self._build_training_arguments(total_num_steps)

+        if (data_collator := self.build_collator()) is not None:
+            trainer_kwargs["data_collator"] = data_collator
+
        if self.eval_dataset:
            trainer_kwargs["eval_dataset"] = self.eval_dataset
        if (
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -100,6 +100,27 @@ class AxolotlTrainer(
        self._signature_columns = None  # workaround for pylint

        super().__init__(*_args, **kwargs)
+
+        # Gemma4 (and similar multimodal models) declare **kwargs in forward() for
+        # extra inputs like mm_token_type_ids.  HF Trainer interprets VAR_KEYWORD as
+        # "the model handles num_items_in_batch internally" and skips the loss ÷
+        # gradient_accumulation_steps normalisation, which inflates the *logged* loss
+        # (the gradient itself is still correct). Override to False when the model
+        # doesn't actually consume num_items_in_batch.
+        if self.model_accepts_loss_kwargs:
+            model_to_check = self.accelerator.unwrap_model(self.model)
+            if hasattr(model_to_check, "base_model"):  # PEFT wrapper
+                model_to_check = model_to_check.base_model
+            if hasattr(model_to_check, "model"):
+                model_to_check = model_to_check.model
+            fwd = getattr(model_to_check, "forward", None)
+            if fwd is not None:
+                import inspect
+
+                params = inspect.signature(fwd).parameters
+                if "num_items_in_batch" not in params:
+                    self.model_accepts_loss_kwargs = False
+
        self.train_data_collator = self.data_collator
        self._stored_metrics = defaultdict(
            lambda: defaultdict(lambda: {"values": [], "reduction": "mean"})
@@ -381,6 +402,31 @@ class AxolotlTrainer(
            # Store per-step trainable tokens for throughput calculation
            self.state.tokens["trainable_tokens"] = trainable_tokens.detach().cpu()

+        # Gemma4 requires mm_token_type_ids during training (even for text-only).
+        # Inject zeros (= text token type) when not provided by the data collator.
+        # Use unwrap_model to handle DDP/FSDP wrappers that don't proxy .config.
+        _unwrapped = self.accelerator.unwrap_model(model)
+        _model_type = getattr(getattr(_unwrapped, "config", None), "model_type", None)
+        if (
+            "mm_token_type_ids" not in inputs
+            and "input_ids" in inputs
+            and _model_type == "gemma4"
+        ):
+            inputs["mm_token_type_ids"] = torch.zeros_like(inputs["input_ids"])
+
+        # Gemma4 (and Gemma3): transformers' masking_utils detects packed sequences
+        # from position_ids, but only when attention_mask is None.  When sample
+        # packing is active the collator provides an all-ones attention_mask that
+        # prevents this detection — remove it so the model builds the correct
+        # per-sequence causal masks.
+        if (
+            self.args.sample_packing
+            and _model_type in ("gemma4", "gemma3")
+            and "attention_mask" in inputs
+            and "position_ids" in inputs
+        ):
+            del inputs["attention_mask"]
+
        if self.args.orpo_alpha:
            return self.orpo_compute_loss(
                model,
@@ -389,6 +435,23 @@ class AxolotlTrainer(
                num_items_in_batch=num_items_in_batch,
            )

+        # Gemma4ForConditionalGeneration computes loss with a manual
+        # nn.CrossEntropyLoss() that bypasses proper num_items_in_batch
+        # normalization and does redundant attention_mask filtering.
+        # Compute loss externally using the standard loss_function instead.
+        if _model_type == "gemma4" and "labels" in inputs:
+            labels = inputs.pop("labels")
+            outputs = model(**inputs)
+            logits = outputs.logits
+            unwrapped = self.accelerator.unwrap_model(model)
+            vocab_size = unwrapped.config.get_text_config().vocab_size
+            loss = unwrapped.loss_function(
+                logits, labels, vocab_size, num_items_in_batch=num_items_in_batch
+            )
+            if return_outputs:
+                return loss, outputs
+            return loss
+
        return super().compute_loss(
            model,
            inputs,
@@ -401,6 +464,21 @@ class AxolotlTrainer(
        LOG.info("Running evaluation step...")
        return super().evaluate(*args, **kwargs)

+    @override
+    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
+        # Gemma4 requires mm_token_type_ids even during evaluation.
+        _unwrapped = self.accelerator.unwrap_model(model)
+        _model_type = getattr(getattr(_unwrapped, "config", None), "model_type", None)
+        if (
+            "mm_token_type_ids" not in inputs
+            and "input_ids" in inputs
+            and _model_type == "gemma4"
+        ):
+            inputs["mm_token_type_ids"] = torch.zeros_like(inputs["input_ids"])
+        return super().prediction_step(
+            model, inputs, prediction_loss_only, ignore_keys=ignore_keys
+        )
+
    @staticmethod
    def orpo_concatenate_inputs(inputs, label_pad_token=-100, pad_token=0, device=None):
        concatenated_batch = {}
@@ -508,12 +586,24 @@ class AxolotlTrainer(
        )

        # Perform a single forward pass
+        forward_kwargs = {
+            "input_ids": concat_inputs["input_ids"],
+            "attention_mask": concat_inputs["attention_mask"],
+            "labels": concat_inputs["labels"],
+        }
+        # Gemma4 requires mm_token_type_ids during training (even for text-only)
+        if (
+            getattr(getattr(model, "config", None), "model_type", None) == "gemma4"
+            and "mm_token_type_ids" not in concat_inputs
+        ):
+            forward_kwargs["mm_token_type_ids"] = torch.zeros_like(
+                concat_inputs["input_ids"]
+            )
+        elif "mm_token_type_ids" in concat_inputs:
+            forward_kwargs["mm_token_type_ids"] = concat_inputs["mm_token_type_ids"]
+
        outputs = model(
-            **{
-                "input_ids": concat_inputs["input_ids"],
-                "attention_mask": concat_inputs["attention_mask"],
-                "labels": concat_inputs["labels"],
-            },
+            **forward_kwargs,
            output_hidden_states=True,
        )

--- a/src/axolotl/core/trainers/dpo/init.py
+++ b/src/axolotl/core/trainers/dpo/init.py
@@ -20,8 +20,16 @@ class DPOStrategy:
    @classmethod
    def set_training_args_kwargs(cls, cfg):
        training_args_kwargs = {}
+        if cfg.rl is RLType.DPO:
+            if cfg.dpo_loss_type is not None:
+                training_args_kwargs["loss_type"] = cfg.dpo_loss_type
+
+            if cfg.dpo_loss_weights is not None:
+                training_args_kwargs["loss_weights"] = cfg.dpo_loss_weights
+
        if cfg.rl is RLType.IPO:
            training_args_kwargs["loss_type"] = ["ipo"]
+
        # Label smoothing is not compatible with IPO
        if cfg.rl is RLType.DPO and cfg.dpo_label_smoothing:
            training_args_kwargs["label_smoothing"] = cfg.dpo_label_smoothing
--- a/src/axolotl/core/trainers/grpo/async_trainer.py
+++ b/src/axolotl/core/trainers/grpo/async_trainer.py
@@ -242,6 +242,85 @@ class ProducerConfig:
            )


+class _GroupShardedSampler:
+    """Per-rank view of a ``RepeatSampler`` that keeps GRPO groups whole.
+
+    ``RepeatSampler`` emits ``num_generations`` consecutive copies of
+    every prompt index; each such run is one GRPO group. In distributed
+    training every rank has to iterate a disjoint set of prompts, yet
+    all generations of a prompt must stay on one rank so advantage
+    normalization sees the complete group.
+
+    ``accelerator.prepare(DataLoader)`` does not do this for custom
+    samplers when ``split_batches=False`` (the default): the sampler is
+    left untouched and every rank replays the same indices. This
+    wrapper instead reads the inner sampler's full output, cuts it into
+    ``num_generations``-sized groups, and deals whole groups to the
+    ranks round-robin.
+
+    Meant for distributed runs (``num_replicas > 1``). With a single
+    replica it simply reproduces the inner sampler's sequence.
+    """
+
+    def __init__(
+        self,
+        inner: Any,
+        num_generations: int,
+        rank: int,
+        num_replicas: int,
+    ):
+        """Validate the sharding parameters and store them.
+
+        Args:
+            inner: Sampler (any iterable of indices) to be sharded.
+            num_generations: Size of one group of consecutive indices.
+            rank: Index of this process, in ``[0, num_replicas)``.
+            num_replicas: Total number of processes sharing the sampler.
+
+        Raises:
+            ValueError: If ``num_generations`` or ``num_replicas`` is
+                below 1, or ``rank`` is outside ``[0, num_replicas)``.
+        """
+        if num_generations < 1:
+            raise ValueError(f"num_generations must be >= 1, got {num_generations}")
+        if num_replicas < 1:
+            raise ValueError(f"num_replicas must be >= 1, got {num_replicas}")
+        if not 0 <= rank < num_replicas:
+            raise ValueError(f"rank must be in [0, {num_replicas}), got {rank}")
+        self.inner = inner
+        self.num_generations = num_generations
+        self.rank = rank
+        self.num_replicas = num_replicas
+
+    def __iter__(self):
+        """Yield this rank's indices, one whole group at a time.
+
+        Raises:
+            ValueError: If the inner sampler's output length is not a
+                multiple of ``num_generations``.
+        """
+        flat = list(self.inner)
+        group_size = self.num_generations
+        if len(flat) % group_size:
+            raise ValueError(
+                f"inner sampler yielded {len(flat)} indices, "
+                f"not a multiple of num_generations={self.num_generations}"
+            )
+        group_count = len(flat) // group_size
+        # Group g owns flat[g * group_size : (g + 1) * group_size].
+        # Groups are dealt round-robin (rather than in contiguous
+        # chunks) so each rank keeps an approximately shuffled order
+        # even when there are few groups relative to the world size.
+        for group_idx in range(self.rank, group_count, self.num_replicas):
+            begin = group_idx * group_size
+            yield from flat[begin : begin + group_size]
+
+    def __len__(self):
+        """Return how many indices ``__iter__`` yields on this rank.
+
+        An inner sampler without ``__len__`` cannot be measured without
+        materializing it, so 0 is reported in that case.
+        """
+        try:
+            inner_total = len(self.inner)
+        except TypeError:
+            return 0
+        group_count = inner_total // self.num_generations
+        # Number of group ids rank, rank + R, rank + 2R, ... below
+        # group_count: the leftover groups land on the lowest ranks,
+        # mirroring the round-robin in ``__iter__``.
+        groups_here = len(range(self.rank, group_count, self.num_replicas))
+        return groups_here * self.num_generations
+
+
 class DataProducer(ABC):
    """Abstract base class for online data producers.

@@ -556,6 +635,34 @@ class GRPODataProducer(BaseDataProducer):
            seed=self._seed,
        )

+        # Shard the sampler across distributed ranks so each rank sees
+        # a disjoint slice of prompts. ``RepeatSampler`` groups each
+        # prompt with ``num_generations`` consecutive copies — our
+        # wrapper round-robins WHOLE groups across ranks so all
+        # generations of a given prompt stay on the same rank (needed
+        # for GRPO advantage normalization within a group).
+        #
+        # Without this, ``accelerator.prepare(dl)`` with the default
+        # ``split_batches=False`` leaves the custom sampler alone, so
+        # every rank iterates the identical index sequence and the
+        # cluster dogpiles on the first 1/world_size of the prompts.
+        num_replicas = max(1, trainer.accelerator.num_processes)
+        if num_replicas > 1:
+            sampler = _GroupShardedSampler(
+                inner=sampler,
+                num_generations=self._num_generations,
+                rank=trainer.accelerator.process_index,
+                num_replicas=num_replicas,
+            )
+            logger.info(
+                "[RANK:%d] _GroupShardedSampler active "
+                "(num_replicas=%d, num_generations=%d, gen_batch=%d)",
+                trainer.accelerator.process_index,
+                num_replicas,
+                self._num_generations,
+                self._generation_batch_size,
+            )
+
        # Use identity collator (same as stock GRPOTrainer)
        def _identity(x):
            return x
@@ -574,12 +681,11 @@ class GRPODataProducer(BaseDataProducer):
                rank=trainer.args.process_index,
            ),
        )
-        self._prompt_dl = trainer.accelerator.prepare(dl)
-
-        # Don't let accelerator track this dataloader
-        acc_dls = trainer.accelerator._dataloaders
-        if self._prompt_dl in acc_dls:
-            acc_dls.remove(self._prompt_dl)
+        # Skip accelerator.prepare — we're handling per-rank sharding
+        # ourselves via ``_GroupShardedSampler``. ``prepare()`` would
+        # otherwise try to wrap the DataLoader with its own sharding
+        # logic which does not understand our group structure.
+        self._prompt_dl = dl

        self._prompt_iter = iter(self._prompt_dl)

@@ -1103,11 +1209,22 @@ class AsyncGRPOTrainer(GRPOTrainer):
        - vllm_lora_sync: saves adapter to filesystem, vLLM loads natively
        - PEFT no-merge: computes merged weights as new tensors, NCCL broadcast
        - Non-PEFT: stock sync_weights via merge_adapter + NCCL
+
+        This is the canonical sync trigger and runs in BOTH async and
+        synchronous modes from ``_prepare_inputs_with_data_producer`` /
+        ``_prepare_inputs_legacy_async``. The ``_generate_single_turn``
+        patch is a parallel backup for non-data-producer paths (vanilla
+        GRPO without NeMo Gym), where the data producer is bypassed
+        entirely and TRL's stock generate-then-sync flow is used instead.
        """
-        if not (self.use_vllm and self.args.async_prefetch):
+        if not self.use_vllm:
            return
        step = self.state.global_step
-        interval = self.args.vllm_sync_interval
+        # Default to syncing every step when no interval is configured —
+        # otherwise ``step % None`` would TypeError, and the previous
+        # behavior of crashing on the first sync was strictly worse than
+        # the standard "sync every optimizer step".
+        interval = self.args.vllm_sync_interval or 1
        if step != self._last_synced_step and step % interval == 0:
            if step == 0:
                logger.info("Skipping vLLM weight sync at step 0 (no training yet)")
@@ -1202,13 +1319,42 @@ class AsyncGRPOTrainer(GRPOTrainer):

        # Permanently replace vllm_generation.sync_weights with our custom
        # sync to avoid merge_adapter (fails on FP8 / races with training).
-        # For LoRA sync mode, make it a no-op here since _maybe_sync_vllm_weights
-        # handles the sync with proper interval tracking.
+        #
+        # The design has two modes that have to be threaded carefully:
+        #
+        #   - Async prefetch ON: BG generation thread can't safely call
+        #     sync_weights mid-rollout (it races with the trainer's optimizer
+        #     step and can corrupt weights). We no-op the stock sync hook and
+        #     drive sync ourselves from ``_maybe_sync_vllm_weights`` after the
+        #     optimizer step on the main thread.
+        #
+        #   - Async prefetch OFF (synchronous mode): TRL's stock
+        #     ``_generate_single_turn`` calls ``sync_weights`` once per step
+        #     boundary. There's no BG thread to race with, and
+        #     ``_maybe_sync_vllm_weights`` is not reached on paths that
+        #     bypass the data producer, so we MUST wire the stock
+        #     hook directly to our LoRA sync helper — otherwise nothing ever
+        #     pushes weights to vLLM and the trainer becomes a no-op (vLLM
+        #     keeps serving the base model, every rollout in every group
+        #     produces identical outputs, advantages are zero, optimizer
+        #     step gets skipped, repeat).
        if not getattr(self, "_patched_sync_weights", False):
            if self.use_vllm and hasattr(self, "vllm_generation"):
                if getattr(self.args, "vllm_lora_sync", False):
-                    # No-op: LoRA sync is driven by _maybe_sync_vllm_weights
-                    self.vllm_generation.sync_weights = lambda: None
+                    if getattr(self.args, "async_prefetch", False):
+                        # Async: drive sync from main thread via
+                        # _maybe_sync_vllm_weights instead.
+                        self.vllm_generation.sync_weights = lambda: None
+                    else:
+                        # Sync mode: TRL's _generate_single_turn already
+                        # calls sync_weights once per step boundary. Wire
+                        # it directly to our LoRA filesystem sync helper.
+                        sync_helper = self._sync_lora_adapter
+
+                        def _lora_filesystem_sync():
+                            sync_helper()
+
+                        self.vllm_generation.sync_weights = _lora_filesystem_sync
                    self._patched_sync_weights = True
                else:
                    from accelerate.utils import is_peft_model
--- a/src/axolotl/integrations/cut_cross_entropy/README.md
+++ b/src/axolotl/integrations/cut_cross_entropy/README.md
@@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh

 - If you are installing from pip
 ```bash
-pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@63b15e6"
+pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@fec1a88"
 ```

 ## Usage
@@ -44,6 +44,7 @@ plugins:
 - gemma3_text
 - gemma3n
 - gemma3n_text
+- gemma4
 - glm
 - glm4
 - glm4_moe
--- a/src/axolotl/integrations/cut_cross_entropy/__init__.py
+++ b/src/axolotl/integrations/cut_cross_entropy/__init__.py
@@ -35,7 +35,7 @@ LOG = get_logger(__name__)

 _CCE_INSTALL_MESSAGE = (
    "Please install Axolotl's fork of cut_cross_entropy with transformers support using "
-    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@63b15e6"`'
+    '`pip uninstall -y cut-cross-entropy && pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@fec1a88"`'
 )


--- a/src/axolotl/integrations/hatchery/__init__.py
+++ b/src/axolotl/integrations/hatchery/__init__.py
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+
+"""Hatchery/Tinker remote training integration for Axolotl.
+
+Routes axolotl's preprocessed data to a remote training API (Tinker or
+Hatchery) instead of running forward/backward locally. The remote
+service handles model weights, LoRA adapters, and gradient updates.
+"""
+
+from .args import HatcheryArgs, HatcheryConfig
+from .plugin import HatcheryPlugin
+
+__all__ = ["HatcheryArgs", "HatcheryConfig", "HatcheryPlugin"]
+
+# Usage:
+#   plugins:
+#     - axolotl.integrations.hatchery.HatcheryPlugin
+#
+#   hatchery:
+#     backend: tinker  # or "hatchery"
+#     lora_rank: 32
+#     loss_fn: cross_entropy  # SFT
+#     # loss_fn: ppo         # RL (auto-selects HatcheryRLTrainer)
+#
+#   learning_rate: 1e-4  # top-level, not under hatchery:
--- a/src/axolotl/integrations/hatchery/args.py
+++ b/src/axolotl/integrations/hatchery/args.py
@@ -0,0 +1,62 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+
+"""Pydantic config schema for the Hatchery integration."""
+
+from __future__ import annotations
+
+from typing import Any, Literal, Optional
+
+from pydantic import BaseModel, Field
+
+
+class HatcheryConfig(BaseModel):
+    """Nested config under `hatchery:` in the axolotl YAML.
+
+    Only contains hatchery-specific settings. Standard training params
+    (learning_rate, weight_decay, adam_beta1/2, max_grad_norm,
+    gradient_accumulation_steps) are read from axolotl's top-level config.
+
+    Numeric settings are range-checked when the config is parsed, so an
+    impossible value (e.g. ``num_samples: 0``) is rejected up front
+    instead of surfacing later in the run.
+    """
+
+    # Backend & connection
+    # Which remote service to talk to.
+    backend: Literal["tinker", "hatchery"] = "tinker"
+    # Optional connection overrides; None leaves the choice to the client.
+    base_url: Optional[str] = None
+    api_key: Optional[str] = None
+    project_id: Optional[str] = None
+
+    # LoRA config sent to remote
+    lora_rank: int = Field(32, ge=1, le=256)
+    train_attn: bool = True
+    train_mlp: bool = True
+    train_unembed: bool = True
+
+    # Loss function
+    loss_fn: Literal["cross_entropy", "importance_sampling", "ppo", "cispo", "dro"] = (
+        "cross_entropy"
+    )
+    loss_fn_config: Optional[dict[str, Any]] = None
+
+    # Pipelining: submit next batch before awaiting previous result
+    pipeline: bool = True
+
+    # Sampling params (for RL flows)
+    # At least one token / one sample; temperature may be 0 but not negative.
+    max_sample_tokens: int = Field(256, ge=1)
+    sample_temperature: float = Field(1.0, ge=0.0)
+    num_samples: int = Field(4, ge=1)
+
+    # Reward functions (for RL) — list of fully qualified names
+    reward_funcs: Optional[list[str]] = None
+
+    # Checkpointing
+    save_steps: Optional[int] = None
+    save_name_prefix: str = "checkpoint"
+
+    # Timeout per future (seconds); must be strictly positive
+    future_timeout: float = Field(600.0, gt=0.0)
+
+
+class HatcheryArgs(BaseModel):
+    """Mixin schema that exposes the optional top-level ``hatchery:`` section."""
+
+    # Parsed ``hatchery:`` block; None when the section is absent.
+    hatchery: Optional[HatcheryConfig] = Field(default=None)
--- a/src/axolotl/integrations/hatchery/data.py
+++ b/src/axolotl/integrations/hatchery/data.py
@@ -0,0 +1,160 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+
+"""Convert axolotl batch tensors to Tinker/Hatchery Datum format.
+
+Both Tinker and Hatchery expect the client to apply the causal LM shift:
+
+  Original tokens:  [t0, t1, t2, ..., t_{L-1}]
+  model_input:      [t0, t1, ..., t_{L-2}]       (last token dropped)
+  target_tokens:    [t1, t2, ..., t_{L-1}]        (first token dropped)
+  weights:          [w1, w2, ..., w_{L-1}]        (aligned to targets)
+
+At position i, the model sees t_i and predicts target_tokens[i] = t_{i+1}.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import torch
+
+
+def _tensor_to_wire(t: torch.Tensor) -> dict[str, Any]:
+    """Serialize a tensor to the TensorData wire dict.
+
+    Args:
+        t: Tensor to serialize; it is detached and moved to CPU first.
+
+    Returns:
+        A dict with ``dtype`` (wire dtype name), ``shape`` (the tensor's
+        original shape as a list) and ``data`` (the flattened values as a
+        Python list).
+
+    Dtypes without a wire name (e.g. bool, float64, int8) are cast to
+    float32 before export so the payload always agrees with the
+    ``"float32"`` label they are sent under.
+    """
+    dtype_map = {
+        torch.float32: "float32",
+        torch.float16: "float16",
+        torch.bfloat16: "bfloat16",
+        torch.int64: "int64",
+        torch.int32: "int32",
+    }
+    flat = t.detach().cpu().flatten()
+    wire_dtype = dtype_map.get(flat.dtype)
+    if wire_dtype is None:
+        # Previously the raw values were emitted under a "float32" label;
+        # convert them so label and data cannot disagree.
+        wire_dtype = "float32"
+        flat = flat.to(torch.float32)
+    return {
+        "dtype": wire_dtype,
+        "shape": list(t.shape),
+        "data": flat.tolist(),
+    }
+
+
+def _make_datum(
+    tokens: list[int],
+    loss_fn_inputs: dict[str, torch.Tensor],
+) -> dict[str, Any]:
+    """Assemble one Datum as a plain dict (wire-compatible with both Tinker and Hatchery).
+
+    Args:
+        tokens: Input token ids, wrapped in a single ``encoded_text`` chunk.
+        loss_fn_inputs: Named tensors for the loss function; each one is
+            serialized with ``_tensor_to_wire``.
+    """
+    text_chunk = {"type": "encoded_text", "tokens": tokens}
+    wire_inputs = {}
+    for name, value in loss_fn_inputs.items():
+        wire_inputs[name] = _tensor_to_wire(value)
+    return {
+        "model_input": {"chunks": [text_chunk]},
+        "loss_fn_inputs": wire_inputs,
+    }
+
+
+def datums_to_tinker(datums: list[dict[str, Any]]):
+    """Convert plain-dict datums into ``tinker.types.Datum`` objects.
+
+    Both the Tinker SDK and updated Hatchery client accept these.
+
+    Only the first chunk of each datum's ``model_input`` is read, which
+    matches the single-chunk dicts built by ``_make_datum``.
+    """
+    import tinker.types as tt
+
+    def _convert(datum: dict[str, Any]):
+        tokens = datum["model_input"]["chunks"][0]["tokens"]
+        tensors = {
+            name: tt.TensorData(
+                data=wire["data"],
+                dtype=wire["dtype"],
+                shape=wire["shape"],
+            )
+            for name, wire in datum["loss_fn_inputs"].items()
+        }
+        return tt.Datum(
+            model_input=tt.ModelInput.from_ints(tokens),
+            loss_fn_inputs=tensors,
+        )
+
+    return [_convert(datum) for datum in datums]
+
+
+def batch_to_datums_sft(
+    input_ids: torch.Tensor,
+    labels: torch.Tensor,
+    attention_mask: torch.Tensor | None = None,
+) -> list[dict[str, Any]]:
+    """Convert an axolotl SFT batch to Datum dicts with causal shift.
+
+    Args:
+        input_ids: Token ids, indexed as ``[row, position]``.
+        labels: Labels aligned with ``input_ids``; ``-100`` marks positions
+            excluded from the loss.
+        attention_mask: Optional mask aligned with ``input_ids``; positions
+            whose mask value is zero are treated as padding and dropped.
+            When omitted, every position is kept.
+
+    Returns:
+        One Datum dict per row. ``target_tokens`` holds the labels shifted
+        left by one (with ``-100`` replaced by 0) and ``weights`` is 1.0
+        where the shifted label is a real target and 0.0 elsewhere.
+    """
+    batch_size = input_ids.size(0)
+    datums = []
+
+    for i in range(batch_size):
+        ids = input_ids[i]
+        lbl = labels[i]
+
+        if attention_mask is not None:
+            # Pick real tokens by mask position rather than slicing to
+            # the mask sum: for right-padded rows the result is the same,
+            # and left-padded rows no longer keep pads and lose tokens.
+            keep = attention_mask[i].bool()
+            ids = ids[keep]
+            lbl = lbl[keep]
+
+        # Causal shift: the model sees t_0..t_{L-2} and predicts t_1..t_{L-1}.
+        model_tokens = ids[:-1].tolist()
+        shifted_labels = lbl[1:]
+
+        target_tokens = shifted_labels.clone()
+        weights = (shifted_labels != -100).float()
+        # -100 is not a valid token id; those positions carry weight 0.
+        target_tokens[target_tokens == -100] = 0
+
+        datums.append(
+            _make_datum(
+                model_tokens,
+                {
+                    "target_tokens": target_tokens,
+                    "weights": weights,
+                },
+            )
+        )
+
+    return datums
+
+
+def batch_to_datums_rl(
+    input_ids: torch.Tensor,
+    labels: torch.Tensor,
+    logprobs: torch.Tensor,
+    advantages: torch.Tensor,
+    attention_mask: torch.Tensor | None = None,
+) -> list[dict[str, Any]]:
+    """Build importance_sampling/ppo Datum dicts from an RL batch, applying the causal shift.
+
+    Each row is cut to its first ``attention_mask`` sum positions (the
+    whole row when no mask is given), so padding is assumed to sit at the
+    end of the row. ``logprobs`` and ``advantages`` are indexed as
+    ``[row, position]`` and shifted by one alongside the labels.
+
+    Returns:
+        One Datum dict per row with ``target_tokens`` (``-100`` replaced
+        by 0), ``logprobs`` and ``advantages``.
+    """
+    rows = []
+
+    for row in range(input_ids.size(0)):
+        if attention_mask is None:
+            real_len = input_ids[row].size(0)
+        else:
+            real_len = int(attention_mask[row].sum().item())
+
+        row_ids = input_ids[row][:real_len]
+        row_labels = labels[row][:real_len]
+        row_logprobs = logprobs[row, :real_len]
+        row_advantages = advantages[row, :real_len]
+
+        # Drop the first target-side position so every entry lines up
+        # with the token being predicted.
+        targets = row_labels[1:].clone()
+        targets.masked_fill_(targets == -100, 0)
+
+        loss_inputs = {
+            "target_tokens": targets,
+            "logprobs": row_logprobs[1:],
+            "advantages": row_advantages[1:],
+        }
+        rows.append(_make_datum(row_ids[:-1].tolist(), loss_inputs))
+
+    return rows
--- a/src/axolotl/integrations/hatchery/examples/prep_math_rl.py
+++ b/src/axolotl/integrations/hatchery/examples/prep_math_rl.py
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+
+"""Prepare hendrycks_math for RL training with Hatchery/Tinker.
+
+Creates a dataset with chat-formatted prompts that include
+a hidden gold answer tag for the reward function.
+
+Run:
+  python src/axolotl/integrations/hatchery/examples/prep_math_rl.py
+"""
+
+import os
+import re
+
+from datasets import Dataset, load_dataset
+from transformers import AutoTokenizer
+
+
+def extract_boxed(text: str) -> str:
+    """Return the contents of the first ``\\boxed{...}`` in ``text``.
+
+    Nested braces are balanced. An empty string is returned when there is
+    no ``\\boxed{`` or its closing brace is missing.
+    """
+    opener = re.search(r"\\boxed\{", text)
+    if opener is None:
+        return ""
+    begin = opener.end()
+    open_braces = 1
+    for pos in range(begin, len(text)):
+        char = text[pos]
+        if char == "{":
+            open_braces += 1
+        elif char == "}":
+            open_braces -= 1
+            if open_braces == 0:
+                # ``pos`` is the brace that closes \boxed{ itself.
+                return text[begin:pos]
+    return ""
+
+
+def main():
+    """Build and save the RL prompt dataset for one difficulty level.
+
+    Loads the ``algebra`` test split of ``EleutherAI/hendrycks_math``,
+    keeps the problems whose ``level`` equals ``$MATH_LEVEL`` (default
+    ``"Level 1"``), renders each one through the ``Qwen/Qwen3-8B`` chat
+    template with the gold answer appended in a ``<|gold|>...<|/gold|>``
+    tag, and saves the tokenized rows under ``./data/math_rl_<level>``.
+    Problems without an extractable ``\\boxed{}`` answer are skipped.
+    """
+    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B", trust_remote_code=True)
+
+    dataset = load_dataset("EleutherAI/hendrycks_math", "algebra", split="test")
+    level = os.environ.get("MATH_LEVEL", "Level 1")
+    selected = [item for item in dataset if item["level"] == level]
+    print(f"{level} algebra: {len(selected)} problems")
+
+    rows = []
+    for item in selected:
+        answer = extract_boxed(item["solution"])
+        if answer:
+            # The gold tag goes directly after the problem text so the
+            # reward function can recover the answer from the prompt.
+            user_message = (
+                f"Solve the following math problem. "
+                f"Show your work and put your final answer in \\boxed{{}}.\n\n"
+                f"{item['problem']}"
+                f"<|gold|>{answer}<|/gold|>"
+            )
+
+            rendered = tokenizer.apply_chat_template(
+                [{"role": "user", "content": user_message}],
+                tokenize=False,
+                add_generation_prompt=True,
+            )
+            token_ids = tokenizer.encode(rendered, add_special_tokens=False)
+            n_tokens = len(token_ids)
+
+            # Prompt-only rows: every label is masked with -100.
+            rows.append(
+                {
+                    "input_ids": token_ids,
+                    "labels": [-100] * n_tokens,
+                    "attention_mask": [1] * n_tokens,
+                }
+            )
+
+    out = Dataset.from_list(rows)
+    out_dir = f"./data/math_rl_{level.lower().replace(' ', '')}"
+    out.save_to_disk(out_dir)
+    print(f"Saved {len(out)} examples to {out_dir}")
+    if rows:
+        lengths = [len(row["input_ids"]) for row in rows]
+        print(f"Prompt length range: {min(lengths)}-{max(lengths)}")
+
+if __name__ == "__main__":
+    main()
--- a/src/axolotl/integrations/hatchery/examples/tinker_rl.yaml
+++ b/src/axolotl/integrations/hatchery/examples/tinker_rl.yaml
@@ -0,0 +1,47 @@
+# RL (GRPO): hendrycks_math Level 1 via Tinker with Qwen3-8B
+#
+# Prep:
+#   python src/axolotl/integrations/hatchery/examples/prep_math_rl.py
+#
+# Run:
+#   export TINKER_API_KEY="your-key"
+#   axolotl train src/axolotl/integrations/hatchery/examples/tinker_rl.yaml
+
+base_model: Qwen/Qwen3-8B
+
+plugins:
+  - axolotl.integrations.hatchery.HatcheryPlugin
+
+hatchery:
+  backend: tinker
+  lora_rank: 16
+  loss_fn: importance_sampling
+  max_sample_tokens: 2048
+  sample_temperature: 0.7
+  num_samples: 4
+  pipeline: true
+  save_steps: 5
+  reward_funcs:
+    - axolotl.integrations.hatchery.rewards.math_reward.math_reward
+
+datasets:
+  - path: ./data/math_rl_level1
+    ds_type: arrow
+    type: completion
+
+sequence_len: 2048
+
+learning_rate: 5.0e-5
+optimizer: adamw_torch
+adam_beta1: 0.9
+adam_beta2: 0.95
+weight_decay: 0.01
+max_grad_norm: 1.0
+
+max_steps: 10
+num_epochs: 1
+micro_batch_size: 1
+gradient_accumulation_steps: 1
+logging_steps: 1
+
+output_dir: ./outputs/tinker-rl-math
--- a/src/axolotl/integrations/hatchery/examples/tinker_sft.yaml
+++ b/src/axolotl/integrations/hatchery/examples/tinker_sft.yaml
@@ -0,0 +1,42 @@
+# SFT: KIMI-K2 thinking data via Tinker remote API with Qwen3-8B
+#
+# Usage:
+#   export TINKER_API_KEY="your-key"
+#   axolotl train src/axolotl/integrations/hatchery/examples/tinker_sft.yaml
+
+base_model: Qwen/Qwen3-8B
+
+plugins:
+  - axolotl.integrations.hatchery.HatcheryPlugin
+
+hatchery:
+  backend: tinker
+  lora_rank: 16
+  loss_fn: cross_entropy
+  pipeline: true
+  save_steps: 10
+
+datasets:
+  - path: TeichAI/kimi-k2-thinking-1000x
+    split: train[:50]
+    type: chat_template
+    chat_template: qwen3
+    split_thinking: true
+
+chat_template: qwen3
+sequence_len: 2048
+
+learning_rate: 3.0e-4
+optimizer: adamw_torch
+adam_beta1: 0.9
+adam_beta2: 0.95
+weight_decay: 0.01
+max_grad_norm: 1.0
+
+num_epochs: 1
+max_steps: 20
+micro_batch_size: 2
+gradient_accumulation_steps: 1
+logging_steps: 1
+
+output_dir: ./outputs/tinker-sft
--- a/src/axolotl/integrations/hatchery/plugin.py
+++ b/src/axolotl/integrations/hatchery/plugin.py
@@ -0,0 +1,147 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+
+"""Axolotl plugin that routes training to a remote Hatchery/Tinker API."""
+
+from __future__ import annotations
+
+import torch
+from peft import PeftModel
+from transformers import AutoConfig, PreTrainedModel, Trainer
+
+from axolotl.integrations.base import BasePlugin
+from axolotl.utils.dict import DictDefault
+from axolotl.utils.logging import get_logger
+
+LOG = get_logger(__name__)
+
+
+class HatcheryPlugin(BasePlugin):
+    """Plugin that replaces local training with remote API calls.
+
+    Activated by adding to the axolotl YAML:
+
+        plugins:
+          - axolotl.integrations.hatchery.HatcheryPlugin
+
+        hatchery:
+          backend: tinker  # or "hatchery"
+          lora_rank: 32
+          loss_fn: cross_entropy
+          # ... see HatcheryConfig for full options
+    """
+
+    @staticmethod
+    def _hatchery_option(hcfg, key: str, default):
+        """Read ``key`` from the ``hatchery:`` section, dict or object alike.
+
+        The section may reach the plugin either as a mapping or as an
+        attribute-style object, so every hook must look options up the
+        same way. ``default`` is returned when the section is empty or
+        does not define ``key``.
+        """
+        if not hcfg:
+            return default
+        if isinstance(hcfg, dict):
+            return hcfg.get(key, default)
+        return getattr(hcfg, key, default)
+
+    def get_input_args(self) -> str:
+        """Return the dotted path of the args schema that adds ``hatchery:``."""
+        return "axolotl.integrations.hatchery.args.HatcheryArgs"
+
+    def register(self, cfg: dict):
+        """Auto-set config values needed for remote training."""
+        # Only fill in a default; an explicit user choice is left alone.
+        if cfg.get("remove_unused_columns") is None:
+            cfg["remove_unused_columns"] = False
+
+    def pre_model_load(self, cfg: DictDefault):
+        """Replace model loading with a tiny stub.
+
+        ``ModelLoader._build_model`` is overwritten on the class, so the
+        replacement stays in effect for every loader created afterwards
+        in this process.
+        """
+        backend = self._hatchery_option(cfg.hatchery, "backend", "tinker")
+        LOG.info(
+            f"Hatchery plugin active: training dispatched to remote "
+            f"{backend} API. Skipping local model weight loading."
+        )
+
+        from axolotl.loaders import ModelLoader
+
+        def _stub_build_model(loader_self) -> bool:
+            base_model = loader_self.cfg.base_model
+            LOG.info(f"Skipping model weight loading for: {base_model}")
+
+            # Only the model config is fetched; no weights are loaded.
+            config = AutoConfig.from_pretrained(
+                base_model,
+                trust_remote_code=loader_self.cfg.get("trust_remote_code", False),
+            )
+
+            class _Stub(PreTrainedModel):
+                config_class = type(config)
+                _no_split_modules: list[str] = []
+                supports_gradient_checkpointing = False
+
+                def __init__(self, cfg):
+                    super().__init__(cfg)
+                    vocab_size = getattr(cfg, "vocab_size", 32000)
+                    # Width-1 embedding: the only parameters the stub owns.
+                    self.embed_tokens = torch.nn.Embedding(vocab_size, 1)
+
+                def get_input_embeddings(self):
+                    return self.embed_tokens
+
+                def set_input_embeddings(self, value):
+                    pass
+
+                def get_output_embeddings(self):
+                    return None
+
+            loader_self.model = _Stub(config)
+            return True
+
+        ModelLoader._build_model = _stub_build_model  # type: ignore[method-assign,assignment]
+
+    def get_trainer_cls(self, cfg: DictDefault) -> type[Trainer] | None:
+        """Return the appropriate remote trainer class.
+
+        ``HatcheryRLTrainer`` is chosen for the RL loss functions
+        (``importance_sampling``, ``ppo``, ``cispo``, ``dro``); anything
+        else, including a missing ``hatchery:`` section, selects
+        ``HatcheryTrainer``.
+        """
+        # Use the shared lookup so a dict-valued section is honoured too;
+        # a bare getattr() would miss dict keys and fall back to SFT.
+        loss_fn = self._hatchery_option(cfg.hatchery, "loss_fn", "cross_entropy")
+
+        if loss_fn in ("importance_sampling", "ppo", "cispo", "dro"):
+            from .rl_trainer import HatcheryRLTrainer
+
+            return HatcheryRLTrainer
+
+        from .trainer import HatcheryTrainer
+
+        return HatcheryTrainer
+
+    def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
+        """Flag the loaded model object as standing in for a remote model."""
+        model._hatchery_remote = True
+
+    def post_train(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
+        """Log where the trained weights live; nothing else is done here."""
+        LOG.info(
+            "Hatchery: skipping local model save (weights are on remote API). "
+            "Use `tinker checkpoint download` or hatchery CLI to retrieve."
+        )
+
+    def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
+        """Inject hatchery config + axolotl training params into the trainer."""
+        from .args import HatcheryConfig
+        from .rl_trainer import HatcheryRLTrainer
+        from .trainer import HatcheryTrainer
+
+        # Trainers that are not ours are left untouched.
+        if not isinstance(trainer, (HatcheryTrainer, HatcheryRLTrainer)):
+            return
+
+        hcfg = cfg.hatchery
+        if isinstance(hcfg, dict):
+            hatchery_config = HatcheryConfig(**hcfg)
+        elif hcfg is None:
+            hatchery_config = HatcheryConfig()
+        else:
+            hatchery_config = hcfg
+
+        trainer.hatchery_args = hatchery_config
+        trainer._base_model_name = cfg.base_model
+
+        # Pull standard training params from axolotl config so they
+        # don't need to be duplicated under hatchery:
+        trainer._optim_params = {
+            "learning_rate": cfg.learning_rate
+            if cfg.learning_rate is not None
+            else 1e-4,
+            "beta1": cfg.adam_beta1 if cfg.adam_beta1 is not None else 0.9,
+            "beta2": cfg.adam_beta2 if cfg.adam_beta2 is not None else 0.95,
+            "eps": cfg.adam_epsilon if cfg.adam_epsilon is not None else 1e-12,
+            "weight_decay": cfg.weight_decay if cfg.weight_decay is not None else 0.0,
+            "grad_clip_norm": cfg.max_grad_norm
+            if cfg.max_grad_norm is not None
+            else 0.0,
+        }
--- a/src/axolotl/integrations/hatchery/rewards/__init__.py
+++ b/src/axolotl/integrations/hatchery/rewards/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
--- a/src/axolotl/integrations/hatchery/rewards/math_reward.py
+++ b/src/axolotl/integrations/hatchery/rewards/math_reward.py
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+
+"""Math reward function for hendrycks_math GRPO training.
+
+Uses math_verify for robust answer comparison. Falls back to
+exact string match of \\boxed{} content only when math_verify
+is unavailable.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+
+LOG = logging.getLogger(__name__)
+
+
+def extract_boxed(text: str) -> str | None:
+    """Return the contents of the first \\boxed{...}, balancing nested braces.
+
+    ``None`` is returned when ``text`` has no ``\\boxed{`` or the brace
+    that closes it is missing.
+    """
+    opener = re.search(r"\\boxed\{", text)
+    if opener is None:
+        return None
+    begin = opener.end()
+    pending = 1  # braces opened but not yet closed, counting \boxed{ itself
+    for offset, char in enumerate(text[begin:]):
+        if char == "{":
+            pending += 1
+        elif char == "}":
+            pending -= 1
+            if pending == 0:
+                return text[begin : begin + offset]
+    return None
+
+
+def math_reward(prompts: list[str], completions: list[str], **kwargs) -> list[float]:
+    """Score completions by checking if \\boxed{} answer matches the gold answer.
+
+    The gold answer is extracted from the prompt (appended as a hidden
+    tag by the dataset preprocessing). Format:
+      ... <|gold|>ANSWER<|/gold|>
+
+    Args:
+        prompts: Prompt texts, one per completion.
+        completions: Completion texts to score.
+        **kwargs: Accepted and ignored.
+
+    Returns:
+        One reward per completion: 1.0 for a match, 0.0 otherwise
+        (including a prompt without a gold tag or a completion without a
+        ``\\boxed{}`` answer).
+
+    Raises:
+        ValueError: If ``prompts`` and ``completions`` differ in length.
+    """
+    # Resolve the optional dependency once per call instead of once per
+    # sample; a failed import is not cached and would be retried each time.
+    try:
+        from math_verify import parse, verify
+    except Exception:
+        parse = verify = None
+        LOG.debug(
+            "math_verify unavailable or failed, using string fallback",
+            exc_info=True,
+        )
+
+    rewards = []
+    for prompt, completion in zip(prompts, completions, strict=True):
+        # DOTALL: a gold answer may span several lines, and "." would
+        # otherwise stop at the first newline and never reach the end tag.
+        gold_match = re.search(r"<\|gold\|>(.*?)<\|/gold\|>", prompt, re.DOTALL)
+        if not gold_match:
+            rewards.append(0.0)
+            continue
+
+        gold_answer = gold_match.group(1).strip()
+        pred_answer = extract_boxed(completion)
+
+        if pred_answer is None:
+            rewards.append(0.0)
+            continue
+
+        # None means "no verdict", which sends us to the string comparison.
+        verified = None
+        if verify is not None:
+            try:
+                gold_parsed = parse(gold_answer)
+                pred_parsed = parse(pred_answer)
+                verified = verify(gold_parsed, pred_parsed)
+            except Exception:
+                LOG.debug(
+                    "math_verify unavailable or failed, using string fallback",
+                    exc_info=True,
+                )
+
+        if verified is not None:
+            rewards.append(1.0 if verified else 0.0)
+        elif pred_answer.strip() == gold_answer.strip():
+            rewards.append(1.0)
+        else:
+            rewards.append(0.0)
+
+    return rewards
--- a/src/axolotl/integrations/hatchery/rl_trainer.py
+++ b/src/axolotl/integrations/hatchery/rl_trainer.py
@@ -0,0 +1,409 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+
+"""Remote RL trainer (GRPO/PPO) using Tinker or Hatchery API.
+
+Full RL loop per step:
+  1. Extract prompts from dataset batch
+  2. Sample N completions per prompt via remote SamplingClient
+  3. Score completions with local reward functions
+  4. Compute GRPO-style advantages (per-group normalization)
+  5. Send (prompt+completion, logprobs, advantages) as forward_backward
+  6. Optimizer step
+"""
+
+from __future__ import annotations
+
+import importlib
+import inspect
+import re
+import time
+from typing import Any, Callable, Optional
+
+import torch
+from transformers.trainer_utils import TrainOutput
+
+from axolotl.core.trainers.base import AxolotlTrainer
+from axolotl.utils.logging import get_logger
+
+from .args import HatcheryConfig
+from .data import batch_to_datums_rl, datums_to_tinker
+from .trainer import _create_training_client
+
+LOG = get_logger(__name__)
+
+
+def _load_reward_func(fqn: str) -> Callable:
+    """Load a reward function from a fully qualified name like 'module.func'.
+
+    Args:
+        fqn: Dotted path whose last component is the attribute to load and
+            whose prefix is the importable module, e.g. ``pkg.mod.reward``.
+
+    Returns:
+        The resolved callable.
+
+    Raises:
+        ValueError: If ``fqn`` has no module part, or the callable cannot be
+            invoked with two positional arguments ``(prompts, completions)``.
+        ImportError: If the module cannot be imported.
+        AttributeError: If the module has no attribute with that name.
+        TypeError: If the attribute is not callable (from ``inspect.signature``).
+    """
+    module_path, _, func_name = fqn.rpartition(".")
+    if not module_path or not func_name:
+        # Without this check importlib fails with an opaque
+        # "Empty module name" error.
+        raise ValueError(
+            f"Reward function {fqn!r} must be a fully qualified name "
+            "like 'module.func'"
+        )
+
+    mod = importlib.import_module(module_path)
+    func = getattr(mod, func_name)
+
+    # The trainer invokes reward functions positionally, so only parameters
+    # that can bind a positional argument count; *args accepts any number.
+    params = inspect.signature(func).parameters.values()
+    accepts_varargs = any(
+        p.kind is inspect.Parameter.VAR_POSITIONAL for p in params
+    )
+    positional = sum(
+        p.kind
+        in (
+            inspect.Parameter.POSITIONAL_ONLY,
+            inspect.Parameter.POSITIONAL_OR_KEYWORD,
+        )
+        for p in params
+    )
+    if not accepts_varargs and positional < 2:
+        raise ValueError(f"Reward function {fqn} must accept (prompts, completions)")
+    return func
+
+
+class HatcheryRLTrainer(AxolotlTrainer):
+    """Remote RL trainer using Tinker/Hatchery for sampling and training.
+
+    Sampling, forward/backward and optimizer steps are issued to the remote
+    training client; reward scoring and advantage computation run locally.
+    """
+
+    # Remote-backend settings; None until assigned after construction.
+    hatchery_args: Optional[HatcheryConfig]
+    # Base model identifier handed to the remote service.
+    _base_model_name: Optional[str]
+    # Lazily created remote training client (see _get_training_client).
+    _training_client: Any
+    # Reward callables resolved from hatchery_args.reward_funcs.
+    _reward_functions: list[Callable]
+    # NOTE(review): self._optim_params is read by _do_optim_step() and train()
+    # but never assigned in this class — presumably set externally; confirm.
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hatchery_args = None
+        self._base_model_name = None
+        self._training_client = None
+        self._reward_functions = []
+
+    def _ensure_reward_functions(self):
+        """Resolve the configured reward functions once.
+
+        Raises:
+            ValueError: If no reward functions are configured.
+        """
+        if self._reward_functions:
+            return
+        args = self.hatchery_args
+        if not args or not args.reward_funcs:
+            raise ValueError(
+                "No reward functions configured. Set hatchery.reward_funcs "
+                "in YAML, e.g. reward_funcs: ['my_module.my_reward']"
+            )
+        # Load into a local list first so a failure halfway through does not
+        # leave a partial list that a later call would treat as complete.
+        loaded = [_load_reward_func(fqn) for fqn in args.reward_funcs]
+        self._reward_functions = loaded
+        LOG.info(f"Loaded {len(self._reward_functions)} reward function(s)")
+
+    def _get_training_client(self):
+        """Lazily create the remote training session.
+
+        Raises:
+            RuntimeError: If ``hatchery_args`` or ``_base_model_name`` is unset.
+        """
+        if self._training_client is not None:
+            return self._training_client
+
+        # Validate up front; other methods rely on this having happened.
+        args = self.hatchery_args
+        if args is None:
+            raise RuntimeError(
+                "HatcheryRLTrainer.hatchery_args not set. "
+                "Ensure the HatcheryPlugin is registered."
+            )
+
+        base_model = self._base_model_name
+        if not base_model:
+            raise RuntimeError("HatcheryRLTrainer._base_model_name not set.")
+
+        self._training_client = _create_training_client(args, base_model)
+        LOG.info(
+            f"Remote RL session created: backend={args.backend}, "
+            f"model={base_model}, rank={args.lora_rank}"
+        )
+        return self._training_client
+
+    @staticmethod
+    def _read_field(obj: Any, name: str) -> list:
+        """Read a list-valued field from a dict or attribute-style response.
+
+        A missing, ``None`` or empty field yields an empty list.
+        """
+        value = obj.get(name) if isinstance(obj, dict) else getattr(obj, name, None)
+        return list(value) if value else []
+
+    def _sample_completions(self, prompt_ids_list: list[list[int]]):
+        """Sample completions for prompts via remote API.
+
+        Args:
+            prompt_ids_list: Token ids of each prompt to sample from.
+
+        Returns:
+            One dict per returned sequence, grouped by prompt in input order,
+            with keys ``tokens`` (prompt + completion ids),
+            ``completion_tokens``, ``logprobs``, ``prompt_len`` and
+            ``prompt_idx`` (index of the source prompt in ``prompt_ids_list``).
+        """
+        import tinker.types as tt
+
+        tc = self._get_training_client()
+        args = self.hatchery_args
+        assert args is not None  # validated by _get_training_client
+        results = []
+
+        sc = tc.save_weights_and_get_sampling_client()
+
+        for prompt_idx, prompt_ids in enumerate(prompt_ids_list):
+            # The two sampling clients expose different call signatures; the
+            # presence of ``sampling_session_id`` selects the first form.
+            if hasattr(sc, "sampling_session_id"):
+                sample_result = sc.sample(
+                    prompt_ids,
+                    max_tokens=args.max_sample_tokens,
+                    temperature=args.sample_temperature,
+                    n=args.num_samples,
+                ).result(timeout=args.future_timeout)
+            else:
+                mi = tt.ModelInput.from_ints(prompt_ids)
+                sp = tt.SamplingParams(
+                    max_tokens=args.max_sample_tokens,
+                    temperature=args.sample_temperature,
+                    top_p=0.95,
+                    top_k=-1,
+                )
+                sample_result = sc.sample(
+                    prompt=mi,
+                    num_samples=args.num_samples,
+                    sampling_params=sp,
+                ).result(timeout=args.future_timeout)
+
+            for seq in self._read_field(sample_result, "sequences"):
+                tokens = self._read_field(seq, "tokens")
+                # An attribute-style sequence with empty/None logprobs must not
+                # be treated as a dict; _read_field handles both shapes.
+                logprobs = self._read_field(seq, "logprobs")
+                results.append(
+                    {
+                        "tokens": list(prompt_ids) + tokens,
+                        "completion_tokens": tokens,
+                        "logprobs": logprobs,
+                        "prompt_len": len(prompt_ids),
+                        "prompt_idx": prompt_idx,
+                    }
+                )
+
+        return results
+
+    def _compute_rewards(
+        self, prompts: list[str], completions: list[str]
+    ) -> list[float]:
+        """Sum the rewards of every configured reward function per completion.
+
+        Raises:
+            ValueError: If a reward function returns a different number of
+                rewards than there are completions.
+        """
+        total_rewards = [0.0] * len(completions)
+        for reward_fn in self._reward_functions:
+            rewards = list(reward_fn(prompts, completions))
+            if len(rewards) != len(completions):
+                # A short result would silently leave trailing rewards at 0.
+                raise ValueError(
+                    f"Reward function returned {len(rewards)} rewards "
+                    f"for {len(completions)} completions"
+                )
+            for i, r in enumerate(rewards):
+                total_rewards[i] += r
+        return total_rewards
+
+    @staticmethod
+    def _compute_advantages(rewards: list[float], group_size: int) -> list[float]:
+        """Normalize rewards within consecutive groups of ``group_size``.
+
+        Each advantage is ``(reward - group_mean) / group_std`` using the
+        population standard deviation. When the group variance is at most
+        1e-8 the divisor is 1.0, so a uniform group gets all-zero advantages.
+        """
+        advantages = []
+        for i in range(0, len(rewards), group_size):
+            group = rewards[i : i + group_size]
+            mean = sum(group) / len(group)
+            var = sum((r - mean) ** 2 for r in group) / max(len(group), 1)
+            std = var**0.5 if var > 1e-8 else 1.0
+            advantages.extend([(r - mean) / std for r in group])
+        return advantages
+
+    @classmethod
+    def _advantages_by_prompt(
+        cls, rewards: list[float], prompt_indices: list[int]
+    ) -> list[float]:
+        """Compute advantages over runs of samples that share a prompt.
+
+        Matches ``_compute_advantages(rewards, num_samples)`` when every
+        prompt yields exactly ``num_samples`` sequences, and stays correctly
+        grouped when the sampler returns fewer for some prompt.
+        """
+        advantages: list[float] = []
+        start = 0
+        while start < len(rewards):
+            end = start
+            while end < len(rewards) and prompt_indices[end] == prompt_indices[start]:
+                end += 1
+            advantages.extend(
+                cls._compute_advantages(rewards[start:end], group_size=end - start)
+            )
+            start = end
+        return advantages
+
+    def _do_optim_step(self):
+        """Send one Adam optimizer step to the remote; returns its future."""
+        import tinker.types as tt
+
+        tc = self._get_training_client()
+        return tc.optim_step(tt.AdamParams(**self._optim_params))
+
+    def train(
+        self,
+        resume_from_checkpoint: Optional[str] = None,
+        trial: Any = None,
+        ignore_keys_for_eval: Optional[list[str]] = None,
+        **kwargs,
+    ) -> TrainOutput:
+        """Run the remote RL loop: sample, score, compute advantages, train.
+
+        ``resume_from_checkpoint``, ``trial``, ``ignore_keys_for_eval`` and
+        ``**kwargs`` are accepted but not used by this loop.
+
+        Returns:
+            TrainOutput with the step count, mean per-step ``loss:sum`` and
+            metrics ``train_loss``, ``train_reward``, ``train_runtime``.
+
+        Raises:
+            RuntimeError: If ``hatchery_args`` is not configured.
+            ValueError: If no reward functions are configured.
+        """
+        args = self.hatchery_args
+        if args is None:
+            raise RuntimeError("hatchery_args not configured")
+
+        self._ensure_reward_functions()
+
+        train_dataloader = self.get_train_dataloader()
+        num_train_epochs = int(self.args.num_train_epochs)
+        # Without an explicit max_steps the loop is capped at 1000 steps.
+        max_steps = self.args.max_steps if self.args.max_steps > 0 else 1000
+
+        LOG.info(
+            f"Remote RL training: max_steps={max_steps}, "
+            f"loss_fn={args.loss_fn}, samples/prompt={args.num_samples}"
+        )
+
+        self.state.max_steps = max_steps
+        self.state.num_train_epochs = num_train_epochs
+        self.state.is_local_process_zero = True
+        self.state.is_world_process_zero = True
+
+        self.control = self.callback_handler.on_train_begin(
+            self.args,
+            self.state,
+            self.control,  # type: ignore[has-type]
+        )
+
+        tokenizer = self.processing_class
+        global_step = 0
+        total_loss = 0.0
+        total_reward = 0.0
+        start_time = time.time()
+
+        for _epoch in range(num_train_epochs):
+            if global_step >= max_steps:
+                break
+
+            for batch in train_dataloader:
+                if global_step >= max_steps:
+                    break
+
+                self.control = self.callback_handler.on_step_begin(
+                    self.args, self.state, self.control
+                )
+
+                prompt_ids_batch = batch["input_ids"]
+                # Full prompt text (with gold tag) for reward scoring
+                # NOTE(review): if the collator pads input_ids, pad tokens are
+                # decoded here and re-encoded into the sampling prompt below —
+                # confirm prompts reach this point unpadded.
+                prompt_texts = tokenizer.batch_decode(
+                    prompt_ids_batch, skip_special_tokens=False
+                )
+
+                # Strip <|gold|>...<|/gold|> from token ids before
+                # sending to the model for sampling — the gold answer
+                # must only be visible to the local reward function.
+                # DOTALL so a multi-line gold answer is removed as well.
+                sampling_prompts = []
+                for prompt_text in prompt_texts:
+                    clean = re.sub(
+                        r"<\|gold\|>.*?<\|/gold\|>",
+                        "",
+                        prompt_text,
+                        flags=re.DOTALL,
+                    )
+                    clean_ids = tokenizer.encode(clean, add_special_tokens=False)
+                    sampling_prompts.append(clean_ids)
+
+                # 1. Sample completions (without gold answer)
+                t0 = time.time()
+                samples = self._sample_completions(sampling_prompts)
+                t_sample = time.time() - t0
+
+                if not samples:
+                    LOG.warning("No samples generated, skipping step")
+                    continue
+                LOG.info(
+                    f"Sampled {len(samples)} completions, "
+                    f"avg_len={sum(len(s['completion_tokens']) for s in samples) / len(samples):.0f}tok"
+                )
+
+                # 2. Decode and score
+                completion_texts = [
+                    tokenizer.decode(s["completion_tokens"], skip_special_tokens=False)
+                    for s in samples
+                ]
+                # Pair every completion with the prompt it was sampled from,
+                # rather than assuming exactly num_samples per prompt.
+                prompt_indices = [s["prompt_idx"] for s in samples]
+                sample_prompts = [prompt_texts[idx] for idx in prompt_indices]
+
+                rewards = self._compute_rewards(sample_prompts, completion_texts)
+
+                # 3. GRPO advantages (normalized within each prompt's group)
+                advantages_list = self._advantages_by_prompt(rewards, prompt_indices)
+
+                # 4. Build training data
+                all_datums = []
+                for i, sample in enumerate(samples):
+                    full_tokens = sample["tokens"]
+                    prompt_len = sample["prompt_len"]
+                    seq_len = len(full_tokens)
+
+                    input_ids = torch.tensor([full_tokens], dtype=torch.long)
+                    # Prompt positions stay at -100; only completion tokens
+                    # carry labels.
+                    labels = torch.full((1, seq_len), -100, dtype=torch.long)
+                    labels[0, prompt_len:] = torch.tensor(
+                        full_tokens[prompt_len:], dtype=torch.long
+                    )
+
+                    logprobs_t = torch.zeros(1, seq_len)
+                    if sample["logprobs"]:
+                        # Truncate to the completion length before writing.
+                        lp = sample["logprobs"][: seq_len - prompt_len]
+                        logprobs_t[0, prompt_len : prompt_len + len(lp)] = torch.tensor(
+                            lp
+                        )
+
+                    # The sample's advantage is applied to every completion token.
+                    adv_t = torch.zeros(1, seq_len)
+                    adv_t[0, prompt_len:] = advantages_list[i]
+
+                    all_datums.extend(
+                        batch_to_datums_rl(input_ids, labels, logprobs_t, adv_t)
+                    )
+
+                # 5. Forward backward (one datum at a time for memory) + optim
+                t0 = time.time()
+                tc = self._get_training_client()
+                step_loss = 0.0
+                for datum in all_datums:
+                    fb_future = tc.forward_backward(
+                        datums_to_tinker([datum]),
+                        loss_fn=args.loss_fn,
+                        loss_fn_config=args.loss_fn_config,
+                    )
+                    fb_result = fb_future.result(timeout=args.future_timeout)
+                    if hasattr(fb_result, "metrics"):
+                        step_loss += float(
+                            (fb_result.metrics or {}).get("loss:sum", 0.0)
+                        )
+                    elif isinstance(fb_result, dict):
+                        step_loss += float(
+                            (fb_result.get("metrics") or {}).get("loss:sum", 0.0)
+                        )
+                optim_future = self._do_optim_step()
+                # In pipeline mode the optimizer future is not awaited here.
+                if not args.pipeline:
+                    optim_future.result(timeout=args.future_timeout)
+                t_train = time.time() - t0
+
+                mean_reward = sum(rewards) / len(rewards)
+                accuracy = sum(1 for r in rewards if r > 0) / len(rewards)
+                mean_adv = sum(abs(a) for a in advantages_list) / len(advantages_list)
+                global_step += 1
+                total_loss += step_loss
+                total_reward += mean_reward
+                self.state.global_step = global_step
+
+                log_interval = self.args.logging_steps or 1
+                if global_step % log_interval == 0:
+                    elapsed = time.time() - start_time
+                    LOG.info(
+                        f"[step {global_step}/{max_steps}] "
+                        f"acc={accuracy:.2f} reward={mean_reward:.3f} "
+                        f"|adv|={mean_adv:.3f} loss:sum={step_loss:.1f} "
+                        f"sample={t_sample:.1f}s train={t_train:.1f}s "
+                        f"{elapsed / global_step:.1f}s/step"
+                    )
+                    self.log(
+                        {
+                            "loss": step_loss,
+                            "reward": mean_reward,
+                            "accuracy": accuracy,
+                            "mean_abs_advantage": mean_adv,
+                            "learning_rate": self._optim_params["learning_rate"],
+                        }
+                    )
+
+                if args.save_steps and global_step % args.save_steps == 0:
+                    self._save_remote_checkpoint(global_step)
+
+                self.control = self.callback_handler.on_step_end(
+                    self.args, self.state, self.control
+                )
+                if self.control.should_training_stop:
+                    break
+
+            if self.control.should_training_stop:
+                break
+
+        if global_step > 0:
+            self._save_remote_checkpoint(global_step, name="final")
+
+        elapsed = time.time() - start_time
+        avg_loss = total_loss / max(global_step, 1)
+        avg_reward = total_reward / max(global_step, 1)
+
+        LOG.info(
+            f"RL training complete: {global_step} steps, {elapsed:.1f}s, "
+            f"avg_reward={avg_reward:.4f}"
+        )
+
+        self.control = self.callback_handler.on_train_end(
+            self.args, self.state, self.control
+        )
+
+        return TrainOutput(
+            global_step=global_step,
+            training_loss=avg_loss,
+            metrics={
+                "train_loss": avg_loss,
+                "train_reward": avg_reward,
+                "train_runtime": elapsed,
+            },
+        )
+
+    def _save_remote_checkpoint(self, step: int, name: Optional[str] = None):
+        """Save a checkpoint on the remote service.
+
+        Failures are logged and swallowed, except for the ``"final"``
+        checkpoint, whose failure is re-raised.
+        """
+        tc = self._get_training_client()
+        args = self.hatchery_args
+        assert args is not None  # validated by _get_training_client
+        ckpt_name = name or f"{args.save_name_prefix}-{step:06d}"
+        try:
+            future = tc.save_state(ckpt_name)
+            future.result(timeout=args.future_timeout)
+            LOG.info(f"Remote checkpoint saved: {ckpt_name}")
+        except Exception:
+            LOG.exception(f"Failed to save checkpoint {ckpt_name}")
+            if name == "final":
+                raise
+
+    def save_model(self, output_dir=None, _internal_call=False):
+        """Save remotely instead of writing local model files.
+
+        ``output_dir`` (or ``"hf-save"`` when empty) is used as the remote
+        checkpoint name.
+        """
+        self._save_remote_checkpoint(
+            step=self.state.global_step,
+            name=output_dir or "hf-save",
+        )
+
+    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
+        """Always raises: loss is computed remotely, never locally."""
+        raise NotImplementedError(
+            "HatcheryRLTrainer uses remote API; compute_loss not called locally."
+        )
--- a/src/axolotl/integrations/hatchery/trainer.py
+++ b/src/axolotl/integrations/hatchery/trainer.py
@@ -0,0 +1,327 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) Axolotl AI
+# Licensed under the Apache License, Version 2.0
+
+"""Remote trainer that dispatches to Tinker or Hatchery API."""
+
+from __future__ import annotations
+
+import os
+import time
+from typing import Any, Optional
+
+import torch
+from transformers.trainer_utils import TrainOutput
+
+from axolotl.core.trainers.base import AxolotlTrainer
+from axolotl.utils.logging import get_logger
+
+from .args import HatcheryConfig
+from .data import batch_to_datums_sft, datums_to_tinker
+
+LOG = get_logger(__name__)
+
+
+def _extract_loss(result) -> float:
+    """Extract loss:sum from a forward_backward result.
+
+    Tinker's cross_entropy (and other losses) return the SUM of per-token
+    losses, not the mean. This is by design — it lets users control
+    normalization via the weights tensor. The trainer logs this raw sum;
+    users who want per-token loss should divide by number of active tokens.
+
+    Args:
+        result: Either an object with a ``metrics`` attribute or a dict with
+            a ``"metrics"`` key; the metrics mapping may be ``None``.
+
+    Returns:
+        ``metrics["loss:sum"]`` if present, else ``metrics["loss"]``, else
+        0.0. Also 0.0 when ``result`` has neither shape.
+    """
+    if hasattr(result, "metrics"):
+        metrics = result.metrics or {}
+    elif isinstance(result, dict):
+        # ``or {}`` also covers an explicit {"metrics": None}, matching the
+        # attribute-style branch above.
+        metrics = result.get("metrics") or {}
+    else:
+        return 0.0
+    return float(metrics.get("loss:sum", metrics.get("loss", 0.0)))
+
+
+def _create_training_client(args: HatcheryConfig, base_model: str):
+    """Build a LoRA training client on the configured remote backend.
+
+    Args:
+        args: Hatchery settings; ``args.backend == "tinker"`` selects Tinker,
+            any other value selects Hatchery.
+        base_model: Base model identifier passed to the remote service.
+
+    Returns:
+        The client returned by the backend's ``create_lora_training_client``.
+
+    Raises:
+        ValueError: For the Tinker backend when no API key is found in the
+            config or the ``TINKER_API_KEY`` environment variable.
+    """
+    # Both backends take the same LoRA arguments.
+    lora_kwargs = {
+        "base_model": base_model,
+        "rank": args.lora_rank,
+        "train_mlp": args.train_mlp,
+        "train_attn": args.train_attn,
+        "train_unembed": args.train_unembed,
+    }
+
+    if args.backend != "tinker":
+        from hatchery.core.client import HatcheryClient
+
+        # Config values win over environment variables, which win over the
+        # built-in local defaults.
+        hatchery_url = args.base_url or os.environ.get(
+            "HATCHERY_URL", "http://127.0.0.1:8420"
+        )
+        hatchery_token = args.api_key or os.environ.get("HATCHERY_API_KEY", "dev")
+        hatchery = HatcheryClient(
+            base_url=hatchery_url,
+            token=hatchery_token,
+            timeout=args.future_timeout,
+        )
+        return hatchery.create_lora_training_client(**lora_kwargs)
+
+    import tinker
+
+    tinker_key = args.api_key or os.environ.get("TINKER_API_KEY")
+    if not tinker_key:
+        raise ValueError(
+            "Tinker API key required. Set `hatchery.api_key` in config "
+            "or TINKER_API_KEY env var."
+        )
+    # Export the resolved key into the process environment before the
+    # service client is constructed.
+    os.environ["TINKER_API_KEY"] = tinker_key
+
+    return tinker.ServiceClient(
+        project_id=args.project_id
+    ).create_lora_training_client(**lora_kwargs)
+
+
+class HatcheryTrainer(AxolotlTrainer):
+    """Trainer that sends preprocessed batches to a remote training API.
+
+    Replaces local forward/backward with remote API calls to Tinker or
+    Hatchery. Uses axolotl's full data preprocessing pipeline (tokenization,
+    chat templates, packing, etc.) but offloads compute to remote GPUs.
+    """
+
+    # Remote-backend settings; None until assigned after construction.
+    hatchery_args: Optional[HatcheryConfig]
+    # Base model identifier handed to the remote service.
+    _base_model_name: Optional[str]
+    # Lazily created remote training client (see _get_training_client).
+    _training_client: Any
+    # NOTE(review): self._optim_params is read by _do_optim_step() and train()
+    # but never assigned in this class — presumably set externally; confirm.
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hatchery_args = None
+        self._base_model_name = None
+        self._training_client = None
+
+    def _get_training_client(self):
+        """Lazily create the remote training session.
+
+        Raises:
+            RuntimeError: If ``hatchery_args`` or ``_base_model_name`` is unset.
+        """
+        if self._training_client is not None:
+            return self._training_client
+
+        args = self.hatchery_args
+        if args is None:
+            raise RuntimeError(
+                "HatcheryTrainer.hatchery_args not set. "
+                "Ensure the HatcheryPlugin is registered."
+            )
+
+        base_model = self._base_model_name
+        if not base_model:
+            raise RuntimeError("HatcheryTrainer._base_model_name not set.")
+
+        self._training_client = _create_training_client(args, base_model)
+
+        LOG.info(
+            f"Remote training session created: backend={args.backend}, "
+            f"model={base_model}, rank={args.lora_rank}"
+        )
+        return self._training_client
+
+    def _send_batch(self, batch: dict[str, torch.Tensor]):
+        """Convert batch to datums and send forward_backward to remote.
+
+        Returns (future, n_active_tokens) where n_active_tokens counts
+        the completion tokens in this batch (for loss normalization).
+        """
+        input_ids = batch["input_ids"]
+        labels = batch["labels"]
+        attention_mask = batch.get("attention_mask")
+
+        # Count non-masked labels, skipping the first position of each row.
+        n_active = int((labels[:, 1:] != -100).sum().item())
+        datums = batch_to_datums_sft(input_ids, labels, attention_mask)
+
+        tc = self._get_training_client()
+        args = self.hatchery_args
+        assert args is not None  # validated by _get_training_client
+        send_datums = datums_to_tinker(datums)
+
+        future = tc.forward_backward(
+            send_datums,
+            loss_fn=args.loss_fn,
+            loss_fn_config=args.loss_fn_config,
+        )
+        return future, n_active
+
+    def _do_optim_step(self):
+        """Send optimizer step to remote using axolotl's training params."""
+        import tinker.types as tt
+
+        tc = self._get_training_client()
+        return tc.optim_step(tt.AdamParams(**self._optim_params))
+
+    def train(
+        self,
+        resume_from_checkpoint: Optional[str] = None,
+        trial: Any = None,
+        ignore_keys_for_eval: Optional[list[str]] = None,
+        **kwargs,
+    ) -> TrainOutput:
+        """Main training loop — sends batches to remote API.
+
+        Each batch is dispatched as one ``forward_backward`` call. After
+        ``gradient_accumulation_steps`` batches the pending futures are
+        resolved and one optimizer step is issued. Batches left over at the
+        end of an epoch are flushed as a final, shorter optimizer step so
+        their forward/backward work is neither dropped nor left pending.
+
+        ``resume_from_checkpoint``, ``trial``, ``ignore_keys_for_eval`` and
+        ``**kwargs`` are accepted but not used by this loop.
+
+        Returns:
+            TrainOutput with the step count, the mean per-token step loss and
+            metrics ``train_loss`` and ``train_runtime``.
+
+        Raises:
+            RuntimeError: If ``hatchery_args`` is not configured.
+        """
+        args = self.hatchery_args
+        if args is None:
+            raise RuntimeError("hatchery_args not configured")
+
+        train_dataloader = self.get_train_dataloader()
+        num_batches = len(train_dataloader)
+
+        grad_accum = self.args.gradient_accumulation_steps
+        num_train_epochs = int(self.args.num_train_epochs)
+        # Ceiling division: a trailing partial accumulation window still
+        # produces an optimizer step (see the flush after the batch loop).
+        steps_per_epoch = max((num_batches + grad_accum - 1) // grad_accum, 1)
+        max_steps = (
+            self.args.max_steps
+            if self.args.max_steps > 0
+            else steps_per_epoch * num_train_epochs
+        )
+
+        LOG.info(
+            f"Remote training: {num_batches} batches/epoch, "
+            f"{grad_accum} grad_accum, {max_steps} max steps, "
+            f"{num_train_epochs} epochs"
+        )
+
+        self.state.max_steps = max_steps
+        self.state.num_train_epochs = num_train_epochs
+        self.state.is_local_process_zero = True
+        self.state.is_world_process_zero = True
+
+        self.control = self.callback_handler.on_train_begin(
+            self.args,
+            self.state,
+            self.control,  # type: ignore[has-type]
+        )
+
+        global_step = 0
+        total_loss = 0.0
+        start_time = time.time()
+        # (future, n_active_tokens) pairs awaiting the next optimizer step.
+        pending_fb_futures: list = []
+
+        def _finish_step(epoch_progress: float) -> None:
+            """Resolve pending futures, step the optimizer, log and save."""
+            nonlocal global_step, total_loss
+
+            step_loss_sum = 0.0
+            step_active = 0
+            for fut, n_act in pending_fb_futures:
+                result = fut.result(timeout=args.future_timeout)
+                step_loss_sum += _extract_loss(result)
+                step_active += n_act
+            pending_fb_futures.clear()
+
+            optim_future = self._do_optim_step()
+            # In pipeline mode the optimizer future is not awaited here.
+            if not args.pipeline:
+                optim_future.result(timeout=args.future_timeout)
+
+            # Normalize the summed loss by active tokens when there are any.
+            step_loss = (
+                step_loss_sum / step_active if step_active > 0 else step_loss_sum
+            )
+
+            global_step += 1
+            total_loss += step_loss
+            self.state.global_step = global_step
+            self.state.epoch = epoch_progress
+
+            log_interval = self.args.logging_steps or 1
+            if global_step % log_interval == 0:
+                elapsed = time.time() - start_time
+                avg_loss = total_loss / global_step
+                LOG.info(
+                    f"[step {global_step}/{max_steps}] "
+                    f"loss/tok={step_loss:.4f} avg={avg_loss:.4f} "
+                    f"active={step_active} "
+                    f"{elapsed / global_step:.2f}s/step"
+                )
+                self.log(
+                    {
+                        "loss": step_loss,
+                        "learning_rate": self._optim_params["learning_rate"],
+                        "epoch": self.state.epoch,
+                    }
+                )
+
+            if args.save_steps and global_step % args.save_steps == 0:
+                self._save_remote_checkpoint(global_step)
+
+            self.control = self.callback_handler.on_step_end(
+                self.args, self.state, self.control
+            )
+
+        for _epoch in range(num_train_epochs):
+            if global_step >= max_steps:
+                break
+
+            self.control = self.callback_handler.on_epoch_begin(
+                self.args, self.state, self.control
+            )
+
+            pending_fb_futures.clear()
+
+            for batch_idx, batch in enumerate(train_dataloader):
+                if global_step >= max_steps:
+                    break
+
+                self.control = self.callback_handler.on_step_begin(
+                    self.args, self.state, self.control
+                )
+
+                fb_future, n_active = self._send_batch(batch)
+                pending_fb_futures.append((fb_future, n_active))
+
+                if len(pending_fb_futures) >= grad_accum:
+                    _finish_step(_epoch + (batch_idx + 1) / num_batches)
+                    if self.control.should_training_stop:
+                        break
+
+            # Flush a trailing partial accumulation window. Without this the
+            # already-dispatched forward_backward calls are never awaited or
+            # stepped, and a dataloader shorter than grad_accum never trains.
+            if pending_fb_futures:
+                _finish_step(float(_epoch + 1))
+
+            self.control = self.callback_handler.on_epoch_end(
+                self.args, self.state, self.control
+            )
+            if self.control.should_training_stop:
+                break
+
+        if global_step > 0:
+            self._save_remote_checkpoint(global_step, name="final")
+
+        elapsed = time.time() - start_time
+        avg_loss = total_loss / max(global_step, 1)
+
+        LOG.info(
+            f"Training complete: {global_step} steps, {elapsed:.1f}s total, "
+            f"{elapsed / max(global_step, 1):.2f}s/step, avg_loss={avg_loss:.4f}"
+        )
+
+        self.control = self.callback_handler.on_train_end(
+            self.args, self.state, self.control
+        )
+
+        return TrainOutput(
+            global_step=global_step,
+            training_loss=avg_loss,
+            metrics={"train_loss": avg_loss, "train_runtime": elapsed},
+        )
+
+    def _save_remote_checkpoint(self, step: int, name: Optional[str] = None):
+        """Save a checkpoint on the remote service.
+
+        Failures are logged and swallowed, except for the ``"final"``
+        checkpoint, whose failure is re-raised.
+        """
+        tc = self._get_training_client()
+        args = self.hatchery_args
+        assert args is not None  # validated by _get_training_client
+        ckpt_name = name or f"{args.save_name_prefix}-{step:06d}"
+        try:
+            future = tc.save_state(ckpt_name)
+            future.result(timeout=args.future_timeout)
+            LOG.info(f"Remote checkpoint saved: {ckpt_name}")
+        except Exception:
+            LOG.exception(f"Failed to save checkpoint {ckpt_name}")
+            if name == "final":
+                raise
+
+    def save_model(self, output_dir=None, _internal_call=False):
+        """Delegate to remote checkpoint save so HF callbacks create checkpoints."""
+        self._save_remote_checkpoint(
+            step=self.state.global_step,
+            name=output_dir or "hf-save",
+        )
+
+    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
+        """Always raises: loss is computed remotely, never locally."""
+        raise NotImplementedError(
+            "HatcheryTrainer uses remote API; compute_loss should not be called."
+        )
--- a/src/axolotl/integrations/kd/README.md
+++ b/src/axolotl/integrations/kd/README.md
@@ -11,7 +11,7 @@ kd_ce_alpha: 0.1
 kd_alpha: 0.9
 kd_temperature: 1.0

-torch_compile: True  # torch>=2.6.0, recommended to reduce vram
+torch_compile: True  # recommended to reduce vram

 datasets:
  - path: ...
--- a/Show More
+++ b/Show More