Compare commits
34 Commits
weight-sca
...
smol-ci
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
993db05b3a | ||
|
|
1b9520cc8b | ||
|
|
f77408a3d0 | ||
|
|
5db4272f69 | ||
|
|
431888c1de | ||
|
|
1bf65c500e | ||
|
|
bcbe049c21 | ||
|
|
90090fa9e8 | ||
|
|
7420fd4de6 | ||
|
|
05113bc91a | ||
|
|
e562e149ce | ||
|
|
9de5b76336 | ||
|
|
323da791eb | ||
|
|
6990478163 | ||
|
|
63a58cfec1 | ||
|
|
3985ec2f67 | ||
|
|
a44edda6d7 | ||
|
|
66c3e5a3fd | ||
|
|
b8358aa5ab | ||
|
|
e079cf16a2 | ||
|
|
e2f69828d2 | ||
|
|
122b50bad6 | ||
|
|
e77a185e86 | ||
|
|
29fa4dedbb | ||
|
|
315cdeede9 | ||
|
|
e7a6a5b529 | ||
|
|
bfb4da1d25 | ||
|
|
4dfa0a59b2 | ||
|
|
4ef608dda3 | ||
|
|
7daf7d96f1 | ||
|
|
7c56809c7f | ||
|
|
149178ddb7 | ||
|
|
dc638e723f | ||
|
|
6f15da4cac |
5
.github/CONTRIBUTING.md
vendored
5
.github/CONTRIBUTING.md
vendored
@@ -31,7 +31,10 @@ PRs are **greatly welcome**!
|
||||
|
||||
Please run below to setup env
|
||||
```bash
|
||||
pip3 install -r requirements-dev.txt -r requirements-tests.txt
|
||||
# Install axolotl + dev and test dependencies from lockfile
|
||||
export UV_TORCH_BACKEND=cu128 # or cu130
|
||||
uv sync --extra flash-attn --extra deepspeed --group dev --group test
|
||||
source .venv/bin/activate
|
||||
pre-commit install
|
||||
|
||||
# test
|
||||
|
||||
2
.github/workflows/lint.yml
vendored
2
.github/workflows/lint.yml
vendored
@@ -6,7 +6,7 @@ on:
|
||||
types: [opened, synchronize, reopened, ready_for_review]
|
||||
paths:
|
||||
- '**.py'
|
||||
- 'requirements.txt'
|
||||
- 'pyproject.toml'
|
||||
- '.github/workflows/*.yml'
|
||||
- "*.[q]md"
|
||||
- "examples/**/*.y[a]?ml"
|
||||
|
||||
35
.github/workflows/multi-gpu-e2e.yml
vendored
35
.github/workflows/multi-gpu-e2e.yml
vendored
@@ -3,17 +3,15 @@ name: docker-multigpu-tests-biweekly
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- 'tests/e2e/multigpu/**.py'
|
||||
- 'requirements.txt'
|
||||
- 'setup.py'
|
||||
- 'pyproject.toml'
|
||||
- '.github/workflows/multi-gpu-e2e.yml'
|
||||
- 'scripts/cutcrossentropy_install.py'
|
||||
- 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
|
||||
- 'src/axolotl/utils/distributed.py'
|
||||
- "tests/e2e/multigpu/**.py"
|
||||
- "pyproject.toml"
|
||||
- ".github/workflows/multi-gpu-e2e.yml"
|
||||
- "scripts/cutcrossentropy_install.py"
|
||||
- "src/axolotl/core/trainers/mixins/sequence_parallel.py"
|
||||
- "src/axolotl/utils/distributed.py"
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: '0 0 * * 1,4' # Runs at 00:00 UTC every monday & thursday
|
||||
- cron: "0 0 * * 1,4" # Runs at 00:00 UTC every monday & thursday
|
||||
|
||||
# Cancel jobs on the same ref if a new one is triggered
|
||||
concurrency:
|
||||
@@ -33,19 +31,19 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
# - cuda: 129
|
||||
# cuda_version: 12.9.1
|
||||
# python_version: "3.12"
|
||||
# pytorch: 2.9.1
|
||||
# axolotl_extras: "fbgemm-gpu"
|
||||
# num_gpus: 2
|
||||
# dockerfile: "Dockerfile-uv.jinja"
|
||||
# - cuda: 129
|
||||
# cuda_version: 12.9.1
|
||||
# python_version: "3.12"
|
||||
# pytorch: 2.9.1
|
||||
# axolotl_extras: "fbgemm-gpu"
|
||||
# num_gpus: 2
|
||||
# dockerfile: "Dockerfile-uv.jinja"
|
||||
- cuda: 130
|
||||
cuda_version: 13.0.0
|
||||
python_version: "3.11"
|
||||
pytorch: 2.9.1
|
||||
axolotl_extras:
|
||||
# axolotl_extras: fbgemm-gpu
|
||||
# axolotl_extras: fbgemm-gpu
|
||||
num_gpus: 2
|
||||
- cuda: 128
|
||||
cuda_version: 12.8.1
|
||||
@@ -53,7 +51,6 @@ jobs:
|
||||
pytorch: 2.10.0
|
||||
axolotl_extras: "fbgemm-gpu"
|
||||
num_gpus: 2
|
||||
dockerfile: "Dockerfile-uv.jinja"
|
||||
runs-on: [self-hosted, modal]
|
||||
timeout-minutes: 120
|
||||
steps:
|
||||
@@ -75,7 +72,7 @@ jobs:
|
||||
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
|
||||
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
|
||||
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
||||
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
|
||||
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
|
||||
- name: Run tests job on Modal
|
||||
env:
|
||||
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
||||
|
||||
13
.github/workflows/pypi.yml
vendored
13
.github/workflows/pypi.yml
vendored
@@ -8,6 +8,9 @@ on:
|
||||
|
||||
permissions: {}
|
||||
|
||||
env:
|
||||
UV_SYSTEM_PYTHON: "1"
|
||||
|
||||
jobs:
|
||||
setup_release:
|
||||
name: Create Release
|
||||
@@ -41,11 +44,15 @@ jobs:
|
||||
with:
|
||||
python-version: "3.11"
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip3 install wheel packaging==26.0
|
||||
pip3 install --no-build-isolation -e .
|
||||
pip3 install -r requirements-dev.txt -r requirements-tests.txt
|
||||
uv pip install wheel packaging
|
||||
uv pip install --no-build-isolation -e .
|
||||
uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
|
||||
codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
|
||||
|
||||
- name: Extract tag name
|
||||
id: tag
|
||||
|
||||
55
.github/workflows/tests-nightly.yml
vendored
55
.github/workflows/tests-nightly.yml
vendored
@@ -2,15 +2,18 @@ name: Tests Nightly against upstream main
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: '0 0 * * *' # Runs at 00:00 UTC every day
|
||||
- cron: "0 0 * * *" # Runs at 00:00 UTC every day
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened, ready_for_review]
|
||||
paths:
|
||||
- '.github/workflows/tests-nightly.yml'
|
||||
- ".github/workflows/tests-nightly.yml"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
env:
|
||||
UV_SYSTEM_PYTHON: "1"
|
||||
|
||||
jobs:
|
||||
pre-commit:
|
||||
name: pre-commit
|
||||
@@ -20,7 +23,7 @@ jobs:
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
cache: 'pip' # caching pip dependencies
|
||||
cache: "pip" # caching pip dependencies
|
||||
- uses: pre-commit/action@v3.0.1
|
||||
env:
|
||||
SKIP: no-commit-to-branch
|
||||
@@ -43,7 +46,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python_version: ["3.12"] # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
|
||||
python_version: ["3.12"] # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
|
||||
pytorch_version: ["2.9.1", "2.10.0"]
|
||||
timeout-minutes: 20
|
||||
|
||||
@@ -61,36 +64,34 @@ jobs:
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python_version }}
|
||||
cache: 'pip' # caching pip dependencies
|
||||
|
||||
- name: upgrade pip
|
||||
run: |
|
||||
pip3 install --upgrade pip
|
||||
pip3 install --upgrade packaging==26.0 setuptools==78.1.1 wheel
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7
|
||||
|
||||
- name: Install PyTorch
|
||||
run: |
|
||||
pip3 install torch==${{ matrix.pytorch_version }} torchvision
|
||||
|
||||
- name: Update requirements.txt
|
||||
run: |
|
||||
sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt
|
||||
sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt
|
||||
sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt
|
||||
sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt
|
||||
sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt
|
||||
uv pip install torch==${{ matrix.pytorch_version }} torchvision
|
||||
uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip3 show torch
|
||||
pip3 install --no-build-isolation -U -e .
|
||||
python scripts/unsloth_install.py | sh
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
pip3 install -r requirements-dev.txt -r requirements-tests.txt
|
||||
uv pip install --no-build-isolation -e . --override /tmp/torch-pin.txt
|
||||
python scripts/cutcrossentropy_install.py --uv | sh
|
||||
uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
|
||||
codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
|
||||
|
||||
- name: Override with nightly HF packages
|
||||
run: |
|
||||
uv pip install --no-deps \
|
||||
"transformers @ git+https://github.com/huggingface/transformers.git@main" \
|
||||
"peft @ git+https://github.com/huggingface/peft.git@main" \
|
||||
"accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
|
||||
"trl @ git+https://github.com/huggingface/trl.git@main" \
|
||||
"datasets @ git+https://github.com/huggingface/datasets.git@main"
|
||||
|
||||
- name: Make sure PyTorch version wasn't clobbered
|
||||
run: |
|
||||
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
|
||||
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
|
||||
|
||||
- name: Ensure axolotl CLI was installed
|
||||
run: |
|
||||
@@ -102,9 +103,6 @@ jobs:
|
||||
pytest -v --durations=10 tests/patched/
|
||||
pytest -v --durations=10 tests/cli/
|
||||
|
||||
- name: cleanup pip cache
|
||||
run: |
|
||||
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
|
||||
|
||||
docker-e2e-tests:
|
||||
if: github.repository_owner == 'axolotl-ai-cloud'
|
||||
@@ -136,7 +134,6 @@ jobs:
|
||||
pytorch: 2.9.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
dockerfile: "Dockerfile-uv.jinja"
|
||||
nightly_build: "true"
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -157,7 +154,7 @@ jobs:
|
||||
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
|
||||
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
|
||||
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
||||
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
|
||||
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
|
||||
echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
|
||||
- name: Run tests job on Modal
|
||||
env:
|
||||
|
||||
95
.github/workflows/tests.yml
vendored
95
.github/workflows/tests.yml
vendored
@@ -6,21 +6,19 @@ on:
|
||||
branches:
|
||||
- "main"
|
||||
paths:
|
||||
- '**.py'
|
||||
- 'requirements.txt'
|
||||
- '.github/workflows/*.yml'
|
||||
- 'requirements-tests.txt'
|
||||
- 'cicd/cicd.sh'
|
||||
- 'cicd/Dockerfile.jinja'
|
||||
- "**.py"
|
||||
- "pyproject.toml"
|
||||
- ".github/workflows/*.yml"
|
||||
- "cicd/cicd.sh"
|
||||
- "cicd/Dockerfile-uv.jinja"
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened, ready_for_review]
|
||||
paths:
|
||||
- '**.py'
|
||||
- 'requirements.txt'
|
||||
- '.github/workflows/*.yml'
|
||||
- 'requirements-tests.txt'
|
||||
- 'cicd/cicd.sh'
|
||||
- 'cicd/Dockerfile.jinja'
|
||||
types: [opened, synchronize, reopened, ready_for_review]
|
||||
paths:
|
||||
- "**.py"
|
||||
- "pyproject.toml"
|
||||
- ".github/workflows/*.yml"
|
||||
- "cicd/cicd.sh"
|
||||
- "cicd/Dockerfile-uv.jinja"
|
||||
workflow_dispatch:
|
||||
|
||||
# Cancel jobs on the same ref if a new one is triggered
|
||||
@@ -33,6 +31,7 @@ permissions:
|
||||
|
||||
env:
|
||||
TRANSFORMERS_IS_CI: "yes"
|
||||
UV_SYSTEM_PYTHON: "1"
|
||||
|
||||
jobs:
|
||||
pre-commit:
|
||||
@@ -44,7 +43,7 @@ jobs:
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
cache: 'pip' # caching pip dependencies
|
||||
cache: "pip" # caching pip dependencies
|
||||
- uses: pre-commit/action@v3.0.1
|
||||
env:
|
||||
SKIP: no-commit-to-branch
|
||||
@@ -94,32 +93,25 @@ jobs:
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python_version }}
|
||||
cache: 'pip' # caching pip dependencies
|
||||
|
||||
- name: upgrade pip
|
||||
run: |
|
||||
pip3 install --upgrade pip
|
||||
pip3 install --upgrade packaging==26.0 setuptools==75.8.0 wheel
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7
|
||||
|
||||
- name: Install PyTorch
|
||||
run: |
|
||||
pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision
|
||||
uv pip install torch==${{ matrix.pytorch_version }} torchvision
|
||||
uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip3 show torch
|
||||
pip3 install --no-cache-dir --no-build-isolation -U -e .
|
||||
python scripts/unsloth_install.py | sh
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
pip3 install -r requirements-dev.txt -r requirements-tests.txt
|
||||
|
||||
- name: cleanup pip cache
|
||||
run: |
|
||||
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
|
||||
uv pip install --no-build-isolation -e . --override /tmp/torch-pin.txt
|
||||
python scripts/cutcrossentropy_install.py --uv | sh
|
||||
uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
|
||||
codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
|
||||
|
||||
- name: Make sure PyTorch version wasn't clobbered
|
||||
run: |
|
||||
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
|
||||
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
|
||||
|
||||
- name: Ensure axolotl CLI was installed
|
||||
run: |
|
||||
@@ -188,38 +180,42 @@ jobs:
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python_version }}
|
||||
cache: 'pip' # caching pip dependencies
|
||||
|
||||
- name: upgrade pip
|
||||
run: |
|
||||
pip3 install --upgrade pip
|
||||
pip3 install --upgrade packaging==26.0 setuptools==75.8.0 setuptools_scm build wheel psutil
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v7
|
||||
|
||||
- name: Install PyTorch
|
||||
run: |
|
||||
pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision
|
||||
uv pip install torch==${{ matrix.pytorch_version }} torchvision
|
||||
uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip3 show torch
|
||||
uv pip install packaging setuptools_scm build wheel psutil
|
||||
python -m build --no-isolation --sdist
|
||||
pip3 install --no-cache-dir --no-build-isolation dist/axolotl*.tar.gz
|
||||
python scripts/unsloth_install.py | sh
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
pip3 install -r requirements-dev.txt -r requirements-tests.txt
|
||||
|
||||
- name: cleanup pip cache
|
||||
run: |
|
||||
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
|
||||
uv pip install --no-build-isolation dist/axolotl*.tar.gz --override /tmp/torch-pin.txt
|
||||
python scripts/cutcrossentropy_install.py --uv | sh
|
||||
uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
|
||||
codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
|
||||
|
||||
- name: Make sure PyTorch version wasn't clobbered
|
||||
run: |
|
||||
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
|
||||
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
|
||||
|
||||
- name: Ensure axolotl CLI was installed
|
||||
run: |
|
||||
axolotl --help
|
||||
|
||||
- name: Verify agent docs are discoverable
|
||||
run: |
|
||||
# Agent docs live in docs/agents/ (source of truth) and are resolved
|
||||
# at runtime from the repo checkout or via `axolotl fetch docs`
|
||||
axolotl agent-docs --list
|
||||
axolotl agent-docs | grep -q "Fine-tuning framework"
|
||||
axolotl agent-docs grpo | grep -q "GRPO"
|
||||
axolotl agent-docs sft | grep -q "SFT"
|
||||
python -c "from axolotl.cli.agent_docs import get_doc, list_topics; assert len(list_topics()) >= 5; assert 'GRPO' in get_doc('grpo')"
|
||||
|
||||
- name: Show HF cache
|
||||
run: hf cache ls
|
||||
|
||||
@@ -281,7 +277,6 @@ jobs:
|
||||
pytorch: 2.9.1
|
||||
num_gpus: 1
|
||||
axolotl_extras:
|
||||
dockerfile: "Dockerfile-uv.jinja"
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
@@ -302,7 +297,7 @@ jobs:
|
||||
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
|
||||
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
|
||||
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
||||
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
|
||||
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
|
||||
- name: Run tests job on Modal
|
||||
env:
|
||||
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
||||
@@ -364,7 +359,7 @@ jobs:
|
||||
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
|
||||
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
||||
echo "GPU_TYPE=${{ matrix.gpu_type || 'L40S'}}" >> $GITHUB_ENV
|
||||
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
|
||||
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
|
||||
- name: Run tests job on Modal
|
||||
env:
|
||||
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
||||
|
||||
@@ -16,6 +16,9 @@ axolotl inference config.yaml # Interactive inference
|
||||
axolotl merge-lora config.yaml # Merge LoRA adapter into base model
|
||||
axolotl vllm-serve config.yaml # Start vLLM server for GRPO/EBFT training
|
||||
axolotl fetch examples # Download example configs
|
||||
axolotl agent-docs # Show agent-optimized docs (bundled with pip package)
|
||||
axolotl agent-docs grpo # Topic-specific agent reference
|
||||
axolotl config-schema # Dump config JSON schema
|
||||
```
|
||||
|
||||
## Training Methods
|
||||
@@ -23,7 +26,7 @@ axolotl fetch examples # Download example configs
|
||||
| Method | Config Key | When to Use |
|
||||
|--------|-----------|-------------|
|
||||
| SFT | *(default)* | Input-output pairs, instruction tuning |
|
||||
| DPO/IPO | `rl: dpo` / `rl: ipo` | Paired preference data (chosen vs rejected) |
|
||||
| DPO/IPO | `rl: dpo` / `rl: dpo, dpo_loss_type: ["ipo"]` | Paired preference data (chosen vs rejected) |
|
||||
| KTO | `rl: kto` | Unpaired binary preference labels |
|
||||
| ORPO | `rl: orpo` | Single-stage alignment, no ref model |
|
||||
| GRPO | `rl: grpo` | RL with verifiable reward functions (math, code) |
|
||||
@@ -35,6 +38,8 @@ Agent-specific references:
|
||||
- [docs/agents/grpo.md](docs/agents/grpo.md) — GRPO online RL with reward functions
|
||||
- [docs/agents/reward_modelling.md](docs/agents/reward_modelling.md) — outcome and process reward models
|
||||
- [docs/agents/pretraining.md](docs/agents/pretraining.md) — continual pretraining
|
||||
- [docs/agents/model_architectures.md](docs/agents/model_architectures.md) — model-specific quirks (Gemma4, Qwen3.5 MoE, etc.)
|
||||
- [docs/agents/new_model_support.md](docs/agents/new_model_support.md) — debugging and adding support for new model architectures
|
||||
|
||||
## Config Pattern
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
include requirements.txt
|
||||
include README.md
|
||||
include LICENSE
|
||||
include src/setuptools_axolotl_dynamic_dependencies.py
|
||||
include VERSION
|
||||
include src/axolotl/utils/chat_templates/templates/*.jinja
|
||||
include AGENTS.md
|
||||
recursive-include docs/agents *.md
|
||||
recursive-include axolotl *.py
|
||||
|
||||
43
README.md
43
README.md
@@ -86,7 +86,7 @@ Features:
|
||||
**Requirements**:
|
||||
|
||||
- NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
|
||||
- Python 3.11
|
||||
- Python >=3.11 (3.12 recommended)
|
||||
- PyTorch ≥2.9.1
|
||||
|
||||
### Google Colab
|
||||
@@ -95,11 +95,19 @@ Features:
|
||||
|
||||
### Installation
|
||||
|
||||
#### Using pip
|
||||
|
||||
```bash
|
||||
pip3 install -U packaging==26.0 setuptools==75.8.0 wheel ninja
|
||||
pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
|
||||
# install uv if you don't already have it installed (restart shell after)
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
|
||||
# change depending on system
|
||||
export UV_TORCH_BACKEND=cu128
|
||||
|
||||
# create a new virtual environment
|
||||
uv venv --python 3.12
|
||||
source .venv/bin/activate
|
||||
|
||||
uv pip install torch==2.10.0 torchvision
|
||||
uv pip install --no-build-isolation axolotl[deepspeed]
|
||||
|
||||
# Download example axolotl configs, deepspeed configs
|
||||
axolotl fetch examples
|
||||
@@ -110,7 +118,7 @@ axolotl fetch deepspeed_configs # OPTIONAL
|
||||
|
||||
Installing with Docker can be less error prone than installing in your own environment.
|
||||
```bash
|
||||
docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
|
||||
docker run --gpus '"all"' --ipc=host --rm -it axolotlai/axolotl:main-latest
|
||||
```
|
||||
|
||||
Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).
|
||||
@@ -157,6 +165,29 @@ That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/ge
|
||||
- [API Reference](https://docs.axolotl.ai/docs/api/) - Auto-generated code documentation
|
||||
- [FAQ](https://docs.axolotl.ai/docs/faq.html) - Frequently asked questions
|
||||
|
||||
## AI Agent Support
|
||||
|
||||
Axolotl ships with built-in documentation optimized for AI coding agents (Claude Code, Cursor, Copilot, etc.). These docs are bundled with the pip package — no repo clone needed.
|
||||
|
||||
```bash
|
||||
# Show overview and available training methods
|
||||
axolotl agent-docs
|
||||
|
||||
# Topic-specific references
|
||||
axolotl agent-docs sft # supervised fine-tuning
|
||||
axolotl agent-docs grpo # GRPO online RL
|
||||
axolotl agent-docs preference_tuning # DPO, KTO, ORPO, SimPO
|
||||
axolotl agent-docs reward_modelling # outcome and process reward models
|
||||
axolotl agent-docs pretraining # continual pretraining
|
||||
axolotl agent-docs --list # list all topics
|
||||
|
||||
# Dump config schema for programmatic use
|
||||
axolotl config-schema
|
||||
axolotl config-schema --field adapter
|
||||
```
|
||||
|
||||
If you're working with the source repo, agent docs are also available at `docs/agents/` and the project overview is in `AGENTS.md`.
|
||||
|
||||
## 🤝 Getting Help
|
||||
|
||||
- Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support
|
||||
|
||||
@@ -134,7 +134,6 @@ quartodoc:
|
||||
- monkeypatch.stablelm_attn_hijack_flash
|
||||
- monkeypatch.trainer_fsdp_optim
|
||||
- monkeypatch.transformers_fa_utils
|
||||
- monkeypatch.unsloth_
|
||||
- monkeypatch.data.batch_dataset_fetcher
|
||||
- monkeypatch.mixtral
|
||||
- monkeypatch.gradient_checkpointing.offload_cpu
|
||||
@@ -327,7 +326,6 @@ website:
|
||||
- section: "Advanced Features"
|
||||
contents:
|
||||
- docs/fsdp_qlora.qmd
|
||||
- docs/unsloth.qmd
|
||||
- docs/torchao.qmd
|
||||
- docs/custom_integrations.qmd
|
||||
- docs/sequence_parallelism.qmd
|
||||
|
||||
@@ -22,15 +22,6 @@ WORKDIR /workspace/axolotl
|
||||
RUN git fetch origin +$GITHUB_REF && \
|
||||
git checkout FETCH_HEAD
|
||||
|
||||
# If AXOLOTL_EXTRAS is set, append it in brackets
|
||||
RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
|
||||
sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
|
||||
sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
|
||||
sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
|
||||
sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
|
||||
sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
|
||||
fi
|
||||
|
||||
RUN uv pip install packaging==26.0 setuptools==78.1.1
|
||||
RUN uv pip install torchvision
|
||||
RUN uv pip uninstall causal_conv1d
|
||||
@@ -40,11 +31,21 @@ RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||
uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
|
||||
fi
|
||||
|
||||
RUN python scripts/unsloth_install.py --uv | sh
|
||||
# Override with nightly HF packages for nightly builds
|
||||
RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
|
||||
uv pip install --no-deps \
|
||||
"transformers @ git+https://github.com/huggingface/transformers.git@main" \
|
||||
"peft @ git+https://github.com/huggingface/peft.git@main" \
|
||||
"accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
|
||||
"trl @ git+https://github.com/huggingface/trl.git@main" \
|
||||
"datasets @ git+https://github.com/huggingface/datasets.git@main"; \
|
||||
fi
|
||||
|
||||
RUN python scripts/cutcrossentropy_install.py --uv | sh
|
||||
|
||||
# So we can test the Docker image
|
||||
RUN uv pip install -r requirements-dev.txt -r requirements-tests.txt
|
||||
RUN uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
|
||||
codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
|
||||
|
||||
# fix so that git fetch/pull from remote works
|
||||
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
FROM axolotlai/axolotl-base:{{ BASE_TAG }}
|
||||
|
||||
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||
ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
|
||||
ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
|
||||
ENV CUDA="{{ CUDA }}"
|
||||
ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
|
||||
ENV GITHUB_REF="{{ GITHUB_REF }}"
|
||||
ENV GITHUB_SHA="{{ GITHUB_SHA }}"
|
||||
ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
|
||||
ENV HF_HOME="{{ HF_HOME }}"
|
||||
ENV AXOLOTL_DATASET_NUM_PROC="8"
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --allow-change-held-packages vim curl nano zstd libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
|
||||
|
||||
WORKDIR /workspace/axolotl
|
||||
|
||||
RUN git fetch origin +$GITHUB_REF && \
|
||||
git checkout FETCH_HEAD
|
||||
|
||||
# If AXOLOTL_EXTRAS is set, append it in brackets
|
||||
RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
|
||||
sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
|
||||
sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
|
||||
sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
|
||||
sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
|
||||
sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
|
||||
fi
|
||||
|
||||
RUN pip install packaging==26.0 setuptools==78.1.1 psutil
|
||||
RUN pip uninstall -y causal_conv1d
|
||||
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||
pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||
else \
|
||||
pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
|
||||
fi
|
||||
|
||||
RUN python scripts/unsloth_install.py | sh
|
||||
RUN python scripts/cutcrossentropy_install.py | sh
|
||||
|
||||
# So we can test the Docker image
|
||||
RUN pip install -r requirements-dev.txt -r requirements-tests.txt
|
||||
|
||||
# fix so that git fetch/pull from remote works
|
||||
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
|
||||
git config --get remote.origin.fetch
|
||||
|
||||
# helper for huggingface-login cli
|
||||
RUN git config --global credential.helper store
|
||||
@@ -1,7 +1,7 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
|
||||
python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__, f'Expected torch $PYTORCH_VERSION but got {torch.__version__}'"
|
||||
|
||||
set -o pipefail
|
||||
for i in 1 2 3; do
|
||||
|
||||
@@ -17,7 +17,7 @@ template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
|
||||
template_env = jinja2.Environment(
|
||||
loader=template_loader, autoescape=select_autoescape()
|
||||
)
|
||||
dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
|
||||
dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile-uv.jinja")
|
||||
df_template = template_env.get_template(dockerfile)
|
||||
|
||||
df_args = {
|
||||
|
||||
@@ -16,7 +16,7 @@ template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
|
||||
template_env = jinja2.Environment(
|
||||
loader=template_loader, autoescape=select_autoescape()
|
||||
)
|
||||
dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
|
||||
dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile-uv.jinja")
|
||||
df_template = template_env.get_template(dockerfile)
|
||||
|
||||
df_args = {
|
||||
|
||||
@@ -32,7 +32,7 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \
|
||||
pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||
else \
|
||||
pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
|
||||
fi && \ python scripts/unsloth_install.py | sh && \
|
||||
fi && \
|
||||
python scripts/cutcrossentropy_install.py | sh && \
|
||||
pip install pytest && \
|
||||
pip cache purge
|
||||
|
||||
@@ -33,7 +33,6 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \
|
||||
else \
|
||||
uv pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
|
||||
fi && \
|
||||
python scripts/unsloth_install.py --uv | sh && \
|
||||
python scripts/cutcrossentropy_install.py --uv | sh && \
|
||||
uv pip install pytest && \
|
||||
uv cache clean
|
||||
|
||||
198
docs/agents/model_architectures.md
Normal file
198
docs/agents/model_architectures.md
Normal file
@@ -0,0 +1,198 @@
|
||||
# Model Architectures — Agent Reference
|
||||
|
||||
Model-specific quirks, required settings, and known issues. Check this before debugging training failures on specific model families.
|
||||
|
||||
## VLM (Vision Language Model) Quick Start
|
||||
|
||||
All VLM configs require these four lines:
|
||||
```yaml
|
||||
processor_type: AutoProcessor
|
||||
skip_prepare_dataset: true
|
||||
remove_unused_columns: false
|
||||
sample_packing: false
|
||||
```
|
||||
|
||||
Decision tree for VLM config:
|
||||
```text
|
||||
Is the model multimodal (has vision/audio encoder)?
|
||||
├─ YES: Add `freeze_mm_modules: true` if training text only
|
||||
│ Add `chat_template: <model_template>` (e.g. gemma4, qwen3_5, gemma3)
|
||||
│ LoRA: use regex `lora_target_modules` to restrict to language model
|
||||
└─ NO: Train as a regular text model
|
||||
|
||||
Is the model MoE (e.g. Gemma4 26B-A4B, Qwen3.5 35B-A3B)?
|
||||
├─ YES: Add `lora_target_parameters` for expert LoRA
|
||||
│ Consider ScatterMoE kernels (see Plugins section)
|
||||
└─ NO: Standard LoRA config
|
||||
```
|
||||
|
||||
## Plugins & Optimizations
|
||||
|
||||
### Cut Cross Entropy (CCE)
|
||||
|
||||
Computes loss from hidden states + lm_head weight without materializing the full logits tensor, saving significant VRAM. Install if not already present:
|
||||
|
||||
```bash
|
||||
uv pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@main"
|
||||
```
|
||||
|
||||
```yaml
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
```
|
||||
|
||||
### ScatterMoE Kernels
|
||||
|
||||
Fuses expert + LoRA computation into a single kernel for MoE models. Significant speedup for models with many experts.
|
||||
|
||||
```yaml
|
||||
plugins:
|
||||
- axolotl.integrations.kernels.KernelsPlugin
|
||||
use_kernels: true
|
||||
use_scattermoe: true
|
||||
experts_implementation: scattermoe
|
||||
|
||||
# Expert LoRA targets (3D parameter tensors, not nn.Linear):
|
||||
lora_target_parameters:
|
||||
- experts.gate_up_proj
|
||||
- experts.down_proj
|
||||
```
|
||||
|
||||
Supported: Gemma4 (`gemma4_text`), Mixtral, Qwen MoE variants. The plugin auto-detects model type and routing function. Without ScatterMoE, expert LoRA still works but runs base expert matmul and LoRA as separate operations.
|
||||
|
||||
## Gemma 4
|
||||
|
||||
**Models**: `google/gemma-4-26B-A4B` (MoE), `google/gemma-4-31B` (dense), `google/gemma-4-E2B`, `google/gemma-4-E4B`
|
||||
|
||||
**Architecture**: Multimodal wrapper (`Gemma4ForConditionalGeneration`) over a text backbone (`Gemma4TextModel`), with optional vision/audio encoders. All Gemma4 HF repos have `model_type: "gemma4"` — even text-only variants load as multimodal with a vision tower.
|
||||
|
||||
### Required settings
|
||||
|
||||
```yaml
|
||||
# Always needed for Gemma4:
|
||||
freeze_mm_modules: true # Freeze vision/audio encoders for text-only training
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false # Shared per-layer norms cause "marked ready twice" with reentrant
|
||||
|
||||
# LoRA target — restrict to language model only (DO NOT use lora_target_linear: true):
|
||||
lora_target_modules: 'model.language_model.layers.[\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||
```
|
||||
|
||||
### Auto-detection
|
||||
|
||||
Axolotl auto-detects Gemma4 and applies:
|
||||
- `use_reentrant: false` for gradient checkpointing
|
||||
- `ddp_find_unused_parameters: true` for DDP (skipped when `activation_offloading: true`)
|
||||
|
||||
### Multi-GPU
|
||||
|
||||
| Strategy | Works? | Notes |
|
||||
|----------|--------|-------|
|
||||
| DDP | Yes | Auto-sets `ddp_find_unused_parameters=True` |
|
||||
| DDP + activation_offloading | Yes | `find_unused_parameters` is skipped (conflicts with checkpoint wrappers) |
|
||||
| FSDP1 | No | OOM during dequantization/sharding with QLoRA |
|
||||
| FSDP2 | Yes | Use `Gemma4TextDecoderLayer` (not `Gemma4DecoderLayer`) as wrap class |
|
||||
| FSDP2 + activation_offloading | Yes | Lowest VRAM (~26 GiB/GPU for 26B-A4B) |
|
||||
|
||||
FSDP2 config:
|
||||
```yaml
|
||||
fsdp:
|
||||
- full_shard
|
||||
- auto_wrap
|
||||
fsdp_config:
|
||||
fsdp_version: 2
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_transformer_layer_cls_to_wrap: Gemma4TextDecoderLayer
|
||||
```
|
||||
|
||||
### MoE (26B-A4B)
|
||||
|
||||
- `enable_moe_block: true`, 256 experts, top-k routing
|
||||
- No separate `SparseMoeBlock` — MoE is embedded in each decoder layer
|
||||
- Expert LoRA targets 3D parameter tensors:
|
||||
```yaml
|
||||
lora_target_parameters:
|
||||
- experts.gate_up_proj
|
||||
- experts.down_proj
|
||||
```
|
||||
- ScatterMoE kernel acceleration:
|
||||
```yaml
|
||||
plugins:
|
||||
- axolotl.integrations.kernels.KernelsPlugin
|
||||
use_kernels: true
|
||||
use_scattermoe: true
|
||||
experts_implementation: scattermoe
|
||||
```
|
||||
|
||||
### VLM (Vision) Training
|
||||
|
||||
All Gemma4 models load as `Gemma4ForConditionalGeneration` with a vision tower. No custom `ProcessingStrategy` needed — the base class auto-detects the image token.
|
||||
|
||||
```yaml
|
||||
base_model: google/gemma-4-E2B-it # or E4B-it, 26B-A4B
|
||||
processor_type: AutoProcessor
|
||||
freeze_mm_modules: true
|
||||
chat_template: gemma4
|
||||
|
||||
skip_prepare_dataset: true
|
||||
remove_unused_columns: false
|
||||
sample_packing: false
|
||||
```
|
||||
|
||||
A starting VLM loss of ~8-15 is typical. In most runs, loss converges below 1.0 within ~30-50 steps, though results may vary across configurations.
|
||||
|
||||
For the 26B-A4B MoE variant with ScatterMoE + expert LoRA + CCE, add:
|
||||
```yaml
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
- axolotl.integrations.kernels.KernelsPlugin
|
||||
use_kernels: true
|
||||
use_scattermoe: true
|
||||
experts_implementation: scattermoe
|
||||
lora_target_parameters:
|
||||
- experts.gate_up_proj
|
||||
- experts.down_proj
|
||||
```
|
||||
|
||||
### Common issues
|
||||
|
||||
| Symptom | Cause | Fix |
|
||||
|---------|-------|-----|
|
||||
| `mm_token_type_ids is required` in DDP | `model.config` not accessible through DDP wrapper | Already fixed — `unwrap_model()` in `compute_loss` and `prediction_step` |
|
||||
| `marked a variable ready twice` in DDP | `ddp_find_unused_parameters=True` + activation_offloading checkpoint wrappers | Auto-handled — `find_unused_parameters` is skipped when `activation_offloading: true` |
|
||||
| Loss ~12 instead of ~0.5 | Using `lora_target_linear: true` (applies LoRA to vision/audio modules) | Use the regex `lora_target_modules` pattern instead |
|
||||
| FSDP2 `Could not find Gemma4AudioLayer` | Auto-wrap detects `_no_split_modules` including audio layers that don't exist | Explicitly set `fsdp_transformer_layer_cls_to_wrap: Gemma4TextDecoderLayer` |
|
||||
| `Gemma4ClippableLinear not supported` by PEFT | Vision tower uses a non-standard linear wrapper | Axolotl patches this automatically via `_patch_peft_clippable_linear()` |
|
||||
|
||||
### E2B/E4B dense models
|
||||
|
||||
These have `hidden_size_per_layer_input: 256` (per-layer input embeddings) and `attention_k_eq_v: False`. Known issue: loss starts higher than expected (~12 vs ~0.5 for 26B). Root cause under investigation — may be related to the per-layer input mechanism or the `Gemma4ForConditionalGeneration` loss computation.
|
||||
|
||||
## Gemma 3
|
||||
|
||||
**Models**: `google/gemma-3-*`
|
||||
|
||||
- `ddp_find_unused_parameters: true` needed (multimodal unused params)
|
||||
- `use_reentrant: false` recommended
|
||||
- Attention mask must be dropped for sample packing (handled automatically)
|
||||
- Multi-GPU test currently skipped (`tests/e2e/multigpu/test_gemma3.py`)
|
||||
|
||||
## Qwen 3.5 MoE
|
||||
|
||||
**Models**: `Qwen/Qwen3.5-35B-A3B`
|
||||
|
||||
- Hybrid architecture: DeltaNet linear attention (30 layers) + full attention (10 layers)
|
||||
- 256 experts, 8 active per token
|
||||
- Known weight scale drift in late DeltaNet layers (36-38) due to AdamW + rare expert interaction
|
||||
- Fix: `normalize_weight_scales` config to detect and rescale outliers:
|
||||
```yaml
|
||||
normalize_weight_scales:
|
||||
- name_pattern: 'linear_attn\.conv1d\.weight'
|
||||
threshold: 1.3
|
||||
```
|
||||
|
||||
## General MoE Notes
|
||||
|
||||
- `lora_target_linear: true` with multimodal MoE models will apply LoRA to ALL linear modules including vision/audio encoders — use regex `lora_target_modules` to restrict to language model only
|
||||
- Rare experts get larger effective learning rate from AdamW (small second-moment estimates) — can cause weight drift in recurrent/SSM components. Use `normalize_weight_scales` with `dry_run: true` to detect.
|
||||
- For ScatterMoE kernel support, set `experts_implementation: scattermoe` and add the KernelsPlugin
|
||||
181
docs/agents/new_model_support.md
Normal file
181
docs/agents/new_model_support.md
Normal file
@@ -0,0 +1,181 @@
|
||||
# New Model Support — Agent Reference
|
||||
|
||||
Guide for debugging and adding support for new model architectures in axolotl. Based on lessons learned from Gemma4, Gemma3, Qwen2-VL, and other multimodal/MoE models.
|
||||
|
||||
## Quick Validation Checklist
|
||||
|
||||
When testing a new model, run through these checks in order:
|
||||
|
||||
1. **Does the model load?** `axolotl preprocess config.yaml` — catches config schema errors
|
||||
2. **Does LoRA apply?** Check for "Unsupported layer type" warnings from PEFT
|
||||
3. **Is the initial loss sane?** First-step loss for a pretrained model should be 0.5–2.0 for SFT
|
||||
4. **Does sample packing work?** Compare loss with `sample_packing: true` vs `false` — should be similar
|
||||
5. **Is CCE active?** Check for "Applying Cut Cross Entropy" log and verify peak VRAM is lower
|
||||
|
||||
## Loss Debugging
|
||||
|
||||
### Expected initial loss
|
||||
A pretrained model doing SFT should start with loss roughly in the 0.5–2.0 range. If loss starts above 3.0, something is wrong. If it's near `log(vocab_size)` (≈ 12 for 262K vocab), the model is predicting at random — attention masking or model weights are broken.
|
||||
|
||||
### Direct comparison technique
|
||||
The fastest way to isolate a loss issue — bypass the trainer entirely:
|
||||
|
||||
```python
|
||||
# Load model via axolotl's pipeline (applies all patches)
|
||||
from axolotl.cli.config import load_cfg
|
||||
from axolotl.utils.config import normalize_config, prepare_plugins
|
||||
from axolotl.loaders.tokenizer import load_tokenizer
|
||||
from axolotl.loaders.model import ModelLoader
|
||||
|
||||
cfg = load_cfg("your_config.yaml")
|
||||
normalize_config(cfg)
|
||||
prepare_plugins(cfg)
|
||||
tokenizer = load_tokenizer(cfg)
|
||||
model, _ = ModelLoader(cfg, tokenizer).load()
|
||||
|
||||
# Forward pass on preprocessed data
|
||||
model.train()
|
||||
out = model(input_ids, labels=labels)
|
||||
print(f"Direct loss: {out.loss.item()}") # Compare to trainer's reported loss
|
||||
```
|
||||
|
||||
If direct loss is correct (~1.0) but trainer reports 3–4x higher, check `model_accepts_loss_kwargs` (see below).
|
||||
|
||||
### `model_accepts_loss_kwargs` inflation
|
||||
HF Trainer checks if the model's `forward()` has `**kwargs` and sets `model_accepts_loss_kwargs=True`. This changes loss normalization: the trainer does NOT divide loss by `gradient_accumulation_steps` before logging. The gradient is correct — only the logged loss is inflated.
|
||||
|
||||
**Symptom**: Logged loss ≈ actual_loss × gradient_accumulation_steps.
|
||||
|
||||
**Which models are affected**: Any model with `**kwargs` in forward (common in multimodal models for extra inputs like `mm_token_type_ids`, `pixel_values`, etc.).
|
||||
|
||||
**Fix location**: `src/axolotl/core/trainers/base.py` `__init__()` — after `super().__init__()`, check if the unwrapped model actually has `num_items_in_batch` in its forward signature. If not, set `self.model_accepts_loss_kwargs = False`.
|
||||
|
||||
## Multimodal Models (ForConditionalGeneration)
|
||||
|
||||
Many recent models use `ForConditionalGeneration` as the top-level class, not `ForCausalLM`:
|
||||
- Gemma3 → `Gemma3ForConditionalGeneration`
|
||||
- Gemma4 → `Gemma4ForConditionalGeneration`
|
||||
- Qwen2-VL → `Qwen2VLForConditionalGeneration`
|
||||
- LLaVA → `LlavaForConditionalGeneration`
|
||||
|
||||
### Why this matters
|
||||
|
||||
| Component | Targets `ForCausalLM` | Needs `ForConditionalGeneration` |
|
||||
|-----------|----------------------|--------------------------------|
|
||||
| CCE patches | ✅ (default) | ❌ silently inactive if not patched |
|
||||
| PEFT LoRA | ✅ | May fail on custom layer types |
|
||||
| HF Trainer label handling | ✅ | May need extra inputs |
|
||||
|
||||
### Required extra inputs
|
||||
Multimodal models require special inputs during training even for text-only data:
|
||||
|
||||
| Model | Required Input | Value for Text-Only |
|
||||
|-------|---------------|-------------------|
|
||||
| Gemma4 | `mm_token_type_ids` | `torch.zeros_like(input_ids)` |
|
||||
| Gemma3 | `token_type_ids` | `torch.zeros_like(input_ids)` |
|
||||
|
||||
Auto-inject in `compute_loss()` when not provided by the data collator. See `core/trainers/base.py`.
|
||||
|
||||
### Custom layer types and PEFT
|
||||
Vision towers often use custom module wrappers that PEFT doesn't support:
|
||||
|
||||
| Model | Custom Layer | Wraps | Fix |
|
||||
|-------|-------------|-------|-----|
|
||||
| Gemma4 | `Gemma4ClippableLinear` | `nn.Linear` | Redirect to `.linear` child |
|
||||
|
||||
Fix location: `src/axolotl/loaders/adapter.py` `_patch_peft_clippable_linear()`.
|
||||
|
||||
## Sample Packing
|
||||
|
||||
### How packed sequence detection works (transformers ≥ 5.x)
|
||||
`transformers.masking_utils._preprocess_mask_arguments()` detects packed sequences from `position_ids` resets. But **only when `attention_mask is None`**:
|
||||
|
||||
```python
|
||||
# From masking_utils.py:
|
||||
if position_ids is not None and attention_mask is None and past_key_values is None:
|
||||
packed_sequence_mask = find_packed_sequence_indices(position_ids)
|
||||
```
|
||||
|
||||
If the collator provides an all-ones `attention_mask`, packing detection is **skipped** and the model builds a single causal mask spanning all packed sequences → cross-sequence attention leakage → very high loss.
|
||||
|
||||
### Fix for models using `create_causal_mask_mapping`
|
||||
For Gemma3, Gemma4, and similar models that use the new transformers masking system, remove `attention_mask` from inputs when sample packing is active:
|
||||
|
||||
```python
|
||||
# In compute_loss():
|
||||
if (
|
||||
self.args.sample_packing
|
||||
and model_type in ("gemma4", "gemma3")
|
||||
and "attention_mask" in inputs
|
||||
and "position_ids" in inputs
|
||||
):
|
||||
del inputs["attention_mask"]
|
||||
```
|
||||
|
||||
Fix location: `src/axolotl/core/trainers/base.py` `compute_loss()`.
|
||||
|
||||
### Models that DON'T need this fix
|
||||
Older models that use `_prepare_4d_causal_attention_mask` (Llama, Mistral, Qwen2, etc.) handle sample packing via axolotl's multipack attention monkeypatch instead. Only models using the new `create_causal_mask_mapping` / `create_causal_mask` masking system need the `attention_mask` removal.
|
||||
|
||||
## Attention Backend Selection
|
||||
|
||||
| Backend | Config | head_dim limit | torch_compile | Notes |
|
||||
|---------|--------|---------------|---------------|-------|
|
||||
| FA2 | `flash_attention: true` | 256 | ✅ | Fastest when supported |
|
||||
| FA4 | auto with `flash_attention: true` | 256 (SM90+) | ✅ | Auto-detected on H100+ |
|
||||
| SDPA | `sdp_attention: true` | None | ✅ | Universal fallback |
|
||||
| flex | `flex_attention: true` | None | ⚠️ Triton OOM for large head_dim | Good for variable head dims |
|
||||
| eager | neither set | None | ✅ | Slowest, always works |
|
||||
|
||||
**Check model support**: Look at `_supports_flash_attn_2`, `_supports_flex_attn`, `_supports_sdpa` attributes on the model class.
|
||||
|
||||
**head_dim gotcha**: The 256 limit is specific to flash-attn CUDA kernels, NOT PyTorch-level. SDPA and flex_attention both handle arbitrary head_dim. Models with `global_head_dim > 256` (Gemma4: 512) must use SDPA or flex.
|
||||
|
||||
**flex + compile gotcha**: `torch_compile` with flex_attention can hit Triton shared memory OOM for large head_dim. Falls back to eager per-function (not a crash, but slower). Unsloth disables flex for Gemma4 for this reason.
|
||||
|
||||
## Cut Cross Entropy (CCE)
|
||||
|
||||
### How CCE patches work
|
||||
CCE replaces the model's `forward()` with a fused version that computes loss from hidden states + lm_head weight without materializing the full logits tensor. This saves ~`batch × seq_len × vocab_size × dtype_bytes` of VRAM.
|
||||
|
||||
### Adding CCE for a new model
|
||||
1. Check if the model type is in `cut_cross_entropy.transformers.patch.PATCH_FNS`
|
||||
2. If not, axolotl's generic fallback (`integrations/cut_cross_entropy/__init__.py` `patch_llama_like()`) patches `{Prefix}ForCausalLM.forward` with `cce_forward`
|
||||
3. For multimodal models (`ForConditionalGeneration`), a model-specific patch is needed in `ml-cross-entropy` repo
|
||||
4. The multimodal `cce_forward` must accept all extra kwargs (pixel_values, mm_token_type_ids, etc.) and pop any that would conflict before calling `self.model()`
|
||||
|
||||
### Common CCE pitfall
|
||||
If CCE appears active (log says "Applying Cut Cross Entropy") but peak VRAM doesn't decrease, check which class was patched. If the model loads as `ForConditionalGeneration` but CCE patched `ForCausalLM`, the patch is silently inactive.
|
||||
|
||||
## MoE Models
|
||||
|
||||
### Dense MLP vs MoE experts
|
||||
Some MoE models (e.g., Gemma4) have BOTH dense MLP layers and MoE expert layers at every decoder layer:
|
||||
- `gate_proj/up_proj/down_proj` → targets the **dense MLP** (`Gemma4TextMLP`)
|
||||
- `experts.gate_up_proj/experts.down_proj` → targets the **MoE experts** (`Gemma4TextExperts`)
|
||||
|
||||
LoRA on the dense MLP works normally. Expert LoRA via `lora_target_parameters` requires PEFT support for the specific expert module type (may warn "Unsupported layer type").
|
||||
|
||||
### ScatterMoE kernels
|
||||
`use_scattermoe: true` with `experts_implementation: scattermoe` registers fused expert kernels via transformers' `ExpertsInterface`. Significant speedup for MoE models. Requires the kernels plugin:
|
||||
```yaml
|
||||
plugins:
|
||||
- axolotl.integrations.kernels.KernelsPlugin
|
||||
use_kernels: true
|
||||
use_scattermoe: true
|
||||
experts_implementation: scattermoe
|
||||
```
|
||||
|
||||
## Where to Add Model-Specific Fixes
|
||||
|
||||
| What | Where | Example |
|
||||
|------|-------|---------|
|
||||
| Missing forward inputs | `core/trainers/base.py` `compute_loss()` | mm_token_type_ids injection |
|
||||
| Attention mask fixes | `core/trainers/base.py` `compute_loss()` | Sample packing mask removal |
|
||||
| Loss logging fixes | `core/trainers/base.py` `__init__()` | model_accepts_loss_kwargs override |
|
||||
| PEFT/LoRA patches | `loaders/adapter.py` | ClippableLinear redirect |
|
||||
| Attention patches | `monkeypatch/attention/` | FA4 tuple fix |
|
||||
| Model-specific patches | `loaders/patch_manager.py` `_apply_model_specific_patches()` | Llama4, Kimi, NemotronH |
|
||||
| CCE patches | `ml-cross-entropy` repo `transformers/` | Per-model cce_forward |
|
||||
| Example configs | `examples/<model>/` | Validated YAML |
|
||||
| Config validation | `utils/schemas/validation.py` | Compatibility checks |
|
||||
@@ -38,7 +38,7 @@ No vLLM server needed (unlike GRPO). Offline RL with pre-collected preference da
|
||||
|
||||
1. Paired preference data (chosen + rejected)?
|
||||
- Default → `rl: dpo`
|
||||
- Overfitting → `rl: ipo`
|
||||
- Overfitting → `rl: dpo, dpo_loss_type: ["ipo"]`
|
||||
- VRAM-limited → `rl: orpo` (no ref model)
|
||||
- Length-sensitive → `rl: simpo` (no ref model)
|
||||
2. Only binary labels (good/bad)? → `rl: kto`
|
||||
|
||||
@@ -91,6 +91,30 @@ Watch for: loss never decreasing (check `train_on_inputs`, dataset, LR), loss go
|
||||
| FSDP save hangs | Use `fsdp_state_dict_type: FULL_STATE_DICT` |
|
||||
| DeepSpeed CheckpointError | Set `use_reentrant: true` in `gradient_checkpointing_kwargs` |
|
||||
|
||||
## Profiling
|
||||
|
||||
To profile training and identify optimization opportunities:
|
||||
|
||||
```yaml
|
||||
# Profile steps 3-7 (after warmup/autotuning settles)
|
||||
profiler_steps_start: 3
|
||||
profiler_steps: 5
|
||||
```
|
||||
|
||||
This produces `profiler_trace.json` (Chrome trace) and `snapshot.pickle` (memory snapshot) in `output_dir`.
|
||||
View the Chrome trace at `chrome://tracing`.
|
||||
|
||||
To programmatically inspect the trace:
|
||||
```bash
|
||||
python scripts/analyze_profile.py output_dir/
|
||||
```
|
||||
|
||||
The trace shows per-kernel CUDA times, memory allocations, and operator-level breakdown. Look for:
|
||||
- **Large matmul kernels**: candidates for fusion or quantization
|
||||
- **Memory copies (H2D/D2H)**: unnecessary data movement
|
||||
- **Small frequent kernels**: candidates for kernel fusion
|
||||
- **Gaps between kernels**: pipeline bubbles from CPU overhead
|
||||
|
||||
Full troubleshooting: [training_stability.qmd](../training_stability.qmd), [debugging.qmd](../debugging.qmd)
|
||||
|
||||
## File Map
|
||||
|
||||
@@ -108,6 +108,14 @@ datasets:
|
||||
type: chat_template
|
||||
```
|
||||
|
||||
::: {.callout-tip}
|
||||
`chat_template_jinja` also accepts a file path to a `.jinja2` file instead of an inline string:
|
||||
|
||||
```yaml
|
||||
chat_template_jinja: ./path/to/my_template.jinja2
|
||||
```
|
||||
:::
|
||||
|
||||
::: {.callout-important}
|
||||
Please make sure that your `tokenizer.eos_token` is the same as the EOS (End-of-Sequence) token in the template. Otherwise, set `eos_token` under `special_tokens:`.
|
||||
:::
|
||||
@@ -294,6 +302,113 @@ datasets:
|
||||
It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
|
||||
:::
|
||||
|
||||
#### Content parts with per-part training control
|
||||
|
||||
Instead of using character offsets with `train_detail`, you can split a message's content into a list of parts, each with its own training flag. This is useful when you want to mask specific sections of a response (e.g., mask reasoning but train on the answer).
|
||||
|
||||
```{.json filename="data.jsonl"}
|
||||
{
|
||||
"messages": [
|
||||
{"role": "user", "content": [{"type": "text", "text": "What is 2+2?"}]},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": [
|
||||
{"type": "text", "text": "Let me think step by step...", "train": false},
|
||||
{"type": "text", "text": " The answer is 4.", "train": true}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
The configuration is the same as standard `chat_template` — no extra fields needed:
|
||||
|
||||
```yaml
|
||||
datasets:
|
||||
- path: ...
|
||||
type: chat_template
|
||||
roles_to_train: ["assistant"]
|
||||
```
|
||||
|
||||
Each content part supports:
|
||||
|
||||
- `type`: `"text"` (required)
|
||||
- `text`: the text value (also accepts `content` or `value` as the key)
|
||||
- `train`: `true`/`false` (optional) — whether to train on this part
|
||||
- `weight`: `0`/`1` (optional) — alternative to `train`
|
||||
|
||||
If a part has no `train` or `weight` flag, it inherits the turn-level training decision (from `roles_to_train`, `message_field_training`, or `train_on_inputs`).
|
||||
|
||||
::: {.callout-warning title="Whitespace at part boundaries"}
|
||||
BPE tokenizers (used by Llama, Qwen, Mistral, GPT, etc.) prepend spaces to word tokens. For example, `" answer"` is a single token — the space is part of it. This means **where you place whitespace between content parts matters**:
|
||||
|
||||
**Split BEFORE spaces** (space goes with the next part):
|
||||
|
||||
```json
|
||||
[
|
||||
{"type": "text", "text": "Let me think...", "train": false},
|
||||
{"type": "text", "text": " The answer is 4.", "train": true}
|
||||
]
|
||||
```
|
||||
|
||||
**DON'T put trailing spaces** on a part (the space merges with the next word into one token that straddles the boundary, and straddling tokens are masked):
|
||||
|
||||
```json
|
||||
[
|
||||
{"type": "text", "text": "Let me think... ", "train": false},
|
||||
{"type": "text", "text": "The answer is 4.", "train": true}
|
||||
]
|
||||
```
|
||||
|
||||
In the bad example, `" The"` becomes a single token that spans both parts. Because it straddles the boundary, it is conservatively **masked** (not trained) — even though the second part has `train: true`.
|
||||
|
||||
**Newlines** typically merge with preceding punctuation (e.g., `":\n"` is one token). Keep newlines with the preceding part:
|
||||
|
||||
```json
|
||||
[
|
||||
{"type": "text", "text": "Thinking:\n", "train": false},
|
||||
{"type": "text", "text": "The answer is 4.", "train": true}
|
||||
]
|
||||
```
|
||||
|
||||
Axolotl will log a warning if it detects trailing whitespace at a boundary between parts with different training flags.
|
||||
:::
|
||||
|
||||
::: {.callout-note}
|
||||
When all content parts in a message are strings, they are concatenated before being passed to the chat template. This means content parts work with **any** Jinja template — the template sees a plain string, and the per-part training flags are applied during tokenization.
|
||||
:::
|
||||
|
||||
##### Per-part training on reasoning_content
|
||||
|
||||
For templates that support a separate `reasoning_content` field (e.g., `qwen3`), the same content-parts format works on `reasoning_content`. This is useful for masking incorrect reasoning steps while training on self-corrections:
|
||||
|
||||
```{.json filename="data.jsonl"}
|
||||
{
|
||||
"messages": [
|
||||
{"role": "user", "content": [{"type": "text", "text": "What is 2+2?"}]},
|
||||
{
|
||||
"role": "assistant",
|
||||
"reasoning_content": [
|
||||
{"type": "text", "text": "Hmm maybe 2+2=5.", "train": false},
|
||||
{"type": "text", "text": " Wait no, 2+2=4.", "train": true}
|
||||
],
|
||||
"content": [
|
||||
{"type": "text", "text": "The answer is 4.", "train": true}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
The `reasoning_content` and `content` fields are handled independently — each has its own token boundaries and per-part masking. No additional configuration is needed beyond what the template already requires.
|
||||
|
||||
::: {.callout-tip}
|
||||
When `reasoning_content` is provided as a separate field, `split_thinking` is not needed — the reasoning is already separated from the content in the data.
|
||||
:::
|
||||
|
||||
The same whitespace rules apply to `reasoning_content` parts as to `content` parts — split before spaces, keep newlines with the preceding part.
|
||||
|
||||
|
||||
#### Reasoning split
|
||||
|
||||
(For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
|
||||
|
||||
@@ -76,8 +76,9 @@ datasets:
|
||||
Make sure you have an [editable install](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) of Axolotl, which ensures that changes you make to the code are reflected at runtime. Run the following commands from the root of this project:
|
||||
|
||||
```bash
|
||||
pip3 install packaging
|
||||
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
|
||||
export UV_TORCH_BACKEND=cu128 # or cu130
|
||||
uv sync --extra flash-attn --extra deepspeed --group dev --group test
|
||||
source .venv/bin/activate
|
||||
```
|
||||
|
||||
#### Remote Hosts
|
||||
@@ -208,17 +209,17 @@ cd axolotl
|
||||
Next, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:[^2]
|
||||
|
||||
```bash
|
||||
docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-py3.10-cu118-2.0.1
|
||||
docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl-uv:main-latest
|
||||
```
|
||||
|
||||
>[!Tip]
|
||||
> To understand which containers are available, see the [Docker section of the README](../README.md#docker) and the [DockerHub repo](https://hub.docker.com/r/axolotlai/axolotl/tags). For details of how the Docker containers are built, see axolotl's [Docker CI builds](../.github/workflows/main.yml).
|
||||
|
||||
You will now be in the container. Next, perform an editable install of Axolotl:
|
||||
You will now be in the container. Next, install Axolotl with dev dependencies:
|
||||
|
||||
```bash
|
||||
pip3 install packaging
|
||||
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
|
||||
uv sync --extra flash-attn --extra deepspeed --group dev --group test
|
||||
source .venv/bin/activate
|
||||
```
|
||||
|
||||
### Attach To Container
|
||||
|
||||
@@ -6,23 +6,30 @@ format:
|
||||
toc-depth: 4
|
||||
---
|
||||
|
||||
This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).
|
||||
This section describes the different Docker images that are released by AxolotlAI at
|
||||
[Docker Hub](https://hub.docker.com/u/axolotlai).
|
||||
|
||||
::: {.callout-important}
|
||||
For Blackwell GPUs, please use the tags with PyTorch 2.7.1 and CUDA 12.8.
|
||||
For Blackwell GPUs, please use the tags with PyTorch 2.9.1 and CUDA 12.8.
|
||||
:::
|
||||
|
||||
::: {.callout-tip}
|
||||
Each image below is available in a **uv variant** that uses [uv](https://docs.astral.sh/uv/) with
|
||||
a relocatable venv (`/workspace/axolotl-venv`) instead of Miniconda + pip. Append `-uv` to the image name
|
||||
(e.g. `axolotlai/axolotl-base-uv`). Tags follow the same format. We recommend the uv images for new deployments.
|
||||
:::
|
||||
|
||||
## Base
|
||||
|
||||
The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.
|
||||
The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image.
|
||||
It includes python, torch, git, git-lfs, awscli, pydantic, and more.
|
||||
|
||||
#### Image
|
||||
|
||||
```
|
||||
axolotlai/axolotl-base
|
||||
```
|
||||
|
||||
Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-base)
|
||||
| Variant | Image | Docker Hub |
|
||||
|---------|-------|------------|
|
||||
| pip | `axolotlai/axolotl-base` | [Link](https://hub.docker.com/r/axolotlai/axolotl-base) |
|
||||
| uv | `axolotlai/axolotl-base-uv` | [Link](https://hub.docker.com/r/axolotlai/axolotl-base-uv) |
|
||||
|
||||
#### Tags format
|
||||
|
||||
@@ -32,8 +39,10 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}
|
||||
|
||||
Tags examples:
|
||||
|
||||
- `main-base-py3.11-cu128-2.8.0`
|
||||
- `main-base-py3.11-cu128-2.9.1`
|
||||
- `main-base-py3.12-cu128-2.10.0`
|
||||
- `main-base-py3.12-cu130-2.9.1`
|
||||
- `main-base-py3.12-cu130-2.10.0`
|
||||
|
||||
## Main
|
||||
|
||||
@@ -41,11 +50,10 @@ The main image is the image that is used to run Axolotl. It is based on the `axo
|
||||
|
||||
#### Image
|
||||
|
||||
```
|
||||
axolotlai/axolotl
|
||||
```
|
||||
|
||||
Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl)
|
||||
| Variant | Image | Docker Hub |
|
||||
|---------|-------|------------|
|
||||
| pip | `axolotlai/axolotl` | [Link](https://hub.docker.com/r/axolotlai/axolotl) |
|
||||
| uv | `axolotlai/axolotl-uv` | [Link](https://hub.docker.com/r/axolotlai/axolotl-uv) |
|
||||
|
||||
#### Tags format {#sec-main-tags}
|
||||
|
||||
@@ -53,7 +61,7 @@ Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl)
|
||||
# on push to main
|
||||
main-py{python_version}-cu{cuda_version}-{pytorch_version}
|
||||
|
||||
# latest main (currently torch 2.6.0, python 3.11, cuda 12.4)
|
||||
# latest main (currently torch 2.9.1, python 3.11, cuda 12.8)
|
||||
main-latest
|
||||
|
||||
# nightly build
|
||||
@@ -71,11 +79,12 @@ There may be some extra tags appended to the image, like `-vllm` which installs
|
||||
|
||||
Tags examples:
|
||||
|
||||
- `main-py3.11-cu128-2.8.0`
|
||||
- `main-py3.11-cu128-2.9.1`
|
||||
- `main-py3.12-cu128-2.10.0`
|
||||
- `main-py3.12-cu130-2.9.1`
|
||||
- `main-py3.12-cu130-2.10.0`
|
||||
- `main-latest`
|
||||
- `main-20250303-py3.11-cu124-2.6.0`
|
||||
- `main-20250303-py3.11-cu126-2.6.0`
|
||||
- `main-20260315-py3.11-cu128-2.9.1`
|
||||
- `0.12.0`
|
||||
|
||||
## Cloud
|
||||
@@ -90,11 +99,10 @@ Jupyter lab is run by default. Set `JUPYTER_DISABLE=1` in the environment variab
|
||||
|
||||
#### Image
|
||||
|
||||
```
|
||||
axolotlai/axolotl-cloud
|
||||
```
|
||||
|
||||
Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-cloud)
|
||||
| Variant | Image | Docker Hub |
|
||||
|---------|-------|------------|
|
||||
| pip | `axolotlai/axolotl-cloud` | [Link](https://hub.docker.com/r/axolotlai/axolotl-cloud) |
|
||||
| uv | `axolotlai/axolotl-cloud-uv` | [Link](https://hub.docker.com/r/axolotlai/axolotl-cloud-uv) |
|
||||
|
||||
#### Tags format
|
||||
|
||||
|
||||
@@ -15,64 +15,30 @@ This guide covers all the ways you can install and set up Axolotl for your envir
|
||||
|
||||
- NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
|
||||
- Python ≥3.11
|
||||
- PyTorch ≥2.6.0
|
||||
- PyTorch ≥2.9.0
|
||||
|
||||
## Installation Methods {#sec-installation-methods}
|
||||
|
||||
::: {.callout-important}
|
||||
Please make sure to have Pytorch installed before installing Axolotl in your local environment.
|
||||
|
||||
Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
|
||||
:::
|
||||
## Installation {#sec-installation}
|
||||
|
||||
::: {.callout-important}
|
||||
For Blackwell GPUs, please use Pytorch 2.9.1 and CUDA 12.8.
|
||||
:::
|
||||
|
||||
### PyPI Installation (Recommended) {#sec-pypi}
|
||||
### Quick Install {#sec-uv}
|
||||
|
||||
```{.bash}
|
||||
pip3 install -U packaging setuptools wheel ninja
|
||||
pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
|
||||
```
|
||||
Axolotl uses [uv](https://docs.astral.sh/uv/) as its package manager. uv is a fast, reliable Python package installer and resolver built in Rust.
|
||||
|
||||
We use `--no-build-isolation` in order to detect the installed PyTorch version (if
|
||||
installed) in order not to clobber it, and so that we set the correct version of
|
||||
dependencies that are specific to the PyTorch version or other installed
|
||||
co-dependencies.
|
||||
|
||||
### uv Installation {#sec-uv}
|
||||
|
||||
uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments.
|
||||
|
||||
Install uv if not already installed
|
||||
Install uv if not already installed:
|
||||
```{.bash}
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
source $HOME/.local/bin/env
|
||||
```
|
||||
|
||||
Choose your CUDA version to use with PyTorch; e.g. `cu124`, `cu126`, `cu128`,
|
||||
then create the venv and activate
|
||||
Choose your CUDA version (e.g. `cu128`, `cu130`), create a venv, and install:
|
||||
```{.bash}
|
||||
export UV_TORCH_BACKEND=cu126
|
||||
export UV_TORCH_BACKEND=cu128 # or cu130
|
||||
uv venv --no-project --relocatable
|
||||
source .venv/bin/activate
|
||||
```
|
||||
|
||||
Install PyTorch
|
||||
- PyTorch 2.6.0 recommended
|
||||
```{.bash}
|
||||
uv pip install packaging setuptools wheel
|
||||
uv pip install torch==2.6.0
|
||||
uv pip install awscli pydantic
|
||||
```
|
||||
|
||||
Install axolotl from PyPi
|
||||
```{.bash}
|
||||
uv pip install --no-build-isolation axolotl[deepspeed,flash-attn]
|
||||
|
||||
# optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO
|
||||
uv pip install --no-build-isolation axolotl[deepspeed,flash-attn,vllm]
|
||||
uv pip install --no-build-isolation axolotl[flash-attn,deepspeed]
|
||||
```
|
||||
|
||||
### Edge/Development Build {#sec-edge-build}
|
||||
@@ -82,14 +48,17 @@ For the latest features between releases:
|
||||
```{.bash}
|
||||
git clone https://github.com/axolotl-ai-cloud/axolotl.git
|
||||
cd axolotl
|
||||
pip3 install -U packaging setuptools wheel ninja
|
||||
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
|
||||
export UV_TORCH_BACKEND=cu128 # or cu130
|
||||
uv sync --extra flash-attn --extra deepspeed
|
||||
source .venv/bin/activate
|
||||
```
|
||||
|
||||
`uv sync` creates a `.venv`, installs exact pinned versions from `uv.lock`, and sets up an editable install automatically.
|
||||
|
||||
### Docker {#sec-docker}
|
||||
|
||||
```{.bash}
|
||||
docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
|
||||
docker run --gpus '"all"' --rm -it --ipc=host axolotlai/axolotl-uv:main-latest
|
||||
```
|
||||
|
||||
For development with Docker:
|
||||
@@ -106,12 +75,12 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
|
||||
--ulimit memlock=-1 --ulimit stack=67108864 \
|
||||
--mount type=bind,src="${PWD}",target=/workspace/axolotl \
|
||||
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
|
||||
axolotlai/axolotl:main-latest
|
||||
axolotlai/axolotl-uv:main-latest
|
||||
```
|
||||
:::
|
||||
|
||||
::: {.callout-important}
|
||||
For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.9.1` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.9.1`.
|
||||
For Blackwell GPUs, please use `axolotlai/axolotl-uv:main-py3.11-cu128-2.9.1` or the cloud variant `axolotlai/axolotl-cloud-uv:main-py3.11-cu128-2.9.1`.
|
||||
:::
|
||||
|
||||
Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.
|
||||
@@ -122,7 +91,7 @@ Please refer to the [Docker documentation](docker.qmd) for more information on t
|
||||
|
||||
For providers supporting Docker:
|
||||
|
||||
- Use `axolotlai/axolotl-cloud:main-latest`
|
||||
- Use `axolotlai/axolotl-cloud-uv:main-latest`
|
||||
- Available on:
|
||||
- [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
|
||||
- [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=axolotl&utm_medium=partner&utm_campaign=template_launch_july2025&utm_content=docs_link)
|
||||
@@ -141,7 +110,7 @@ For providers supporting Docker:
|
||||
### macOS {#sec-macos}
|
||||
|
||||
```{.bash}
|
||||
pip3 install --no-build-isolation -e '.'
|
||||
uv pip install --no-build-isolation -e '.'
|
||||
```
|
||||
|
||||
See @sec-troubleshooting for Mac-specific issues.
|
||||
@@ -152,21 +121,44 @@ See @sec-troubleshooting for Mac-specific issues.
|
||||
We recommend using WSL2 (Windows Subsystem for Linux) or Docker.
|
||||
:::
|
||||
|
||||
## Environment Managers {#sec-env-managers}
|
||||
## Migrating from pip to uv {#sec-migrating}
|
||||
|
||||
### Conda/Pip venv {#sec-conda}
|
||||
If you have an existing pip-based Axolotl installation, you can migrate to uv:
|
||||
|
||||
1. Install Python ≥3.11
|
||||
2. Install PyTorch: https://pytorch.org/get-started/locally/
|
||||
3. Install Axolotl:
|
||||
```{.bash}
|
||||
pip3 install -U packaging setuptools wheel ninja
|
||||
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
|
||||
```
|
||||
4. (Optional) Login to Hugging Face:
|
||||
```{.bash}
|
||||
hf auth login
|
||||
```
|
||||
```{.bash}
|
||||
# Install uv
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
source $HOME/.local/bin/env
|
||||
|
||||
# Create a fresh venv (recommended for a clean start)
|
||||
export UV_TORCH_BACKEND=cu128 # or cu130
|
||||
uv venv --no-project --relocatable
|
||||
source .venv/bin/activate
|
||||
|
||||
# Reinstall axolotl
|
||||
uv pip install --no-build-isolation axolotl[flash-attn,deepspeed]
|
||||
```
|
||||
|
||||
## Using pip (Alternative) {#sec-pip}
|
||||
|
||||
If you are unable to install uv, you can still use pip directly.
|
||||
|
||||
::: {.callout-important}
|
||||
Please make sure to have PyTorch installed before installing Axolotl with pip.
|
||||
|
||||
Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
|
||||
:::
|
||||
|
||||
```{.bash}
|
||||
pip3 install -U packaging setuptools wheel ninja
|
||||
pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
|
||||
```
|
||||
|
||||
For editable/development installs:
|
||||
```{.bash}
|
||||
pip3 install -U packaging setuptools wheel ninja
|
||||
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
|
||||
```
|
||||
|
||||
## Troubleshooting {#sec-troubleshooting}
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ format:
|
||||
|
||||
## Supported Models
|
||||
|
||||
- [Gemma-4](#sec-gemma-4) *(NEW)*
|
||||
- [Mllama](#sec-mllama)
|
||||
- [Llama4](#sec-llama4)
|
||||
- [Pixtral](#sec-pixtral)
|
||||
@@ -138,6 +139,40 @@ base_model: mistralai/Voxtral-Mini-3B-2507
|
||||
processor_type: VoxtralProcessor
|
||||
```
|
||||
|
||||
### Gemma-4 {#sec-gemma-4}
|
||||
|
||||
All Gemma 4 variants (E2B, E4B, 26B-A4B, 31B) load as multimodal models even for text-only training.
|
||||
|
||||
```yaml
|
||||
base_model: google/gemma-4-E2B-it # or E4B-it, 26B-A4B, 31B
|
||||
|
||||
chat_template: gemma4
|
||||
freeze_mm_modules: true # freeze vision/audio encoders for text-only or vision LoRA
|
||||
|
||||
# For the 26B-A4B MoE model, enable ScatterMoE and expert LoRA:
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
- axolotl.integrations.kernels.KernelsPlugin
|
||||
use_kernels: true
|
||||
use_scattermoe: true
|
||||
experts_implementation: scattermoe
|
||||
|
||||
lora_target_modules: 'model.language_model.layers.[\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||
|
||||
# MoE expert LoRA (3D tensors, not nn.Linear) — only for 26B-A4B:
|
||||
lora_target_parameters:
|
||||
- experts.gate_up_proj
|
||||
- experts.down_proj
|
||||
```
|
||||
|
||||
::: {.callout-warning}
|
||||
Gemma 4 VLM training starts with high loss (~8-15). This is expected — see the [training stability guide](training_stability.qmd) for details.
|
||||
:::
|
||||
|
||||
::: {.callout-tip}
|
||||
For DDP training, axolotl auto-detects Gemma4 and sets `use_reentrant=False` and `ddp_find_unused_parameters=True`. However, when `activation_offloading: true`, `ddp_find_unused_parameters` is skipped (checkpoint wrappers conflict with it); use `freeze_mm_modules: true` instead to handle unused vision/audio params. For FSDP2, use `fsdp_transformer_layer_cls_to_wrap: Gemma4TextDecoderLayer`.
|
||||
:::
|
||||
|
||||
### Gemma-3 {#sec-gemma-3}
|
||||
|
||||
::: {.callout-tip}
|
||||
|
||||
@@ -320,8 +320,10 @@ The input format is a simple JSON input with customizable fields based on the ab
|
||||
As IPO is just DPO with a different loss function, all supported dataset formats for [DPO](#dpo) are also supported for IPO.
|
||||
|
||||
```yaml
|
||||
rl: ipo
|
||||
rl: dpo
|
||||
dpo_loss_type: ["ipo"]
|
||||
```
|
||||
*Note:* Passing `rl: ipo` directly is still supported, but will soon be deprecated.
|
||||
|
||||
### ORPO
|
||||
|
||||
|
||||
@@ -137,50 +137,6 @@ This means the policy has diverged significantly from the weights used by vLLM f
|
||||
- Increase `gradient_accumulation_steps` to smooth out noisy batches.
|
||||
- Check for NaN issues (see next section).
|
||||
|
||||
## MoE Weight Scale Drift
|
||||
|
||||
**Symptom**: Model works on short prompts but loses coherence on long conversations — repeating itself, "philosophizing", or generating broken code. Particularly affects MoE models with recurrent/SSM components (e.g. DeltaNet linear attention).
|
||||
|
||||
**Root cause**: In MoE models trained with AdamW, rarely-activated experts accumulate smaller second-moment estimates. This gives them a disproportionately large effective learning rate, causing their weights to drift to higher variance than the group norm. In recurrent components like `conv1d` in DeltaNet layers, this amplifies short-range context and washes out long-range state.
|
||||
|
||||
**Detection**: Use `normalize_weight_scales` with `dry_run: true` to scan for anomalies without modifying weights:
|
||||
|
||||
```yaml
|
||||
normalize_weight_scales:
|
||||
- name_pattern: 'linear_attn\.conv1d\.weight'
|
||||
threshold: 1.3
|
||||
dry_run: true
|
||||
```
|
||||
|
||||
This logs any tensors matching the pattern whose standard deviation exceeds 1.3x the group median. Example output:
|
||||
|
||||
```
|
||||
normalize_weight_scales [DRY RUN]: pattern 'linear_attn\.conv1d\.weight' —
|
||||
3/30 tensors outside 1.3x threshold (median std=0.062733):
|
||||
layers.36.linear_attn.conv1d.weight: std=0.101870 (1.62x median)
|
||||
layers.37.linear_attn.conv1d.weight: std=0.102362 (1.63x median)
|
||||
layers.38.linear_attn.conv1d.weight: std=0.089227 (1.42x median)
|
||||
```
|
||||
|
||||
Each rule accepts:
|
||||
|
||||
- `name_pattern`: regex matched against parameter names. All matching tensors form a group.
|
||||
- `threshold`: flag tensors whose std deviates from the group median by more than this factor (default: 1.5).
|
||||
- `dry_run`: when `true`, log anomalies without modifying weights (default: `false`).
|
||||
|
||||
Multiple rules can target different tensor patterns:
|
||||
|
||||
```yaml
|
||||
normalize_weight_scales:
|
||||
- name_pattern: 'linear_attn\.conv1d\.weight'
|
||||
threshold: 1.3
|
||||
- name_pattern: 'experts\.gate_up_proj'
|
||||
threshold: 1.5
|
||||
dry_run: true # just check these, don't fix
|
||||
```
|
||||
|
||||
The transform runs after model loading but before adapter injection, so it modifies the base model weights directly.
|
||||
|
||||
## NaN and Inf Handling
|
||||
|
||||
### Common Causes
|
||||
|
||||
@@ -1,53 +0,0 @@
|
||||
---
|
||||
title: "Unsloth"
|
||||
description: "Hyper-optimized QLoRA finetuning for single GPUs"
|
||||
---
|
||||
|
||||
### Overview
|
||||
|
||||
Unsloth provides hand-written optimized kernels for LLM finetuning that slightly improve speed and VRAM over
|
||||
standard industry baselines.
|
||||
|
||||
::: {.callout-important}
|
||||
Due to breaking changes in transformers `v4.48.0`, users will need to downgrade to `<=v4.47.1` to use this patch.
|
||||
|
||||
This will later be deprecated in favor of [LoRA Optimizations](lora_optims.qmd).
|
||||
:::
|
||||
|
||||
|
||||
### Installation
|
||||
|
||||
The following will install the correct unsloth and extras from source.
|
||||
|
||||
```bash
|
||||
python scripts/unsloth_install.py | sh
|
||||
```
|
||||
|
||||
### Usage
|
||||
|
||||
Axolotl exposes a few configuration options to try out unsloth and get most of the performance gains.
|
||||
|
||||
Our unsloth integration is currently limited to the following model architectures:
|
||||
- llama
|
||||
|
||||
These options are specific to LoRA finetuning and cannot be used for multi-GPU finetuning:
|
||||
```yaml
|
||||
unsloth_lora_mlp: true
|
||||
unsloth_lora_qkv: true
|
||||
unsloth_lora_o: true
|
||||
```
|
||||
|
||||
These options are composable and can be used with multi-GPU finetuning:
|
||||
```yaml
|
||||
unsloth_cross_entropy_loss: true
|
||||
unsloth_rms_norm: true
|
||||
unsloth_rope: true
|
||||
```
|
||||
|
||||
### Limitations
|
||||
|
||||
- Single GPU only; i.e. no multi-GPU support
|
||||
- No deepspeed or FSDP support (requires multi-gpu)
|
||||
- LoRA + QLoRA support only. No full fine tunes or fp8 support.
|
||||
- Limited model architecture support. Llama, Phi, Gemma, Mistral only
|
||||
- No MoE support.
|
||||
@@ -15,8 +15,7 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r
|
||||
Here is an example of how to install from pip:
|
||||
```bash
|
||||
# Ensure you have a compatible version of PyTorch installed
|
||||
pip3 install packaging setuptools wheel ninja
|
||||
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
||||
uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
||||
```
|
||||
|
||||
2. Run one of the finetuning examples below.
|
||||
@@ -35,7 +34,7 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r
|
||||
|
||||
**LFM2-MoE**
|
||||
```bash
|
||||
pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
|
||||
uv pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
|
||||
|
||||
# LoRA SFT (1x48GB @ 16.2GiB)
|
||||
axolotl train examples/LiquidAI/lfm2-8b-a1b-lora.yaml
|
||||
@@ -45,7 +44,7 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r
|
||||
|
||||
- **Installation Error**: If you encounter `ImportError: ... undefined symbol ...` or `ModuleNotFoundError: No module named 'causal_conv1d_cuda'`, the `causal-conv1d` package may have been installed incorrectly. Try uninstalling it:
|
||||
```bash
|
||||
pip uninstall -y causal-conv1d
|
||||
uv pip uninstall causal-conv1d
|
||||
```
|
||||
|
||||
- **Dataset Loading**: Read more on how to load your own dataset in our [documentation](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||
|
||||
@@ -15,8 +15,7 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
|
||||
git clone https://github.com/axolotl-ai-cloud/axolotl.git
|
||||
cd axolotl
|
||||
|
||||
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
|
||||
pip3 install --no-build-isolation -e '.[flash-attn]'
|
||||
uv pip install --no-build-isolation -e '.[flash-attn]'
|
||||
|
||||
# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
@@ -31,7 +30,7 @@ python scripts/cutcrossentropy_install.py | sh
|
||||
# For those using our Docker image, use the below path.
|
||||
export CUDA_HOME=/usr/local/cuda
|
||||
|
||||
pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
|
||||
uv pip install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
|
||||
```
|
||||
|
||||
For any installation errors, see [XIELU Installation Issues](#xielu-installation-issues)
|
||||
@@ -67,7 +66,7 @@ If those didn't help, please try the below solutions:
|
||||
1. Pass env for CMAKE and try install again:
|
||||
|
||||
```bash
|
||||
Python_EXECUTABLE=$(which python) pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
|
||||
Python_EXECUTABLE=$(which python) uv pip install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
|
||||
```
|
||||
|
||||
2. Git clone the repo and manually hardcode python path:
|
||||
@@ -92,7 +91,7 @@ If those didn't help, please try the below solutions:
|
||||
```
|
||||
|
||||
```bash
|
||||
pip3 install . --no-build-isolation --no-deps
|
||||
uv pip install . --no-build-isolation --no-deps
|
||||
```
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
@@ -17,8 +17,7 @@ Thanks to the team at Arcee.ai for using Axolotl in supervised fine-tuning the A
|
||||
git clone https://github.com/axolotl-ai-cloud/axolotl.git
|
||||
cd axolotl
|
||||
|
||||
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
|
||||
pip3 install --no-build-isolation -e '.[flash-attn]'
|
||||
uv pip install --no-build-isolation -e '.[flash-attn]'
|
||||
|
||||
# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
|
||||
@@ -40,7 +40,7 @@
|
||||
"%%capture\n",
|
||||
"# This step can take ~5-10 minutes to install dependencies\n",
|
||||
"!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
|
||||
"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@63b15e6\""
|
||||
"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@fec1a88\""
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -16,8 +16,7 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
|
||||
|
||||
```bash
|
||||
# Ensure you have PyTorch installed (PyTorch 2.6.0 min)
|
||||
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
|
||||
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
||||
uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
||||
```
|
||||
|
||||
2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage
|
||||
|
||||
@@ -26,8 +26,8 @@ output_dir: ./outputs/out
|
||||
|
||||
# Freeze vision tower
|
||||
unfrozen_parameters:
|
||||
- ^model\.language_model\..*
|
||||
- ^lm_head\..*
|
||||
- ^model.language_model.*
|
||||
- ^lm_head.*
|
||||
|
||||
adapter: qlora
|
||||
lora_r: 32
|
||||
|
||||
@@ -26,8 +26,8 @@ output_dir: ./outputs/out
|
||||
|
||||
# Freeze vision tower
|
||||
unfrozen_parameters:
|
||||
- ^model\.language_model\..*
|
||||
- ^lm_head\..*
|
||||
- ^model.language_model.*
|
||||
- ^lm_head.*
|
||||
|
||||
adapter: qlora
|
||||
lora_r: 32
|
||||
|
||||
@@ -22,8 +22,8 @@ output_dir: ./outputs/out
|
||||
|
||||
# Freeze vision tower
|
||||
unfrozen_parameters:
|
||||
- ^model\.language_model\..*
|
||||
- ^lm_head\..*
|
||||
- ^model.language_model.*
|
||||
- ^lm_head.*
|
||||
|
||||
adapter: qlora
|
||||
lora_model_dir:
|
||||
|
||||
@@ -10,17 +10,16 @@ Gemma-3n is a family of multimodal models from Google found on [HuggingFace](htt
|
||||
|
||||
```bash
|
||||
# Ensure you have PyTorch installed (PyTorch 2.6.0 min)
|
||||
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
|
||||
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
||||
uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
||||
```
|
||||
|
||||
2. In addition to Axolotl's requirements, Gemma-3n requires:
|
||||
|
||||
```bash
|
||||
pip3 install timm==1.0.17
|
||||
uv pip install timm==1.0.17
|
||||
|
||||
# for loading audio data
|
||||
pip3 install librosa==0.11.0
|
||||
uv pip install librosa==0.11.0
|
||||
```
|
||||
|
||||
3. Download sample dataset files
|
||||
|
||||
@@ -1,19 +1,12 @@
|
||||
# Gemma 4 26B-A4B MoE QLoRA with ScatterMoE kernels
|
||||
#
|
||||
# Validated: 50 steps on FineTome-100k, loss 7.4 -> 2.4, single RTX 5090 (32GB)
|
||||
# Validated: 50 steps on FineTome-100k, loss 8.8 -> 1.8, single RTX 5090 (32GB)
|
||||
# torch_compile=true: 21 GiB peak VRAM, ~230 tok/s, 336s total
|
||||
#
|
||||
# Key notes:
|
||||
# - Flash Attention 2 is NOT supported (global_head_dim=512 > FA2 max of 256).
|
||||
# Use sdp_attention instead.
|
||||
# - Gemma 4 is multimodal (text+vision+audio). For text-only SFT, restrict
|
||||
# LoRA to the text backbone via lora_target_linear_modules regex.
|
||||
# - MoE experts use `experts_implementation: scattermoe` — Gemma 4 embeds MoE
|
||||
# directly in the decoder layer (no SparseMoeBlock), so we register ScatterMoE
|
||||
# via the transformers ExpertsInterface.
|
||||
# - Expert LoRA targets are `experts.gate_up_proj` / `experts.down_proj`
|
||||
# (no `mlp.` prefix, unlike Qwen/Mixtral).
|
||||
# - micro_batch_size: 1 fits 2048 seq_len on 32GB GPU with SDP attention.
|
||||
# Use micro_batch_size: 4 with 1024 seq_len, or on 48GB+ GPUs.
|
||||
# - Max sequence length on 32GB GPU: 2048 (micro_batch_size=1, SDP attention).
|
||||
# 4096 seq_len OOMs due to head_dim=512 math SDP materializing full score matrix.
|
||||
# Use 48GB+ GPUs for longer sequences or multi-GPU with FSDP.
|
||||
|
||||
base_model: google/gemma-4-26B-A4B
|
||||
|
||||
@@ -24,7 +17,7 @@ plugins:
|
||||
use_kernels: true
|
||||
use_scattermoe: true
|
||||
experts_implementation: scattermoe
|
||||
torch_compile: false
|
||||
torch_compile: true
|
||||
liger_layer_norm: true
|
||||
liger_rope: true
|
||||
liger_rms_norm: true
|
||||
@@ -54,12 +47,9 @@ lora_r: 16
|
||||
lora_alpha: 32
|
||||
lora_dropout: 0
|
||||
|
||||
# Restrict LoRA to text backbone only (skip vision/audio encoders).
|
||||
# lora_target_modules is intentionally empty — all module targeting is done
|
||||
# via regex in lora_target_linear_modules below.
|
||||
lora_target_modules: []
|
||||
lora_target_linear_modules:
|
||||
- language_model\.model\.layers\.\d+\.self_attn\.(q|k|v|o)_proj
|
||||
# Restrict LoRA to text backbone only (skip vision/audio encoders)
|
||||
# using regex to match only the text decoder attention projections.
|
||||
lora_target_modules: 'model.language_model.layers.[\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||
|
||||
# MoE expert LoRA (3D Parameter tensors, not nn.Linear)
|
||||
lora_target_parameters:
|
||||
@@ -73,7 +63,7 @@ lora_o_kernel: false
|
||||
bnb_config_kwargs:
|
||||
bnb_4bit_use_double_quant: true
|
||||
|
||||
wandb_project: gemma4-qlora
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
@@ -93,8 +83,7 @@ gradient_checkpointing: true
|
||||
activation_offloading: true
|
||||
logging_steps: 1
|
||||
|
||||
# FA2 not supported — Gemma4 global_head_dim=512 exceeds FA2 max of 256
|
||||
flash_attention: false
|
||||
# FA2 not supported
|
||||
sdp_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
|
||||
71
examples/gemma4/31b-qlora-flex.yaml
Normal file
71
examples/gemma4/31b-qlora-flex.yaml
Normal file
@@ -0,0 +1,71 @@
|
||||
base_model: google/gemma-4-31B
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
torch_compile: true
|
||||
liger_layer_norm: true
|
||||
liger_rope: true
|
||||
liger_rms_norm: true
|
||||
liger_glu_activation: true
|
||||
liger_rms_norm_gated: true
|
||||
strict: false
|
||||
|
||||
chat_template: gemma4
|
||||
datasets:
|
||||
- path: mlabonne/FineTome-100k
|
||||
type: chat_template
|
||||
split: train[:10%]
|
||||
field_messages: conversations
|
||||
message_property_mappings:
|
||||
role: from
|
||||
content: value
|
||||
val_set_size: 0.05
|
||||
output_dir: ./outputs/gemma4-31b-qlora-flex
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
|
||||
load_in_4bit: true
|
||||
adapter: qlora
|
||||
lora_r: 16
|
||||
lora_alpha: 32
|
||||
lora_dropout: 0
|
||||
|
||||
# Restrict LoRA to text backbone only (skip vision/audio encoders)
|
||||
lora_target_modules: 'model.language_model.layers.[\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||
|
||||
lora_mlp_kernel: false
|
||||
lora_qkv_kernel: false
|
||||
lora_o_kernel: false
|
||||
|
||||
bnb_config_kwargs:
|
||||
bnb_4bit_use_double_quant: true
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 1
|
||||
optimizer: adamw_torch_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: auto
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
activation_offloading: true
|
||||
logging_steps: 1
|
||||
|
||||
# FA not supported
|
||||
flex_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
special_tokens:
|
||||
69
examples/gemma4/31b-qlora.yaml
Normal file
69
examples/gemma4/31b-qlora.yaml
Normal file
@@ -0,0 +1,69 @@
|
||||
base_model: google/gemma-4-31B
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
torch_compile: false
|
||||
liger_layer_norm: true
|
||||
liger_rope: true
|
||||
liger_rms_norm: true
|
||||
liger_glu_activation: true
|
||||
liger_rms_norm_gated: true
|
||||
strict: false
|
||||
|
||||
chat_template: gemma4
|
||||
datasets:
|
||||
- path: mlabonne/FineTome-100k
|
||||
type: chat_template
|
||||
split: train[:10%]
|
||||
field_messages: conversations
|
||||
message_property_mappings:
|
||||
role: from
|
||||
content: value
|
||||
val_set_size: 0.05
|
||||
output_dir: ./outputs/gemma4-31b-qlora
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
|
||||
load_in_4bit: true
|
||||
adapter: qlora
|
||||
lora_r: 16
|
||||
lora_alpha: 32
|
||||
lora_dropout: 0
|
||||
|
||||
# Restrict LoRA to text backbone only (skip vision/audio encoders)
|
||||
# using regex to match only the text decoder attention projections.
|
||||
lora_target_modules: 'model.language_model.layers.[\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||
|
||||
bnb_config_kwargs:
|
||||
bnb_4bit_use_double_quant: true
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: auto
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
activation_offloading: true
|
||||
logging_steps: 1
|
||||
|
||||
# FA not supported
|
||||
sdp_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 4
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
special_tokens:
|
||||
60
examples/gemma4/README.md
Normal file
60
examples/gemma4/README.md
Normal file
@@ -0,0 +1,60 @@
|
||||
# Finetune Google's Gemma 4 with Axolotl
|
||||
|
||||
[Gemma 4](https://huggingface.co/collections/google/gemma-4) is a family of multimodal models from Google. This guide covers how to train them with Axolotl.
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
|
||||
|
||||
2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
|
||||
|
||||
3. Run the finetuning example:
|
||||
|
||||
```bash
|
||||
# 26B MoE QLoRA (1x80GB @ ~50 GiB)
|
||||
axolotl train examples/gemma4/26b-a4b-moe-qlora.yaml
|
||||
|
||||
# 31B Dense QLoRA (1x80GB @ ~44 GiB)
|
||||
axolotl train examples/gemma4/31b-qlora.yaml
|
||||
|
||||
# 31B Dense QLoRA Flex Attn (1x80GB @ ~26 GiB)
|
||||
axolotl train examples/gemma4/31b-qlora-flex.yaml
|
||||
```
|
||||
|
||||
### MoE Expert Quantization & Expert LoRA (26B-A4B only)
|
||||
|
||||
The 26B-A4B config uses ScatterMoE kernels via the transformers `ExpertsInterface` and quantizes expert weights on load. To learn about expert quantization, expert LoRA targeting, and related limitations, see the [MoE Expert Quantization](https://docs.axolotl.ai/docs/expert_quantization.html) docs.
|
||||
|
||||
## Flex Attention
|
||||
|
||||
Reduce ~40% VRAM (at the cost of up to half throughput) by setting the below (shown in `examples/gemma4/31b-qlora-flex.yaml`):
|
||||
|
||||
```yaml
|
||||
torch_compile: true
|
||||
flex_attention: true
|
||||
```
|
||||
|
||||
This works for both the MoE and Dense model.
|
||||
|
||||
## Limitations
|
||||
|
||||
- **Flash Attention**: FA2 (max head_dim=256) and FA4 (max head_dim=128) cannot support Gemma 4's `global_head_dim=512`. Use SDP or flex attention instead.
|
||||
- **LoRA kernels**: Not supported due to KV-sharing layers.
|
||||
- **lora_target_linear**: Incompatible for multimodal models — use `lora_target_modules` with a regex to restrict LoRA to the text backbone.
|
||||
|
||||
### TIPS
|
||||
|
||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
||||
- You can run full finetuning by removing `adapter: qlora`, `load_in_4bit: true`, and `quantize_moe_experts: true` from the config. This is heavy and has not been tested.
|
||||
|
||||
## Optimization Guides
|
||||
|
||||
Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
|
||||
|
||||
## Related Resources
|
||||
|
||||
- [Gemma 4 Blog](https://huggingface.co/blog/gemma4)
|
||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
||||
- [Axolotl Website](https://axolotl.ai)
|
||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
||||
62
examples/gemma4/e2b-vision-lora.yaml
Normal file
62
examples/gemma4/e2b-vision-lora.yaml
Normal file
@@ -0,0 +1,62 @@
|
||||
# Gemma 4 E2B Vision LoRA
|
||||
#
|
||||
# Fine-tuning LM LoRA adapters on multimodal Gemma4 with vision/multimodal modules frozen.
|
||||
# Uses the base ProcessingStrategy (auto-detects image_token from processor).
|
||||
|
||||
base_model: google/gemma-4-E2B-it
|
||||
processor_type: AutoProcessor
|
||||
freeze_mm_modules: true
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
strict: false
|
||||
|
||||
# Required for vision/multimodal training
|
||||
skip_prepare_dataset: true
|
||||
remove_unused_columns: false
|
||||
sample_packing: false
|
||||
|
||||
chat_template: gemma4
|
||||
datasets:
|
||||
- path: HuggingFaceH4/llava-instruct-mix-vsft
|
||||
type: chat_template
|
||||
split: train[:100]
|
||||
|
||||
val_set_size: 0
|
||||
output_dir: ./outputs/gemma4-e2b-vision-lora
|
||||
|
||||
adapter: lora
|
||||
sequence_len: 2048
|
||||
pad_to_sequence_len: false
|
||||
|
||||
lora_r: 16
|
||||
lora_alpha: 32
|
||||
lora_dropout: 0
|
||||
# Target language model only — vision encoder is frozen via freeze_mm_modules
|
||||
lora_target_modules: 'model.language_model.layers.[\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
max_steps: 10
|
||||
optimizer: adamw_torch_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: auto
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
logging_steps: 1
|
||||
sdp_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
weight_decay: 0.0
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
@@ -14,8 +14,7 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
|
||||
|
||||
```bash
|
||||
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
|
||||
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
|
||||
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
||||
uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
||||
```
|
||||
|
||||
2. Choose one of the following configs below for training the 20B model. (for 120B, see [below](#training-120b))
|
||||
@@ -87,7 +86,7 @@ for more information about using a special vllm-openai docker image for inferenc
|
||||
Optionally, vLLM can be installed from nightly:
|
||||
|
||||
```bash
|
||||
pip install --no-build-isolation --pre -U vllm --extra-index-url https://wheels.vllm.ai/nightly
|
||||
uv pip install --no-build-isolation --pre -U vllm --extra-index-url https://wheels.vllm.ai/nightly
|
||||
```
|
||||
and the vLLM server can be started with the following command (modify `--tensor-parallel-size 8` to match your environment):
|
||||
```bash
|
||||
|
||||
@@ -15,8 +15,7 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
|
||||
git clone https://github.com/axolotl-ai-cloud/axolotl.git
|
||||
cd axolotl
|
||||
|
||||
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
|
||||
pip3 install --no-build-isolation -e '.[flash-attn]'
|
||||
uv pip install --no-build-isolation -e '.[flash-attn]'
|
||||
|
||||
# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
|
||||
@@ -13,8 +13,7 @@ Tencent released a family of opensource models called HunYuan with varying param
|
||||
git clone https://github.com/axolotl-ai-cloud/axolotl.git
|
||||
cd axolotl
|
||||
|
||||
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
|
||||
pip3 install --no-build-isolation -e '.[flash-attn]'
|
||||
uv pip install --no-build-isolation -e '.[flash-attn]'
|
||||
|
||||
# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
|
||||
@@ -11,7 +11,7 @@ This guide shows how to fine-tune it with Axolotl.
|
||||
2. Install `timm` for vision model support:
|
||||
|
||||
```bash
|
||||
pip install timm==1.0.19
|
||||
uv pip install timm==1.0.19
|
||||
```
|
||||
|
||||
3. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
|
||||
|
||||
@@ -14,8 +14,7 @@ Thanks to the team at MistralAI for giving us early access to prepare for these
|
||||
|
||||
```bash
|
||||
# Ensure you have Pytorch installed (Pytorch 2.7.0 min)
|
||||
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
|
||||
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
||||
uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
||||
```
|
||||
|
||||
2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage
|
||||
|
||||
@@ -12,7 +12,7 @@ Before starting, ensure you have:
|
||||
|
||||
1. Install the required vision lib:
|
||||
```bash
|
||||
pip install 'mistral-common[opencv]==1.8.5'
|
||||
uv pip install 'mistral-common[opencv]==1.8.5'
|
||||
```
|
||||
|
||||
2. Download the example dataset image:
|
||||
|
||||
@@ -23,7 +23,7 @@ Note: This is still experimental given it is based on transformers v5 RC.
|
||||
git checkout transformers-v5
|
||||
|
||||
# Install packages for transformers v5
|
||||
pip install -e .
|
||||
uv pip install -e .
|
||||
```
|
||||
|
||||
4. Run the fine-tuning:
|
||||
|
||||
@@ -12,7 +12,7 @@ Before starting, ensure you have:
|
||||
|
||||
1. Install the required vision lib:
|
||||
```bash
|
||||
pip install 'mistral-common[opencv]==1.8.6'
|
||||
uv pip install 'mistral-common[opencv]==1.8.6'
|
||||
```
|
||||
|
||||
2. Download the example dataset image:
|
||||
|
||||
@@ -12,7 +12,7 @@ Before starting, ensure you have:
|
||||
|
||||
1. Install the required vision lib:
|
||||
```bash
|
||||
pip install 'mistral-common[opencv]==1.8.5'
|
||||
uv pip install 'mistral-common[opencv]==1.8.5'
|
||||
```
|
||||
|
||||
2. Download the example dataset image:
|
||||
|
||||
@@ -13,7 +13,7 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
|
||||
3. Install transformers from main
|
||||
|
||||
```bash
|
||||
pip install git+https://github.com/huggingface/transformers.git
|
||||
uv pip install git+https://github.com/huggingface/transformers.git
|
||||
```
|
||||
|
||||
4. Run one of the example configs:
|
||||
|
||||
@@ -1,5 +1,15 @@
|
||||
base_model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
|
||||
liger_layer_norm: true
|
||||
liger_rope: true
|
||||
liger_rms_norm: true
|
||||
liger_glu_activation: true
|
||||
liger_rms_norm_gated: true
|
||||
|
||||
# LoRA kernel patches are incompatible with this architecture — see README.
|
||||
lora_mlp_kernel: false
|
||||
lora_qkv_kernel: false
|
||||
@@ -22,8 +32,6 @@ dataset_prepared_path: last_run_prepared
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
|
||||
use_cut_cross_entropy: true
|
||||
|
||||
load_in_4bit: true
|
||||
quantize_moe_experts: true
|
||||
adapter: qlora
|
||||
@@ -31,16 +39,16 @@ lora_r: 16
|
||||
lora_alpha: 32
|
||||
lora_dropout: 0.0
|
||||
lora_target_modules:
|
||||
# Attention projection layers (present in ~12 attention layers out of 88)
|
||||
- q_proj
|
||||
- k_proj
|
||||
- v_proj
|
||||
- o_proj
|
||||
# To also train MoE expert weights, add them via lora_target_parameters
|
||||
# (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
|
||||
# lora_target_parameters:
|
||||
# - up_proj
|
||||
# - down_proj
|
||||
|
||||
# To also train MoE expert weights, add them via lora_target_parameters
|
||||
# (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
|
||||
# lora_target_parameters:
|
||||
# - up_proj
|
||||
# - down_proj
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
|
||||
@@ -1,6 +1,16 @@
|
||||
# See examples/nemotron-h/README.md for architecture notes and requirements.
|
||||
base_model: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
|
||||
liger_layer_norm: true
|
||||
liger_rope: true
|
||||
liger_rms_norm: true
|
||||
liger_glu_activation: true
|
||||
liger_rms_norm_gated: true
|
||||
|
||||
# LoRA kernel patches are incompatible with this architecture — see README.
|
||||
lora_mlp_kernel: false
|
||||
lora_qkv_kernel: false
|
||||
@@ -23,8 +33,6 @@ dataset_prepared_path: last_run_prepared
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
|
||||
use_cut_cross_entropy: true
|
||||
|
||||
load_in_4bit: true
|
||||
quantize_moe_experts: true
|
||||
adapter: qlora
|
||||
@@ -36,11 +44,12 @@ lora_target_modules:
|
||||
- k_proj
|
||||
- v_proj
|
||||
- o_proj
|
||||
# To also train MoE expert weights, add them via lora_target_parameters
|
||||
# (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
|
||||
# lora_target_parameters:
|
||||
# - up_proj
|
||||
# - down_proj
|
||||
|
||||
# To also train MoE expert weights, add them via lora_target_parameters
|
||||
# (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
|
||||
# lora_target_parameters:
|
||||
# - up_proj
|
||||
# - down_proj
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
|
||||
@@ -12,7 +12,7 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
|
||||
|
||||
3. Install FLA for improved performance
|
||||
```bash
|
||||
pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1
|
||||
uv pip uninstall causal-conv1d && uv pip install flash-linear-attention==0.4.1
|
||||
```
|
||||
|
||||
4. Run the finetuning example:
|
||||
|
||||
@@ -26,8 +26,8 @@ sample_packing: true
|
||||
|
||||
# Freeze vision encoder
|
||||
unfrozen_parameters:
|
||||
- model\.language_model\..*
|
||||
- lm_head\..*
|
||||
- model.language_model.*
|
||||
- lm_head.*
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
|
||||
62
examples/qwen3.5/35b-a3b-moe-vision-lora.yaml
Normal file
62
examples/qwen3.5/35b-a3b-moe-vision-lora.yaml
Normal file
@@ -0,0 +1,62 @@
|
||||
# Qwen 3.5 35B-A3B MoE Vision LoRA
|
||||
#
|
||||
# Vision fine-tuning of the hybrid DeltaNet + Attention MoE model.
|
||||
# 256 experts, 8 active per token, with early-fusion vision support.
|
||||
|
||||
base_model: Qwen/Qwen3.5-35B-A3B
|
||||
processor_type: AutoProcessor
|
||||
|
||||
# Required for vision/multimodal training
|
||||
skip_prepare_dataset: true
|
||||
remove_unused_columns: false
|
||||
sample_packing: false
|
||||
|
||||
chat_template: qwen3_5
|
||||
datasets:
|
||||
- path: HuggingFaceH4/llava-instruct-mix-vsft
|
||||
type: chat_template
|
||||
split: train[:100]
|
||||
|
||||
val_set_size: 0
|
||||
output_dir: ./outputs/qwen35-35b-a3b-vision-lora
|
||||
|
||||
adapter: lora
|
||||
sequence_len: 4096
|
||||
pad_to_sequence_len: false
|
||||
|
||||
lora_r: 16
|
||||
lora_alpha: 32
|
||||
lora_dropout: 0
|
||||
lora_target_modules:
|
||||
- q_proj
|
||||
- k_proj
|
||||
- v_proj
|
||||
- o_proj
|
||||
- down_proj
|
||||
- up_proj
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
max_steps: 10
|
||||
optimizer: adamw_torch_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
bf16: auto
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_ratio: 0.1
|
||||
weight_decay: 0.0
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
@@ -10,7 +10,7 @@
|
||||
|
||||
3. Install FLA for sample packing support with the Gated DeltaNet linear attention layers:
|
||||
```bash
|
||||
pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1
|
||||
uv pip uninstall causal-conv1d && uv pip install flash-linear-attention==0.4.1
|
||||
```
|
||||
> FLA is required when `sample_packing: true`. Without it, training raises a `RuntimeError` on packed sequences. Vision configs use `sample_packing: false` so FLA is optional there.
|
||||
|
||||
|
||||
@@ -11,8 +11,7 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
|
||||
Here is an example of how to install from pip:
|
||||
```bash
|
||||
# Ensure you have a compatible version of Pytorch installed
|
||||
pip3 install packaging setuptools wheel ninja
|
||||
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
||||
uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
||||
|
||||
# Install Cut Cross Entropy
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
|
||||
@@ -13,14 +13,13 @@ This guide shows how to fine-tune SmolVLM2 models with Axolotl.
|
||||
Here is an example of how to install from pip:
|
||||
```bash
|
||||
# Ensure you have a compatible version of Pytorch installed
|
||||
pip3 install packaging setuptools wheel ninja
|
||||
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
||||
uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
||||
```
|
||||
|
||||
2. Install an extra dependency:
|
||||
|
||||
```bash
|
||||
pip3 install num2words==0.5.14
|
||||
uv pip install num2words==0.5.14
|
||||
```
|
||||
|
||||
3. Run the finetuning example:
|
||||
|
||||
@@ -12,16 +12,15 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
|
||||
|
||||
```bash
|
||||
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
|
||||
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
|
||||
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
||||
uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
|
||||
```
|
||||
|
||||
2. Please install the below.
|
||||
|
||||
```bash
|
||||
# audio
|
||||
pip3 install librosa==0.11.0
|
||||
pip3 install 'mistral_common[audio]==1.8.3'
|
||||
uv pip install librosa==0.11.0
|
||||
uv pip install 'mistral_common[audio]==1.8.3'
|
||||
|
||||
# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
|
||||
203
pyproject.toml
203
pyproject.toml
@@ -1,15 +1,165 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=64", "wheel", "setuptools_scm>=8", "packaging==26.0"]
|
||||
requires = ["setuptools>=64", "wheel", "setuptools_scm>=8"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "axolotl"
|
||||
dynamic = ["version", "dependencies", "optional-dependencies"]
|
||||
dynamic = ["version"]
|
||||
description = "LLM Trainer"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
# license = "Apache-2.0"
|
||||
|
||||
dependencies = [
|
||||
# Core ML stack
|
||||
"torch>=2.6.0",
|
||||
"packaging==26.0",
|
||||
"huggingface_hub>=1.1.7",
|
||||
"peft>=0.19.1,<0.20.0",
|
||||
"tokenizers>=0.22.1",
|
||||
"transformers==5.5.4",
|
||||
"accelerate==1.13.0",
|
||||
"datasets>=4.8.4,<4.9.0",
|
||||
"trl==1.1.0",
|
||||
"hf_xet==1.4.3",
|
||||
"kernels==0.13.0",
|
||||
"trackio>=0.16.1",
|
||||
"typing-extensions>=4.15.0",
|
||||
"optimum==1.16.2",
|
||||
"hf_transfer",
|
||||
"sentencepiece",
|
||||
"gradio>=6.2.0,<7.0",
|
||||
"modal==1.3.0.post1",
|
||||
"pydantic>=2.10.6",
|
||||
"addict",
|
||||
"fire",
|
||||
"PyYAML>=6.0",
|
||||
"requests",
|
||||
"wandb",
|
||||
"einops",
|
||||
"colorama",
|
||||
"numba>=0.61.2",
|
||||
"numpy>=2.2.6",
|
||||
|
||||
# Evaluation & metrics
|
||||
"evaluate==0.4.1",
|
||||
"scipy",
|
||||
"nvidia-ml-py==12.560.30",
|
||||
"art",
|
||||
"tensorboard",
|
||||
"python-dotenv==1.0.1",
|
||||
|
||||
# Remote filesystems
|
||||
"s3fs>=2024.5.0",
|
||||
"gcsfs>=2025.3.0",
|
||||
"adlfs>=2024.5.0",
|
||||
"ocifs==1.3.2",
|
||||
|
||||
"zstandard==0.22.0",
|
||||
"fastcore",
|
||||
|
||||
# lm eval harness
|
||||
"lm_eval==0.4.11",
|
||||
"langdetect==1.0.9",
|
||||
"immutabledict==4.2.0",
|
||||
"antlr4-python3-runtime==4.13.2",
|
||||
|
||||
"schedulefree==1.4.1",
|
||||
"openenv-core==0.1.0",
|
||||
|
||||
# Axolotl contribs
|
||||
"axolotl-contribs-lgpl==0.0.7",
|
||||
"axolotl-contribs-mit==0.0.6",
|
||||
|
||||
# Telemetry
|
||||
"posthog==6.7.11",
|
||||
|
||||
"mistral-common==1.11.0",
|
||||
|
||||
# Platform-specific (Linux only)
|
||||
"bitsandbytes==0.49.1 ; sys_platform != 'darwin'",
|
||||
"triton>=3.4.0 ; sys_platform != 'darwin'",
|
||||
"xformers>=0.0.23.post1 ; sys_platform != 'darwin'",
|
||||
"liger-kernel==0.7.0 ; sys_platform != 'darwin'",
|
||||
"torchao==0.17.0 ; sys_platform != 'darwin' and platform_machine != 'aarch64'",
|
||||
|
||||
# Architecture-specific
|
||||
"fla-core==0.4.1 ; platform_machine != 'aarch64'",
|
||||
"flash-linear-attention==0.4.1 ; platform_machine != 'aarch64'",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
flash-attn = ["flash-attn==2.8.3"]
|
||||
ring-flash-attn = [
|
||||
"flash-attn==2.8.3",
|
||||
"ring-flash-attn>=0.1.7",
|
||||
]
|
||||
deepspeed = [
|
||||
"deepspeed>=0.18.6,<0.19.0",
|
||||
"deepspeed-kernels",
|
||||
]
|
||||
mamba-ssm = [
|
||||
"mamba-ssm==1.2.0.post1",
|
||||
"causal_conv1d",
|
||||
]
|
||||
auto-gptq = [
|
||||
"auto-gptq==0.5.1",
|
||||
]
|
||||
mlflow = [
|
||||
"mlflow",
|
||||
]
|
||||
galore = [
|
||||
"galore_torch",
|
||||
]
|
||||
apollo = [
|
||||
"apollo-torch",
|
||||
]
|
||||
optimizers = [
|
||||
"galore_torch",
|
||||
"apollo-torch",
|
||||
"lomo-optim==0.1.1",
|
||||
"torch-optimi==0.2.1",
|
||||
"came_pytorch==0.1.3",
|
||||
]
|
||||
ray = [
|
||||
"ray[train]>=2.52.1",
|
||||
]
|
||||
vllm = [
|
||||
"vllm>=0.15.0",
|
||||
]
|
||||
llmcompressor = [
|
||||
"llmcompressor>=0.10.0",
|
||||
]
|
||||
fbgemm-gpu = ["fbgemm-gpu-genai>=1.3.0"]
|
||||
opentelemetry = [
|
||||
"opentelemetry-api",
|
||||
"opentelemetry-sdk",
|
||||
"opentelemetry-exporter-prometheus",
|
||||
"prometheus-client",
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"black",
|
||||
"mypy",
|
||||
"pre-commit",
|
||||
"types-requests",
|
||||
"quartodoc",
|
||||
"jupyter",
|
||||
"blobfile",
|
||||
"tiktoken",
|
||||
]
|
||||
test = [
|
||||
"codecov",
|
||||
"codecov-cli",
|
||||
"pytest",
|
||||
"pytest-cov",
|
||||
"pytest-retry",
|
||||
"pytest-sugar",
|
||||
"pytest-xdist",
|
||||
"tbparse",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
axolotl = "axolotl.cli.main:main"
|
||||
|
||||
@@ -18,18 +168,15 @@ Homepage = "https://axolotl.ai/"
|
||||
Documentation = "https://docs.axolotl.ai/"
|
||||
Repository = "https://github.com/axolotl-ai-cloud/axolotl.git"
|
||||
|
||||
[tool.setuptools_scm]
|
||||
|
||||
[tool.setuptools]
|
||||
py-modules = ["setuptools_axolotl_dynamic_dependencies"]
|
||||
include-package-data = true
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
where = ["src"]
|
||||
|
||||
[tool.setuptools.dynamic]
|
||||
version = { file = "VERSION" }
|
||||
|
||||
[tool.setuptools.cmdclass]
|
||||
build_py = "setuptools_axolotl_dynamic_dependencies.BuildPyCommand"
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 88
|
||||
target-version = "py310"
|
||||
@@ -67,5 +214,43 @@ markers = [
|
||||
"slow: marks tests as slow",
|
||||
]
|
||||
|
||||
# UV specific configuration
|
||||
[tool.uv]
|
||||
prerelease = "allow"
|
||||
conflicts = [
|
||||
[
|
||||
{ package = "axolotl" },
|
||||
{ extra = "vllm" },
|
||||
],
|
||||
[
|
||||
{ package = "axolotl" },
|
||||
{ extra = "flash-attn" },
|
||||
],
|
||||
[
|
||||
{ package = "axolotl" },
|
||||
{ extra = "ring-flash-attn" },
|
||||
],
|
||||
[
|
||||
{ package = "axolotl" },
|
||||
{ extra = "mamba-ssm" },
|
||||
],
|
||||
[
|
||||
{ package = "axolotl" },
|
||||
{ extra = "auto-gptq" },
|
||||
],
|
||||
[
|
||||
{ package = "axolotl" },
|
||||
{ extra = "fbgemm-gpu" },
|
||||
],
|
||||
[
|
||||
{ package = "axolotl" },
|
||||
{ extra = "llmcompressor" },
|
||||
],
|
||||
]
|
||||
|
||||
[tool.uv.extra-build-dependencies]
|
||||
axolotl = ["huggingface_hub"]
|
||||
mamba-ssm = [{ requirement = "torch", match-runtime = true }]
|
||||
causal-conv1d = [{ requirement = "torch", match-runtime = true }]
|
||||
flash-attn = [{ requirement = "torch", match-runtime = true }]
|
||||
deepspeed = [{ requirement = "torch", match-runtime = true }]
|
||||
auto-gptq = [{ requirement = "torch", match-runtime = true }]
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
black
|
||||
mypy
|
||||
pre-commit
|
||||
types-requests
|
||||
quartodoc
|
||||
jupyter
|
||||
blobfile
|
||||
tiktoken
|
||||
@@ -1,8 +0,0 @@
|
||||
codecov
|
||||
codecov-cli
|
||||
pytest
|
||||
pytest-cov
|
||||
pytest-retry
|
||||
pytest-sugar
|
||||
pytest-xdist
|
||||
tbparse
|
||||
@@ -1,78 +0,0 @@
|
||||
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
|
||||
|
||||
# START section of dependencies that don't install on Darwin/MacOS
|
||||
bitsandbytes==0.49.1
|
||||
triton>=3.4.0
|
||||
mamba-ssm==1.2.0.post1
|
||||
xformers>=0.0.23.post1
|
||||
liger-kernel==0.7.0
|
||||
# END section
|
||||
|
||||
packaging==26.0
|
||||
huggingface_hub>=1.1.7
|
||||
peft>=0.18.1
|
||||
tokenizers>=0.22.1
|
||||
transformers==5.5.0
|
||||
accelerate==1.13.0
|
||||
datasets==4.5.0
|
||||
deepspeed>=0.18.6,<0.19.0
|
||||
trl==0.29.0
|
||||
hf_xet==1.3.2
|
||||
kernels==0.12.2
|
||||
|
||||
fla-core==0.4.1
|
||||
flash-linear-attention==0.4.1
|
||||
|
||||
trackio>=0.16.1
|
||||
typing-extensions>=4.15.0
|
||||
|
||||
optimum==1.16.2
|
||||
hf_transfer
|
||||
sentencepiece
|
||||
gradio>=6.2.0,<7.0
|
||||
|
||||
modal==1.3.0.post1
|
||||
pydantic>=2.10.6
|
||||
addict
|
||||
fire
|
||||
PyYAML>=6.0
|
||||
requests
|
||||
wandb
|
||||
einops
|
||||
colorama
|
||||
numba>=0.61.2
|
||||
numpy>=2.2.6
|
||||
|
||||
# qlora things
|
||||
evaluate==0.4.1
|
||||
scipy
|
||||
nvidia-ml-py==12.560.30
|
||||
art
|
||||
tensorboard
|
||||
python-dotenv==1.0.1
|
||||
|
||||
# remote filesystems
|
||||
s3fs>=2024.5.0
|
||||
gcsfs>=2025.3.0
|
||||
adlfs>=2024.5.0
|
||||
ocifs==1.3.2
|
||||
|
||||
zstandard==0.22.0
|
||||
fastcore
|
||||
|
||||
# lm eval harness
|
||||
lm_eval==0.4.11
|
||||
langdetect==1.0.9
|
||||
immutabledict==4.2.0
|
||||
antlr4-python3-runtime==4.13.2
|
||||
|
||||
torchao==0.17.0
|
||||
openenv-core==0.1.0
|
||||
schedulefree==1.4.1
|
||||
|
||||
axolotl-contribs-lgpl==0.0.7
|
||||
axolotl-contribs-mit==0.0.6
|
||||
# telemetry
|
||||
posthog==6.7.11
|
||||
|
||||
mistral-common==1.11.0
|
||||
1518
scripts/analyze_profile.py
Normal file
1518
scripts/analyze_profile.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else ""
|
||||
|
||||
print(
|
||||
UNINSTALL_PREFIX
|
||||
+ f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@63b15e6"'
|
||||
+ f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@fec1a88"'
|
||||
)
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
# noqa
|
||||
import sys
|
||||
|
||||
try:
|
||||
import torch
|
||||
except ImportError as error:
|
||||
raise ImportError("Install torch via `pip install torch`") from error
|
||||
from packaging.version import Version as V
|
||||
|
||||
use_uv = "--uv" in sys.argv[1:]
|
||||
|
||||
v = V(torch.__version__)
|
||||
cuda = str(torch.version.cuda)
|
||||
try:
|
||||
is_ampere = torch.cuda.get_device_capability()[0] >= 8
|
||||
except RuntimeError:
|
||||
is_ampere = False
|
||||
if cuda != "12.1" and cuda != "11.8" and cuda != "12.4":
|
||||
raise RuntimeError(f"CUDA = {cuda} not supported!")
|
||||
if v <= V("2.1.0"):
|
||||
raise RuntimeError(f"Torch = {v} too old!")
|
||||
elif v <= V("2.1.1"):
|
||||
x = "cu{}{}-torch211"
|
||||
elif v <= V("2.1.2"):
|
||||
x = "cu{}{}-torch212"
|
||||
elif v < V("2.3.0"):
|
||||
x = "cu{}{}-torch220"
|
||||
elif v < V("2.4.0"):
|
||||
x = "cu{}{}-torch230"
|
||||
elif v < V("2.5.0"):
|
||||
x = "cu{}{}-torch240"
|
||||
elif v < V("2.6.0"):
|
||||
x = "cu{}{}-torch250"
|
||||
else:
|
||||
raise RuntimeError(f"Torch = {v} too new!")
|
||||
x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
|
||||
uv_prefix = "uv " if use_uv else ""
|
||||
print(
|
||||
f'{uv_prefix}pip install unsloth-zoo==2024.12.1 && {uv_prefix}pip install --no-deps "unsloth[{x}]==2024.12.4"'
|
||||
)
|
||||
230
setup.py
230
setup.py
@@ -1,230 +0,0 @@
|
||||
"""setup.py for axolotl"""
|
||||
|
||||
import os
|
||||
import platform
|
||||
import re
|
||||
from importlib.metadata import PackageNotFoundError, version
|
||||
from pathlib import Path
|
||||
|
||||
from setuptools import find_packages, setup
|
||||
|
||||
|
||||
def parse_requirements(extras_require_map):
|
||||
_install_requires = []
|
||||
_dependency_links = []
|
||||
with open("./requirements.txt", encoding="utf-8") as requirements_file:
|
||||
lines = [r.strip() for r in requirements_file.readlines()]
|
||||
for line in lines:
|
||||
is_extras = "deepspeed" in line or "mamba-ssm" in line
|
||||
if line.startswith("--extra-index-url"):
|
||||
# Handle custom index URLs
|
||||
_, url = line.split()
|
||||
_dependency_links.append(url)
|
||||
elif not is_extras and line and line[0] != "#":
|
||||
# Handle standard packages
|
||||
_install_requires.append(line)
|
||||
try:
|
||||
xformers_version = [req for req in _install_requires if "xformers" in req][0]
|
||||
install_xformers = platform.machine() != "aarch64"
|
||||
if platform.machine() == "aarch64":
|
||||
# skip on ARM64
|
||||
skip_packages = [
|
||||
"torchao",
|
||||
"fla-core",
|
||||
"flash-linear-attention",
|
||||
]
|
||||
_install_requires = [
|
||||
req
|
||||
for req in _install_requires
|
||||
if re.split(r"[>=<]", req)[0].strip() not in skip_packages
|
||||
]
|
||||
if "Darwin" in platform.system():
|
||||
# skip packages not compatible with OSX
|
||||
skip_packages = [
|
||||
"bitsandbytes",
|
||||
"triton",
|
||||
"mamba-ssm",
|
||||
"xformers",
|
||||
"liger-kernel",
|
||||
]
|
||||
_install_requires = [
|
||||
req
|
||||
for req in _install_requires
|
||||
if re.split(r"[>=<]", req)[0].strip() not in skip_packages
|
||||
]
|
||||
print(
|
||||
_install_requires, [req in skip_packages for req in _install_requires]
|
||||
)
|
||||
else:
|
||||
# detect the version of torch already installed
|
||||
# and set it so dependencies don't clobber the torch version
|
||||
try:
|
||||
torch_version = version("torch")
|
||||
except PackageNotFoundError:
|
||||
torch_version = "2.8.0" # default to torch 2.8.0
|
||||
_install_requires.append(f"torch=={torch_version}")
|
||||
|
||||
version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version)
|
||||
if version_match:
|
||||
major, minor, patch = version_match.groups()
|
||||
major, minor = int(major), int(minor)
|
||||
patch = (
|
||||
int(patch) if patch is not None else 0
|
||||
) # Default patch to 0 if not present
|
||||
else:
|
||||
raise ValueError("Invalid version format")
|
||||
|
||||
torch_parts = torch_version.split("+")
|
||||
if len(torch_parts) == 2:
|
||||
torch_cuda_version = torch_parts[1]
|
||||
_dependency_links.append(
|
||||
f"https://download.pytorch.org/whl/{torch_cuda_version}"
|
||||
)
|
||||
|
||||
if (major, minor) >= (2, 10):
|
||||
extras_require_map.pop("fbgemm-gpu")
|
||||
extras_require_map["fbgemm-gpu"] = [
|
||||
"fbgemm-gpu==1.5.0",
|
||||
"fbgemm-gpu-genai==1.5.0",
|
||||
]
|
||||
if not install_xformers:
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
extras_require_map["vllm"] = ["vllm>=0.17.1"]
|
||||
elif (major, minor) >= (2, 9):
|
||||
extras_require_map.pop("fbgemm-gpu")
|
||||
extras_require_map["fbgemm-gpu"] = [
|
||||
"fbgemm-gpu==1.4.0",
|
||||
"fbgemm-gpu-genai==1.4.2",
|
||||
]
|
||||
if not install_xformers:
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
if patch == 0:
|
||||
extras_require_map["vllm"] = ["vllm==0.13.0"]
|
||||
else:
|
||||
extras_require_map["vllm"] = ["vllm==0.14.0"]
|
||||
elif (major, minor) >= (2, 8):
|
||||
extras_require_map.pop("fbgemm-gpu")
|
||||
extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.3.0"]
|
||||
extras_require_map["vllm"] = ["vllm==0.11.0"]
|
||||
if not install_xformers:
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
elif (major, minor) >= (2, 7):
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
if patch == 0:
|
||||
if install_xformers:
|
||||
_install_requires.append("xformers==0.0.30")
|
||||
# vllm 0.9.x is incompatible with latest transformers
|
||||
extras_require_map.pop("vllm")
|
||||
else:
|
||||
if install_xformers:
|
||||
_install_requires.append("xformers==0.0.31")
|
||||
extras_require_map["vllm"] = ["vllm==0.10.1"]
|
||||
elif (major, minor) >= (2, 6):
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
if install_xformers:
|
||||
_install_requires.append("xformers==0.0.29.post3")
|
||||
# since we only support 2.6.0+cu126
|
||||
_dependency_links.append("https://download.pytorch.org/whl/cu126")
|
||||
extras_require_map.pop("vllm")
|
||||
elif (major, minor) >= (2, 5):
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
if install_xformers:
|
||||
if patch == 0:
|
||||
_install_requires.append("xformers==0.0.28.post2")
|
||||
else:
|
||||
_install_requires.append("xformers>=0.0.28.post3")
|
||||
extras_require_map.pop("vllm")
|
||||
elif (major, minor) >= (2, 4):
|
||||
extras_require_map.pop("vllm")
|
||||
if install_xformers:
|
||||
if patch == 0:
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
_install_requires.append("xformers>=0.0.27")
|
||||
else:
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
_install_requires.append("xformers==0.0.28.post1")
|
||||
else:
|
||||
raise ValueError("axolotl requires torch>=2.4")
|
||||
|
||||
except PackageNotFoundError:
|
||||
pass
|
||||
return _install_requires, _dependency_links, extras_require_map
|
||||
|
||||
|
||||
def get_package_version():
|
||||
with open(
|
||||
Path(os.path.dirname(os.path.abspath(__file__))) / "VERSION",
|
||||
"r",
|
||||
encoding="utf-8",
|
||||
) as fin:
|
||||
version_ = fin.read().strip()
|
||||
return version_
|
||||
|
||||
|
||||
extras_require = {
|
||||
"flash-attn": ["flash-attn==2.8.3"],
|
||||
"ring-flash-attn": [
|
||||
"flash-attn==2.8.3",
|
||||
"ring-flash-attn>=0.1.7",
|
||||
],
|
||||
"deepspeed": [
|
||||
"deepspeed==0.18.2",
|
||||
"deepspeed-kernels",
|
||||
],
|
||||
"mamba-ssm": [
|
||||
"mamba-ssm==1.2.0.post1",
|
||||
"causal_conv1d",
|
||||
],
|
||||
"auto-gptq": [
|
||||
"auto-gptq==0.5.1",
|
||||
],
|
||||
"mlflow": [
|
||||
"mlflow",
|
||||
],
|
||||
"galore": [
|
||||
"galore_torch",
|
||||
],
|
||||
"apollo": [
|
||||
"apollo-torch",
|
||||
],
|
||||
"optimizers": [
|
||||
"galore_torch",
|
||||
"apollo-torch",
|
||||
"lomo-optim==0.1.1",
|
||||
"torch-optimi==0.2.1",
|
||||
"came_pytorch==0.1.3",
|
||||
],
|
||||
"ray": [
|
||||
"ray[train]>=2.52.1",
|
||||
],
|
||||
"vllm": [
|
||||
"vllm==0.10.0",
|
||||
],
|
||||
"llmcompressor": [
|
||||
"llmcompressor==0.5.1",
|
||||
],
|
||||
"fbgemm-gpu": ["fbgemm-gpu-genai==1.3.0"],
|
||||
"opentelemetry": [
|
||||
"opentelemetry-api",
|
||||
"opentelemetry-sdk",
|
||||
"opentelemetry-exporter-prometheus",
|
||||
"prometheus-client",
|
||||
],
|
||||
}
|
||||
install_requires, dependency_links, extras_require_build = parse_requirements(
|
||||
extras_require
|
||||
)
|
||||
|
||||
setup(
|
||||
version=get_package_version(),
|
||||
package_dir={"": "src"},
|
||||
packages=find_packages("src"),
|
||||
install_requires=install_requires,
|
||||
dependency_links=dependency_links,
|
||||
entry_points={
|
||||
"console_scripts": [
|
||||
"axolotl=axolotl.cli.main:main",
|
||||
],
|
||||
},
|
||||
extras_require=extras_require_build,
|
||||
)
|
||||
108
src/axolotl/cli/agent_docs/__init__.py
Normal file
108
src/axolotl/cli/agent_docs/__init__.py
Normal file
@@ -0,0 +1,108 @@
|
||||
"""Bundled agent documentation for axolotl.
|
||||
|
||||
These docs are optimized for consumption by AI coding agents.
|
||||
The source of truth is docs/agents/*.md and AGENTS.md in the repo root.
|
||||
This module resolves those paths at runtime — no files are duplicated
|
||||
into the package.
|
||||
|
||||
For pip-only installs (no repo checkout), run `axolotl fetch docs` first
|
||||
to download the docs locally.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
# Topic name -> (filename in docs/agents/, fallback filename for AGENTS.md)
|
||||
TOPICS = {
|
||||
"overview": "AGENTS.md",
|
||||
"sft": "docs/agents/sft.md",
|
||||
"grpo": "docs/agents/grpo.md",
|
||||
"preference_tuning": "docs/agents/preference_tuning.md",
|
||||
"reward_modelling": "docs/agents/reward_modelling.md",
|
||||
"pretraining": "docs/agents/pretraining.md",
|
||||
"model_architectures": "docs/agents/model_architectures.md",
|
||||
"new_model_support": "docs/agents/new_model_support.md",
|
||||
}
|
||||
|
||||
|
||||
def _find_repo_root() -> Path | None:
|
||||
"""Walk up from this file to find the repo root (contains AGENTS.md)."""
|
||||
# In an editable install or repo checkout, walk up from
|
||||
# src/axolotl/cli/agent_docs/ to find the repo root
|
||||
current = Path(__file__).resolve().parent
|
||||
while current != current.parent:
|
||||
if (current / "AGENTS.md").exists() and (current / "docs" / "agents").is_dir():
|
||||
return current
|
||||
current = current.parent
|
||||
return None
|
||||
|
||||
|
||||
def _find_docs_dir() -> Path | None:
|
||||
"""Find a fetched docs directory (from `axolotl fetch docs`)."""
|
||||
# axolotl fetch docs --dest defaults to ./docs/ in cwd
|
||||
cwd_docs = Path.cwd() / "docs" / "agents"
|
||||
if cwd_docs.is_dir():
|
||||
return Path.cwd()
|
||||
return None
|
||||
|
||||
|
||||
def _resolve_path(topic: str) -> Path:
|
||||
"""Resolve a topic name to the actual file path."""
|
||||
if topic not in TOPICS:
|
||||
available = ", ".join(sorted(TOPICS.keys()))
|
||||
raise FileNotFoundError(f"Unknown topic: {topic!r}. Available: {available}")
|
||||
|
||||
relative_path = TOPICS[topic]
|
||||
|
||||
# Try repo root first (editable install / repo checkout)
|
||||
repo_root = _find_repo_root()
|
||||
if repo_root:
|
||||
candidate = repo_root / relative_path
|
||||
if candidate.exists():
|
||||
return candidate
|
||||
|
||||
# Try cwd (fetched docs via `axolotl fetch docs`)
|
||||
docs_root = _find_docs_dir()
|
||||
if docs_root:
|
||||
candidate = docs_root / relative_path
|
||||
if candidate.exists():
|
||||
return candidate
|
||||
|
||||
# Also check cwd directly for AGENTS.md
|
||||
if topic == "overview":
|
||||
cwd_agents = Path.cwd() / "AGENTS.md"
|
||||
if cwd_agents.exists():
|
||||
return cwd_agents
|
||||
|
||||
raise FileNotFoundError(
|
||||
f"Could not find {relative_path!r}. "
|
||||
f"If you installed axolotl via pip, run `axolotl fetch docs` first "
|
||||
f"to download the documentation."
|
||||
)
|
||||
|
||||
|
||||
def get_doc(topic: str = "overview") -> str:
|
||||
"""Return the content of an agent doc by topic name.
|
||||
|
||||
Args:
|
||||
topic: One of the keys in TOPICS, or "overview" (default).
|
||||
|
||||
Returns:
|
||||
The markdown content of the doc.
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If the topic can't be found.
|
||||
"""
|
||||
return _resolve_path(topic).read_text()
|
||||
|
||||
|
||||
def list_topics() -> dict[str, str]:
|
||||
"""Return a dict of topic name -> first line (title) of each doc."""
|
||||
result = {}
|
||||
for topic in sorted(TOPICS.keys()):
|
||||
try:
|
||||
path = _resolve_path(topic)
|
||||
first_line = path.read_text().split("\n", 1)[0].lstrip("# ").strip()
|
||||
result[topic] = first_line
|
||||
except FileNotFoundError:
|
||||
result[topic] = "(not found — run `axolotl fetch docs`)"
|
||||
return result
|
||||
@@ -294,7 +294,9 @@ def merge_lora(config: str, **kwargs):
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.argument("directory", type=click.Choice(["examples", "deepspeed_configs"]))
|
||||
@click.argument(
|
||||
"directory", type=click.Choice(["examples", "deepspeed_configs", "docs"])
|
||||
)
|
||||
@click.option("--dest", help="Destination directory")
|
||||
def fetch(directory: str, dest: Optional[str]):
|
||||
"""
|
||||
@@ -303,9 +305,10 @@ def fetch(directory: str, dest: Optional[str]):
|
||||
Available directories:
|
||||
- examples: Example configuration files
|
||||
- deepspeed_configs: DeepSpeed configuration files
|
||||
- docs: Full documentation (Quarto markdown files)
|
||||
|
||||
Args:
|
||||
directory: One of `examples`, `deepspeed_configs`.
|
||||
directory: One of `examples`, `deepspeed_configs`, `docs`.
|
||||
dest: Optional destination directory.
|
||||
"""
|
||||
fetch_from_github(f"{directory}/", dest)
|
||||
@@ -340,6 +343,112 @@ def delinearize_llama4(model: str, output: str):
|
||||
do_delinearize_llama4(model, output)
|
||||
|
||||
|
||||
@cli.command("agent-docs")
|
||||
@click.argument("topic", required=False, default=None)
|
||||
@click.option("--list", "list_topics", is_flag=True, help="List available topics")
|
||||
def agent_docs(topic: Optional[str], list_topics: bool):
|
||||
"""Show agent-optimized documentation.
|
||||
|
||||
Prints reference docs designed for AI coding agents.
|
||||
These docs are bundled with the package — no network access needed.
|
||||
|
||||
\b
|
||||
Examples:
|
||||
axolotl agent-docs # overview (start here)
|
||||
axolotl agent-docs grpo # GRPO reference
|
||||
axolotl agent-docs sft # SFT reference
|
||||
axolotl agent-docs --list # list all topics
|
||||
"""
|
||||
from axolotl.cli.agent_docs import get_doc, list_topics as _list_topics
|
||||
|
||||
if list_topics:
|
||||
for name, title in _list_topics().items():
|
||||
click.echo(f" {name:25s} {title}")
|
||||
return
|
||||
|
||||
if topic is None:
|
||||
topic = "overview"
|
||||
|
||||
try:
|
||||
click.echo(get_doc(topic))
|
||||
except FileNotFoundError as exc:
|
||||
raise click.BadParameter(str(exc)) from exc
|
||||
|
||||
|
||||
@cli.command("config-schema")
|
||||
@click.option(
|
||||
"--format",
|
||||
"output_format",
|
||||
type=click.Choice(["json", "yaml"]),
|
||||
default="json",
|
||||
help="Output format (default: json)",
|
||||
)
|
||||
@click.option("--field", help="Show schema for a specific field only")
|
||||
def config_schema(output_format: str, field: Optional[str]):
|
||||
"""Dump the full config JSON schema.
|
||||
|
||||
Useful for AI agents and tooling to discover all available config options,
|
||||
their types, defaults, and descriptions.
|
||||
|
||||
\b
|
||||
Examples:
|
||||
axolotl config-schema # full JSON schema
|
||||
axolotl config-schema --format yaml # YAML format
|
||||
axolotl config-schema --field adapter # single field
|
||||
"""
|
||||
import json
|
||||
|
||||
try:
|
||||
schema = AxolotlInputConfig.model_json_schema()
|
||||
except (TypeError, ValueError, AttributeError) as exc:
|
||||
# Fallback: dump field names, types, and defaults when full schema
|
||||
# generation fails (e.g. torch.dtype not JSON-serializable)
|
||||
LOG.warning(
|
||||
"Full JSON schema generation failed, using simplified fallback: %s", exc
|
||||
)
|
||||
fields = {}
|
||||
for name, field_info in AxolotlInputConfig.model_fields.items():
|
||||
entry = {}
|
||||
if field_info.description:
|
||||
entry["description"] = field_info.description
|
||||
if field_info.default is not None:
|
||||
try:
|
||||
json.dumps(field_info.default)
|
||||
entry["default"] = field_info.default
|
||||
except (TypeError, ValueError):
|
||||
entry["default"] = str(field_info.default)
|
||||
annotation = field_info.annotation
|
||||
if annotation is not None:
|
||||
entry["type"] = str(annotation)
|
||||
fields[name] = entry
|
||||
schema = {
|
||||
"properties": fields,
|
||||
"_note": "simplified schema (full generation failed)",
|
||||
}
|
||||
|
||||
if field:
|
||||
props = schema.get("properties", {})
|
||||
if field not in props:
|
||||
# Try case-insensitive match
|
||||
matches = [k for k in props if k.lower() == field.lower()]
|
||||
if matches:
|
||||
field = matches[0]
|
||||
else:
|
||||
raise click.BadParameter(
|
||||
f"Unknown field: {field!r}. "
|
||||
f"Omit --field to dump the full schema, "
|
||||
f"or pipe to jq: axolotl config-schema | jq '.properties | keys'"
|
||||
)
|
||||
schema = {field: props[field]}
|
||||
|
||||
if output_format == "yaml":
|
||||
import yaml # pylint: disable=import-outside-toplevel
|
||||
|
||||
click.echo(yaml.dump(schema, default_flow_style=False, sort_keys=False))
|
||||
else:
|
||||
click.echo(json.dumps(schema, indent=2))
|
||||
|
||||
|
||||
cli.add_command(lm_eval)
|
||||
|
||||
|
||||
|
||||
@@ -115,6 +115,7 @@ def _do_merge_lora_efficient(*, cfg: DictDefault) -> None:
|
||||
simulate_nf4_experts=simulate_nf4_experts,
|
||||
nf4_blocksize=nf4_blocksize,
|
||||
nf4_double_quant=nf4_double_quant,
|
||||
trust_remote_code=bool(getattr(cfg, "trust_remote_code", False)),
|
||||
)
|
||||
|
||||
LOG.debug("Memory-efficient LoRA merge completed successfully!")
|
||||
|
||||
@@ -17,6 +17,93 @@ from axolotl.utils.logging import get_logger
|
||||
LOG = get_logger(__name__)
|
||||
|
||||
|
||||
def _build_layer_type_map(
|
||||
base_model_path: Path, trust_remote_code: bool = False
|
||||
) -> dict[str, str]:
|
||||
"""Build a map of module_name -> layer_type using a meta-device model.
|
||||
|
||||
Instantiates the model architecture on the meta device (zero memory)
|
||||
to inspect which modules are Linear vs Conv1d/Conv2d/Conv3d.
|
||||
This avoids relying on weight tensor ndim heuristics.
|
||||
"""
|
||||
import json as _json
|
||||
|
||||
import torch.nn as nn
|
||||
from transformers import AutoConfig
|
||||
|
||||
config_path = base_model_path / "config.json"
|
||||
if not config_path.exists():
|
||||
return {}
|
||||
|
||||
try:
|
||||
with open(config_path) as f:
|
||||
model_config = _json.load(f)
|
||||
except (OSError, _json.JSONDecodeError):
|
||||
return {}
|
||||
|
||||
architectures = model_config.get("architectures", [])
|
||||
if not architectures:
|
||||
return {}
|
||||
|
||||
try:
|
||||
config = AutoConfig.from_pretrained(
|
||||
str(base_model_path), trust_remote_code=trust_remote_code
|
||||
)
|
||||
except Exception:
|
||||
LOG.debug("Could not load config for layer type introspection")
|
||||
return {}
|
||||
|
||||
# Determine the right Auto class from architectures
|
||||
from transformers import (
|
||||
AutoModel,
|
||||
AutoModelForCausalLM,
|
||||
)
|
||||
|
||||
auto_classes = [AutoModelForCausalLM, AutoModel]
|
||||
try:
|
||||
from transformers import AutoModelForImageTextToText
|
||||
|
||||
auto_classes.insert(0, AutoModelForImageTextToText)
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
model = None
|
||||
for auto_cls in auto_classes:
|
||||
try:
|
||||
with torch.device("meta"):
|
||||
model = auto_cls.from_config(
|
||||
config, trust_remote_code=trust_remote_code
|
||||
)
|
||||
break
|
||||
except Exception: # noqa: BLE001
|
||||
LOG.debug(
|
||||
"Could not instantiate meta model with %s, trying next",
|
||||
auto_cls.__name__,
|
||||
)
|
||||
|
||||
if model is None:
|
||||
LOG.debug("Could not instantiate meta model for layer type introspection")
|
||||
return {}
|
||||
|
||||
layer_types = {}
|
||||
for name, module in model.named_modules():
|
||||
if isinstance(module, nn.Conv3d):
|
||||
layer_types[name] = "Conv3d"
|
||||
elif isinstance(module, nn.Conv2d):
|
||||
layer_types[name] = "Conv2d"
|
||||
elif isinstance(module, nn.Conv1d):
|
||||
layer_types[name] = "Conv1d"
|
||||
elif isinstance(module, nn.Linear):
|
||||
layer_types[name] = "Linear"
|
||||
|
||||
del model
|
||||
LOG.debug(
|
||||
f"Layer type map: {len(layer_types)} modules "
|
||||
f"({sum(1 for v in layer_types.values() if 'Conv' in v)} conv layers)"
|
||||
)
|
||||
return layer_types
|
||||
|
||||
|
||||
def _simulate_nf4_roundtrip(
|
||||
tensor: torch.Tensor,
|
||||
blocksize: Optional[int] = None,
|
||||
@@ -191,6 +278,7 @@ def _build_peft_layer_and_get_delta(
|
||||
adapter_name: str = "default",
|
||||
is_param_wrapper: bool = False,
|
||||
magnitude: Optional[torch.Tensor] = None,
|
||||
layer_type: Optional[str] = None,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Use PEFT's own layer classes to compute the LoRA delta weight.
|
||||
@@ -211,7 +299,7 @@ def _build_peft_layer_and_get_delta(
|
||||
out_features = lora_b.shape[0]
|
||||
lora_alpha = lora_config_dict.get("lora_alpha", lora_config_dict.get("r", 1))
|
||||
use_rslora = bool(lora_config_dict.get("use_rslora", False))
|
||||
use_dora = bool(lora_config_dict.get("use_dora", False)) and magnitude is not None
|
||||
use_dora = bool(lora_config_dict.get("use_dora", False))
|
||||
|
||||
if is_param_wrapper:
|
||||
from peft.tuners.lora.layer import ParamWrapper
|
||||
@@ -227,18 +315,110 @@ def _build_peft_layer_and_get_delta(
|
||||
"weight", nn.Parameter(base_tensor.clone(), requires_grad=False)
|
||||
)
|
||||
|
||||
# ParamWrapper rejects dropout/fan_in_fan_out/lora_bias/use_dora, so
|
||||
# build a minimal config with only the fields it accepts.
|
||||
pw_config = LoraConfig(
|
||||
r=r,
|
||||
lora_alpha=lora_alpha,
|
||||
lora_dropout=0.0,
|
||||
fan_in_fan_out=False,
|
||||
use_rslora=use_rslora,
|
||||
use_dora=False,
|
||||
lora_bias=False,
|
||||
)
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore", UserWarning)
|
||||
layer = ParamWrapper(
|
||||
fake,
|
||||
adapter_name=adapter_name,
|
||||
parameter_name="weight",
|
||||
config=pw_config,
|
||||
r=r,
|
||||
lora_alpha=lora_alpha,
|
||||
use_rslora=use_rslora,
|
||||
)
|
||||
layer.lora_A[adapter_name].weight.data = lora_a
|
||||
layer.lora_B[adapter_name].weight.data = lora_b
|
||||
delta = layer.get_delta_weight(adapter_name)
|
||||
# peft >=0.19.1 may return delta with transposed dims for 3D params
|
||||
if delta.shape != base_tensor.shape and delta.ndim == 3:
|
||||
delta = delta.transpose(1, 2).contiguous()
|
||||
return delta
|
||||
elif (
|
||||
layer_type and "Conv" in layer_type or (layer_type is None and lora_a.ndim > 2)
|
||||
):
|
||||
# Conv layer detected via model introspection (or ndim fallback)
|
||||
|
||||
from peft.tuners.lora import layer as peft_lora_layer
|
||||
|
||||
# Determine conv type from layer_type map or fall back to ndim
|
||||
if layer_type and "Conv" in layer_type:
|
||||
conv_type: str = layer_type
|
||||
else:
|
||||
ndim = lora_a.ndim
|
||||
_conv_map = {3: "Conv1d", 4: "Conv2d", 5: "Conv3d"}
|
||||
if ndim not in _conv_map:
|
||||
raise ValueError(
|
||||
f"Unsupported LoRA weight dimensionality {ndim} for conv layer"
|
||||
)
|
||||
conv_type = _conv_map[ndim]
|
||||
LOG.warning(
|
||||
f"Using ndim-based fallback for conv detection (ndim={ndim}). "
|
||||
f"Consider providing layer_type from meta-device introspection."
|
||||
)
|
||||
|
||||
conv_cls_map = {"Conv1d": nn.Conv1d, "Conv2d": nn.Conv2d, "Conv3d": nn.Conv3d}
|
||||
ConvCls = conv_cls_map[conv_type]
|
||||
PeftConvCls = getattr(peft_lora_layer, conv_type)
|
||||
|
||||
# Reconstruct conv parameters from base tensor and lora_a shapes
|
||||
# base_tensor: [out_channels, in_channels/groups, *kernel_size]
|
||||
# lora_a: [r, in_channels/groups, *kernel_size]
|
||||
# lora_b: [out_channels, r, *ones]
|
||||
out_channels = base_tensor.shape[0]
|
||||
in_channels = base_tensor.shape[1]
|
||||
kernel_size = tuple(base_tensor.shape[2:])
|
||||
stride = (1,) * (base_tensor.ndim - 2)
|
||||
padding = (0,) * (base_tensor.ndim - 2)
|
||||
|
||||
base_layer = ConvCls(
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=stride,
|
||||
padding=padding,
|
||||
bias=False,
|
||||
)
|
||||
base_layer.weight.data = base_tensor.clone()
|
||||
|
||||
conv_config = LoraConfig(
|
||||
r=r_total,
|
||||
lora_alpha=lora_alpha,
|
||||
use_rslora=use_rslora,
|
||||
use_dora=use_dora,
|
||||
)
|
||||
layer = PeftConvCls(
|
||||
base_layer,
|
||||
adapter_name=adapter_name,
|
||||
config=conv_config,
|
||||
r=r_total,
|
||||
lora_alpha=lora_alpha,
|
||||
)
|
||||
layer.lora_A[adapter_name].weight.data = lora_a
|
||||
layer.lora_B[adapter_name].weight.data = lora_b
|
||||
|
||||
if use_dora:
|
||||
if magnitude is None:
|
||||
raise ValueError(
|
||||
f"DoRA merge requires a magnitude vector but none was found "
|
||||
f"for conv layer (adapter={adapter_name}). Check that the "
|
||||
f"adapter checkpoint contains lora_magnitude_vector weights."
|
||||
)
|
||||
mag_layer = layer.lora_magnitude_vector[adapter_name]
|
||||
mag_layer.weight = nn.Parameter(magnitude)
|
||||
layer.merge(adapter_names=[adapter_name])
|
||||
return base_layer.weight.data - base_tensor
|
||||
|
||||
return layer.get_delta_weight(adapter_name)
|
||||
else:
|
||||
from peft.tuners.lora.layer import Linear as LoraLinear
|
||||
@@ -251,15 +431,20 @@ def _build_peft_layer_and_get_delta(
|
||||
or lora_config_dict.get("lora_fan_in_fan_out", False)
|
||||
)
|
||||
|
||||
layer = LoraLinear(
|
||||
base_layer,
|
||||
adapter_name=adapter_name,
|
||||
linear_config = LoraConfig(
|
||||
r=r_total,
|
||||
lora_alpha=lora_alpha,
|
||||
fan_in_fan_out=fan_in_fan_out,
|
||||
use_rslora=use_rslora,
|
||||
use_dora=use_dora,
|
||||
)
|
||||
layer = LoraLinear(
|
||||
base_layer,
|
||||
adapter_name=adapter_name,
|
||||
config=linear_config,
|
||||
r=r_total,
|
||||
lora_alpha=lora_alpha,
|
||||
)
|
||||
layer.lora_A[adapter_name].weight.data = lora_a
|
||||
layer.lora_B[adapter_name].weight.data = lora_b
|
||||
|
||||
@@ -267,6 +452,12 @@ def _build_peft_layer_and_get_delta(
|
||||
# DoRA merges magnitude normalization into the weight directly.
|
||||
# Use PEFT's merge() which handles DoRA internally, then
|
||||
# compute the delta as merged_weight - original_weight.
|
||||
if magnitude is None:
|
||||
raise ValueError(
|
||||
f"DoRA merge requires a magnitude vector but none was found "
|
||||
f"for linear layer (adapter={adapter_name}). Check that the "
|
||||
f"adapter checkpoint contains lora_magnitude_vector weights."
|
||||
)
|
||||
mag_layer = layer.lora_magnitude_vector[adapter_name]
|
||||
mag_layer.weight = nn.Parameter(magnitude)
|
||||
layer.merge(adapter_names=[adapter_name])
|
||||
@@ -382,6 +573,7 @@ def _merge_tensor_with_lora(
|
||||
nf4_double_quant: bool = True,
|
||||
use_dora: bool = False,
|
||||
weight_renamings: Optional[Dict[str, str]] = None,
|
||||
layer_type_map: Optional[Dict[str, str]] = None,
|
||||
) -> tuple[torch.Tensor, bool]:
|
||||
"""
|
||||
Helper function to merge a single tensor with its corresponding LoRA weights.
|
||||
@@ -426,12 +618,30 @@ def _merge_tensor_with_lora(
|
||||
if use_dora
|
||||
else None
|
||||
)
|
||||
|
||||
# Look up layer type from meta-device model introspection
|
||||
_layer_type = None
|
||||
if layer_type_map:
|
||||
mod_path = key.rsplit(".weight", 1)[0] if key.endswith(".weight") else key
|
||||
_layer_type = layer_type_map.get(mod_path)
|
||||
# Try common prefix variations (e.g. with/without "model." prefix)
|
||||
if _layer_type is None:
|
||||
for prefix in [
|
||||
"model.",
|
||||
"model.language_model.",
|
||||
"model.language_model.model.",
|
||||
]:
|
||||
_layer_type = layer_type_map.get(prefix + mod_path)
|
||||
if _layer_type:
|
||||
break
|
||||
|
||||
delta = _build_peft_layer_and_get_delta(
|
||||
lora_a.to(device),
|
||||
lora_b.to(device),
|
||||
lora_config_dict,
|
||||
tensor.to(device),
|
||||
magnitude=magnitude.to(device) if magnitude is not None else None,
|
||||
layer_type=_layer_type,
|
||||
)
|
||||
merged_tensor = (
|
||||
(tensor.to(device).to(torch.float32) + delta.to(torch.float32))
|
||||
@@ -556,6 +766,7 @@ def _fuse_and_unfuse_with_merge(
|
||||
nf4_double_quant: bool = True,
|
||||
use_dora: bool = False,
|
||||
weight_renamings: Optional[Dict[str, str]] = None,
|
||||
layer_type_map: Optional[Dict[str, str]] = None,
|
||||
) -> tuple[Dict[str, torch.Tensor], int, set]:
|
||||
"""
|
||||
For tensors matching WeightConverter patterns (MoE expert weights):
|
||||
@@ -696,12 +907,32 @@ def _fuse_and_unfuse_with_merge(
|
||||
if use_dora
|
||||
else None
|
||||
)
|
||||
# Look up layer type for the fused key
|
||||
_layer_type = None
|
||||
if layer_type_map:
|
||||
mod_path = (
|
||||
fused_key.rsplit(".weight", 1)[0]
|
||||
if fused_key.endswith(".weight")
|
||||
else fused_key
|
||||
)
|
||||
_layer_type = layer_type_map.get(mod_path)
|
||||
if _layer_type is None:
|
||||
for prefix in [
|
||||
"model.",
|
||||
"model.language_model.",
|
||||
"model.language_model.model.",
|
||||
]:
|
||||
_layer_type = layer_type_map.get(prefix + mod_path)
|
||||
if _layer_type:
|
||||
break
|
||||
|
||||
delta = _build_peft_layer_and_get_delta(
|
||||
lora_a.to(device),
|
||||
lora_b.to(device),
|
||||
lora_config_dict,
|
||||
fused_tensor.to(device),
|
||||
magnitude=magnitude.to(device) if magnitude is not None else None,
|
||||
layer_type=_layer_type,
|
||||
)
|
||||
fused_tensor = (
|
||||
(
|
||||
@@ -740,6 +971,7 @@ def merge_lora_sharded_efficient(
|
||||
simulate_nf4_experts: bool = False,
|
||||
nf4_blocksize: Optional[int] = None,
|
||||
nf4_double_quant: bool = True,
|
||||
trust_remote_code: bool = False,
|
||||
) -> None:
|
||||
"""
|
||||
Memory-efficient LoRA merging that processes shards individually
|
||||
@@ -750,6 +982,8 @@ def merge_lora_sharded_efficient(
|
||||
simulate_nf4_experts: Apply NF4 roundtrip only to MoE expert tensors
|
||||
(for quantize_moe_experts). Expert tensors are identified by having
|
||||
"expert" in the key name and ndim >= 3.
|
||||
trust_remote_code: Whether to trust remote code when loading model
|
||||
config for layer-type introspection. Defaults to False for safety.
|
||||
"""
|
||||
base_model_path = Path(base_model_path)
|
||||
lora_adapter_path = Path(lora_adapter_path)
|
||||
@@ -780,6 +1014,10 @@ def merge_lora_sharded_efficient(
|
||||
|
||||
use_dora = bool(lora_config_dict.get("use_dora", False))
|
||||
|
||||
# Build layer type map via meta-device model introspection
|
||||
layer_type_map = _build_layer_type_map(
|
||||
base_model_path, trust_remote_code=trust_remote_code
|
||||
)
|
||||
unsupported_methods = []
|
||||
|
||||
# Check for AdaLoRA (Adaptive LoRA)
|
||||
@@ -904,6 +1142,7 @@ def merge_lora_sharded_efficient(
|
||||
nf4_double_quant=nf4_double_quant,
|
||||
use_dora=use_dora,
|
||||
weight_renamings=weight_renamings,
|
||||
layer_type_map=layer_type_map,
|
||||
)
|
||||
merged_count += fused_merged
|
||||
|
||||
@@ -926,6 +1165,7 @@ def merge_lora_sharded_efficient(
|
||||
nf4_double_quant=nf4_double_quant,
|
||||
use_dora=use_dora,
|
||||
weight_renamings=weight_renamings,
|
||||
layer_type_map=layer_type_map,
|
||||
)
|
||||
merged_tensors[key] = merged_tensor
|
||||
if was_merged:
|
||||
|
||||
@@ -41,6 +41,7 @@ from axolotl.utils.callbacks import (
|
||||
GCCallback,
|
||||
SaveAxolotlConfigtoWandBCallback,
|
||||
SaveModelOnFirstStepCallback,
|
||||
SkipEvalOnResumeCallback,
|
||||
)
|
||||
from axolotl.utils.callbacks.profiler import PytorchProfilerCallback
|
||||
from axolotl.utils.distributed import build_parallelism_config
|
||||
@@ -118,6 +119,9 @@ class TrainerBuilderBase(abc.ABC):
|
||||
plugin_manager.add_callbacks_pre_trainer(cfg=self.cfg, model=self.model)
|
||||
)
|
||||
|
||||
if self.cfg.resume_from_checkpoint:
|
||||
callbacks.append(SkipEvalOnResumeCallback())
|
||||
|
||||
if self.cfg.gc_steps:
|
||||
callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))
|
||||
|
||||
|
||||
@@ -100,6 +100,27 @@ class AxolotlTrainer(
|
||||
self._signature_columns = None # workaround for pylint
|
||||
|
||||
super().__init__(*_args, **kwargs)
|
||||
|
||||
# Gemma4 (and similar multimodal models) declare **kwargs in forward() for
|
||||
# extra inputs like mm_token_type_ids. HF Trainer interprets VAR_KEYWORD as
|
||||
# "the model handles num_items_in_batch internally" and skips the loss ÷
|
||||
# gradient_accumulation_steps normalisation, which inflates the *logged* loss
|
||||
# (the gradient itself is still correct). Override to False when the model
|
||||
# doesn't actually consume num_items_in_batch.
|
||||
if self.model_accepts_loss_kwargs:
|
||||
model_to_check = self.accelerator.unwrap_model(self.model)
|
||||
if hasattr(model_to_check, "base_model"): # PEFT wrapper
|
||||
model_to_check = model_to_check.base_model
|
||||
if hasattr(model_to_check, "model"):
|
||||
model_to_check = model_to_check.model
|
||||
fwd = getattr(model_to_check, "forward", None)
|
||||
if fwd is not None:
|
||||
import inspect
|
||||
|
||||
params = inspect.signature(fwd).parameters
|
||||
if "num_items_in_batch" not in params:
|
||||
self.model_accepts_loss_kwargs = False
|
||||
|
||||
self.train_data_collator = self.data_collator
|
||||
self._stored_metrics = defaultdict(
|
||||
lambda: defaultdict(lambda: {"values": [], "reduction": "mean"})
|
||||
@@ -383,13 +404,29 @@ class AxolotlTrainer(
|
||||
|
||||
# Gemma4 requires mm_token_type_ids during training (even for text-only).
|
||||
# Inject zeros (= text token type) when not provided by the data collator.
|
||||
# Use unwrap_model to handle DDP/FSDP wrappers that don't proxy .config.
|
||||
_unwrapped = self.accelerator.unwrap_model(model)
|
||||
_model_type = getattr(getattr(_unwrapped, "config", None), "model_type", None)
|
||||
if (
|
||||
"mm_token_type_ids" not in inputs
|
||||
and "input_ids" in inputs
|
||||
and getattr(getattr(model, "config", None), "model_type", None) == "gemma4"
|
||||
and _model_type == "gemma4"
|
||||
):
|
||||
inputs["mm_token_type_ids"] = torch.zeros_like(inputs["input_ids"])
|
||||
|
||||
# Gemma4 (and Gemma3): transformers' masking_utils detects packed sequences
|
||||
# from position_ids, but only when attention_mask is None. When sample
|
||||
# packing is active the collator provides an all-ones attention_mask that
|
||||
# prevents this detection — remove it so the model builds the correct
|
||||
# per-sequence causal masks.
|
||||
if (
|
||||
self.args.sample_packing
|
||||
and _model_type in ("gemma4", "gemma3")
|
||||
and "attention_mask" in inputs
|
||||
and "position_ids" in inputs
|
||||
):
|
||||
del inputs["attention_mask"]
|
||||
|
||||
if self.args.orpo_alpha:
|
||||
return self.orpo_compute_loss(
|
||||
model,
|
||||
@@ -398,6 +435,23 @@ class AxolotlTrainer(
|
||||
num_items_in_batch=num_items_in_batch,
|
||||
)
|
||||
|
||||
# Gemma4ForConditionalGeneration computes loss with a manual
|
||||
# nn.CrossEntropyLoss() that bypasses proper num_items_in_batch
|
||||
# normalization and does redundant attention_mask filtering.
|
||||
# Compute loss externally using the standard loss_function instead.
|
||||
if _model_type == "gemma4" and "labels" in inputs:
|
||||
labels = inputs.pop("labels")
|
||||
outputs = model(**inputs)
|
||||
logits = outputs.logits
|
||||
unwrapped = self.accelerator.unwrap_model(model)
|
||||
vocab_size = unwrapped.config.get_text_config().vocab_size
|
||||
loss = unwrapped.loss_function(
|
||||
logits, labels, vocab_size, num_items_in_batch=num_items_in_batch
|
||||
)
|
||||
if return_outputs:
|
||||
return loss, outputs
|
||||
return loss
|
||||
|
||||
return super().compute_loss(
|
||||
model,
|
||||
inputs,
|
||||
@@ -410,6 +464,21 @@ class AxolotlTrainer(
|
||||
LOG.info("Running evaluation step...")
|
||||
return super().evaluate(*args, **kwargs)
|
||||
|
||||
@override
|
||||
def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
|
||||
# Gemma4 requires mm_token_type_ids even during evaluation.
|
||||
_unwrapped = self.accelerator.unwrap_model(model)
|
||||
_model_type = getattr(getattr(_unwrapped, "config", None), "model_type", None)
|
||||
if (
|
||||
"mm_token_type_ids" not in inputs
|
||||
and "input_ids" in inputs
|
||||
and _model_type == "gemma4"
|
||||
):
|
||||
inputs["mm_token_type_ids"] = torch.zeros_like(inputs["input_ids"])
|
||||
return super().prediction_step(
|
||||
model, inputs, prediction_loss_only, ignore_keys=ignore_keys
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def orpo_concatenate_inputs(inputs, label_pad_token=-100, pad_token=0, device=None):
|
||||
concatenated_batch = {}
|
||||
|
||||
@@ -20,8 +20,16 @@ class DPOStrategy:
|
||||
@classmethod
|
||||
def set_training_args_kwargs(cls, cfg):
|
||||
training_args_kwargs = {}
|
||||
if cfg.rl is RLType.DPO:
|
||||
if cfg.dpo_loss_type is not None:
|
||||
training_args_kwargs["loss_type"] = cfg.dpo_loss_type
|
||||
|
||||
if cfg.dpo_loss_weights is not None:
|
||||
training_args_kwargs["loss_weights"] = cfg.dpo_loss_weights
|
||||
|
||||
if cfg.rl is RLType.IPO:
|
||||
training_args_kwargs["loss_type"] = ["ipo"]
|
||||
|
||||
# Label smoothing is not compatible with IPO
|
||||
if cfg.rl is RLType.DPO and cfg.dpo_label_smoothing:
|
||||
training_args_kwargs["label_smoothing"] = cfg.dpo_label_smoothing
|
||||
|
||||
@@ -242,6 +242,85 @@ class ProducerConfig:
|
||||
)
|
||||
|
||||
|
||||
class _GroupShardedSampler:
|
||||
"""Rank-aware shard of a ``RepeatSampler`` that preserves GRPO groups.
|
||||
|
||||
``RepeatSampler`` yields ``num_generations`` consecutive copies of
|
||||
each prompt, forming a GRPO group. For distributed training each
|
||||
rank must see a disjoint slice of prompts (otherwise every rank
|
||||
dogpiles on the first 1/world_size of the batch) while keeping each
|
||||
group intact on a single rank so advantage normalization sees all
|
||||
peer generations.
|
||||
|
||||
``accelerator.prepare(DataLoader)`` does not handle this correctly
|
||||
for custom samplers with ``split_batches=False`` (the default): it
|
||||
leaves the sampler alone and every rank replays identical indices.
|
||||
This wrapper fixes that by consuming the inner sampler's full
|
||||
output, chunking it into ``num_generations``-sized groups, and
|
||||
round-robining whole groups across ranks.
|
||||
|
||||
Intended to be used ONLY when distributed training is active
|
||||
(``num_replicas > 1``); for single-rank it is a no-op but still
|
||||
correct.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
inner: Any,
|
||||
num_generations: int,
|
||||
rank: int,
|
||||
num_replicas: int,
|
||||
):
|
||||
if num_generations < 1:
|
||||
raise ValueError(f"num_generations must be >= 1, got {num_generations}")
|
||||
if num_replicas < 1:
|
||||
raise ValueError(f"num_replicas must be >= 1, got {num_replicas}")
|
||||
if not (0 <= rank < num_replicas):
|
||||
raise ValueError(f"rank must be in [0, {num_replicas}), got {rank}")
|
||||
self.inner = inner
|
||||
self.num_generations = num_generations
|
||||
self.rank = rank
|
||||
self.num_replicas = num_replicas
|
||||
|
||||
def __iter__(self):
|
||||
all_indices = list(self.inner)
|
||||
if len(all_indices) % self.num_generations != 0:
|
||||
raise ValueError(
|
||||
f"inner sampler yielded {len(all_indices)} indices, "
|
||||
f"not a multiple of num_generations={self.num_generations}"
|
||||
)
|
||||
# Chunk the flat index sequence into groups of num_generations
|
||||
# consecutive indices. ``RepeatSampler`` guarantees that each
|
||||
# group contains num_generations copies of the same prompt id.
|
||||
groups = [
|
||||
all_indices[i : i + self.num_generations]
|
||||
for i in range(0, len(all_indices), self.num_generations)
|
||||
]
|
||||
# Round-robin whole groups across ranks. Round-robin (vs.
|
||||
# contiguous chunking) preserves approximate shuffled order on
|
||||
# each rank even when the group count is small relative to the
|
||||
# world size.
|
||||
for group in groups[self.rank :: self.num_replicas]:
|
||||
yield from group
|
||||
|
||||
def __len__(self):
|
||||
try:
|
||||
inner_len = len(self.inner)
|
||||
except TypeError:
|
||||
# Non-sized inner sampler — we can't know the per-rank
|
||||
# length without materializing. Return 0 as a hint that the
|
||||
# DataLoader should fall back to iteration.
|
||||
return 0
|
||||
total_groups = inner_len // self.num_generations
|
||||
# Ceiling division for the trailing groups that don't divide
|
||||
# evenly — extra groups go to the first ``total_groups %
|
||||
# num_replicas`` ranks, matching the round-robin above.
|
||||
my_groups = (
|
||||
total_groups + self.num_replicas - self.rank - 1
|
||||
) // self.num_replicas
|
||||
return my_groups * self.num_generations
|
||||
|
||||
|
||||
class DataProducer(ABC):
|
||||
"""Abstract base class for online data producers.
|
||||
|
||||
@@ -556,6 +635,34 @@ class GRPODataProducer(BaseDataProducer):
|
||||
seed=self._seed,
|
||||
)
|
||||
|
||||
# Shard the sampler across distributed ranks so each rank sees
|
||||
# a disjoint slice of prompts. ``RepeatSampler`` groups each
|
||||
# prompt with ``num_generations`` consecutive copies — our
|
||||
# wrapper round-robins WHOLE groups across ranks so all
|
||||
# generations of a given prompt stay on the same rank (needed
|
||||
# for GRPO advantage normalization within a group).
|
||||
#
|
||||
# Without this, ``accelerator.prepare(dl)`` with the default
|
||||
# ``split_batches=False`` leaves the custom sampler alone, so
|
||||
# every rank iterates the identical index sequence and the
|
||||
# cluster dogpiles on the first 1/world_size of the prompts.
|
||||
num_replicas = max(1, trainer.accelerator.num_processes)
|
||||
if num_replicas > 1:
|
||||
sampler = _GroupShardedSampler(
|
||||
inner=sampler,
|
||||
num_generations=self._num_generations,
|
||||
rank=trainer.accelerator.process_index,
|
||||
num_replicas=num_replicas,
|
||||
)
|
||||
logger.info(
|
||||
"[RANK:%d] _GroupShardedSampler active "
|
||||
"(num_replicas=%d, num_generations=%d, gen_batch=%d)",
|
||||
trainer.accelerator.process_index,
|
||||
num_replicas,
|
||||
self._num_generations,
|
||||
self._generation_batch_size,
|
||||
)
|
||||
|
||||
# Use identity collator (same as stock GRPOTrainer)
|
||||
def _identity(x):
|
||||
return x
|
||||
@@ -574,12 +681,11 @@ class GRPODataProducer(BaseDataProducer):
|
||||
rank=trainer.args.process_index,
|
||||
),
|
||||
)
|
||||
self._prompt_dl = trainer.accelerator.prepare(dl)
|
||||
|
||||
# Don't let accelerator track this dataloader
|
||||
acc_dls = trainer.accelerator._dataloaders
|
||||
if self._prompt_dl in acc_dls:
|
||||
acc_dls.remove(self._prompt_dl)
|
||||
# Skip accelerator.prepare — we're handling per-rank sharding
|
||||
# ourselves via ``_GroupShardedSampler``. ``prepare()`` would
|
||||
# otherwise try to wrap the DataLoader with its own sharding
|
||||
# logic which does not understand our group structure.
|
||||
self._prompt_dl = dl
|
||||
|
||||
self._prompt_iter = iter(self._prompt_dl)
|
||||
|
||||
@@ -1103,11 +1209,22 @@ class AsyncGRPOTrainer(GRPOTrainer):
|
||||
- vllm_lora_sync: saves adapter to filesystem, vLLM loads natively
|
||||
- PEFT no-merge: computes merged weights as new tensors, NCCL broadcast
|
||||
- Non-PEFT: stock sync_weights via merge_adapter + NCCL
|
||||
|
||||
This is the canonical sync trigger and runs in BOTH async and
|
||||
synchronous modes from ``_prepare_inputs_with_data_producer`` /
|
||||
``_prepare_inputs_legacy_async``. The ``_generate_single_turn``
|
||||
patch is a parallel backup for non-data-producer paths (vanilla
|
||||
GRPO without NeMo Gym), where the data producer is bypassed
|
||||
entirely and TRL's stock generate-then-sync flow is used instead.
|
||||
"""
|
||||
if not (self.use_vllm and self.args.async_prefetch):
|
||||
if not self.use_vllm:
|
||||
return
|
||||
step = self.state.global_step
|
||||
interval = self.args.vllm_sync_interval
|
||||
# Default to syncing every step when no interval is configured —
|
||||
# otherwise ``step % None`` would TypeError, and the previous
|
||||
# behavior of crashing on the first sync was strictly worse than
|
||||
# the standard "sync every optimizer step".
|
||||
interval = self.args.vllm_sync_interval or 1
|
||||
if step != self._last_synced_step and step % interval == 0:
|
||||
if step == 0:
|
||||
logger.info("Skipping vLLM weight sync at step 0 (no training yet)")
|
||||
@@ -1202,13 +1319,42 @@ class AsyncGRPOTrainer(GRPOTrainer):
|
||||
|
||||
# Permanently replace vllm_generation.sync_weights with our custom
|
||||
# sync to avoid merge_adapter (fails on FP8 / races with training).
|
||||
# For LoRA sync mode, make it a no-op here since _maybe_sync_vllm_weights
|
||||
# handles the sync with proper interval tracking.
|
||||
#
|
||||
# The design has two modes that have to be threaded carefully:
|
||||
#
|
||||
# - Async prefetch ON: BG generation thread can't safely call
|
||||
# sync_weights mid-rollout (it races with the trainer's optimizer
|
||||
# step and can corrupt weights). We no-op the stock sync hook and
|
||||
# drive sync ourselves from ``_maybe_sync_vllm_weights`` after the
|
||||
# optimizer step on the main thread.
|
||||
#
|
||||
# - Async prefetch OFF (synchronous mode): TRL's stock
|
||||
# ``_generate_single_turn`` calls ``sync_weights`` once per step
|
||||
# boundary. There's no BG thread to race with, and
|
||||
# ``_maybe_sync_vllm_weights`` short-circuits with
|
||||
# ``if not async_prefetch: return``, so we MUST wire the stock
|
||||
# hook directly to our LoRA sync helper — otherwise nothing ever
|
||||
# pushes weights to vLLM and the trainer becomes a no-op (vLLM
|
||||
# keeps serving the base model, every rollout in every group
|
||||
# produces identical outputs, advantages are zero, optimizer
|
||||
# step gets skipped, repeat).
|
||||
if not getattr(self, "_patched_sync_weights", False):
|
||||
if self.use_vllm and hasattr(self, "vllm_generation"):
|
||||
if getattr(self.args, "vllm_lora_sync", False):
|
||||
# No-op: LoRA sync is driven by _maybe_sync_vllm_weights
|
||||
self.vllm_generation.sync_weights = lambda: None
|
||||
if getattr(self.args, "async_prefetch", False):
|
||||
# Async: drive sync from main thread via
|
||||
# _maybe_sync_vllm_weights instead.
|
||||
self.vllm_generation.sync_weights = lambda: None
|
||||
else:
|
||||
# Sync mode: TRL's _generate_single_turn already
|
||||
# calls sync_weights once per step boundary. Wire
|
||||
# it directly to our LoRA filesystem sync helper.
|
||||
sync_helper = self._sync_lora_adapter
|
||||
|
||||
def _lora_filesystem_sync():
|
||||
sync_helper()
|
||||
|
||||
self.vllm_generation.sync_weights = _lora_filesystem_sync
|
||||
self._patched_sync_weights = True
|
||||
else:
|
||||
from accelerate.utils import is_peft_model
|
||||
|
||||
@@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh
|
||||
|
||||
- If you are installing from pip
|
||||
```bash
|
||||
pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@63b15e6"
|
||||
pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@fec1a88"
|
||||
```
|
||||
|
||||
## Usage
|
||||
@@ -44,6 +44,7 @@ plugins:
|
||||
- gemma3_text
|
||||
- gemma3n
|
||||
- gemma3n_text
|
||||
- gemma4
|
||||
- glm
|
||||
- glm4
|
||||
- glm4_moe
|
||||
|
||||
@@ -35,7 +35,7 @@ LOG = get_logger(__name__)
|
||||
|
||||
_CCE_INSTALL_MESSAGE = (
|
||||
"Please install Axolotl's fork of cut_cross_entropy with transformers support using "
|
||||
'`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@63b15e6"`'
|
||||
'`pip uninstall -y cut-cross-entropy && pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@fec1a88"`'
|
||||
)
|
||||
|
||||
|
||||
|
||||
27
src/axolotl/integrations/hatchery/__init__.py
Normal file
27
src/axolotl/integrations/hatchery/__init__.py
Normal file
@@ -0,0 +1,27 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# Copyright (c) Axolotl AI
|
||||
# Licensed under the Apache License, Version 2.0
|
||||
|
||||
"""Hatchery/Tinker remote training integration for Axolotl.
|
||||
|
||||
Routes axolotl's preprocessed data to a remote training API (Tinker or
|
||||
Hatchery) instead of running forward/backward locally. The remote
|
||||
service handles model weights, LoRA adapters, and gradient updates.
|
||||
"""
|
||||
|
||||
from .args import HatcheryArgs, HatcheryConfig
|
||||
from .plugin import HatcheryPlugin
|
||||
|
||||
__all__ = ["HatcheryArgs", "HatcheryConfig", "HatcheryPlugin"]
|
||||
|
||||
# Usage:
|
||||
# plugins:
|
||||
# - axolotl.integrations.hatchery.HatcheryPlugin
|
||||
#
|
||||
# hatchery:
|
||||
# backend: tinker # or "hatchery"
|
||||
# lora_rank: 32
|
||||
# loss_fn: cross_entropy # SFT
|
||||
# # loss_fn: ppo # RL (auto-selects HatcheryRLTrainer)
|
||||
#
|
||||
# learning_rate: 1e-4 # top-level, not under hatchery:
|
||||
62
src/axolotl/integrations/hatchery/args.py
Normal file
62
src/axolotl/integrations/hatchery/args.py
Normal file
@@ -0,0 +1,62 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# Copyright (c) Axolotl AI
|
||||
# Licensed under the Apache License, Version 2.0
|
||||
|
||||
"""Pydantic config schema for the Hatchery integration."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Literal, Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class HatcheryConfig(BaseModel):
    """Nested config under `hatchery:` in the axolotl YAML.

    Only contains hatchery-specific settings. Standard training params
    (learning_rate, weight_decay, adam_beta1/2, max_grad_norm,
    gradient_accumulation_steps) are read from axolotl's top-level config.
    """

    # Backend & connection. base_url/api_key/project_id default to None —
    # presumably resolved from the backend client's own environment/config
    # when unset (TODO confirm against the client implementation).
    backend: Literal["tinker", "hatchery"] = "tinker"
    base_url: Optional[str] = None
    api_key: Optional[str] = None
    project_id: Optional[str] = None

    # LoRA config sent to remote (rank is clamped to [1, 256] by validation)
    lora_rank: int = Field(32, ge=1, le=256)
    train_attn: bool = True
    train_mlp: bool = True
    train_unembed: bool = True

    # Loss function; RL losses (importance_sampling/ppo/cispo/dro) select
    # the RL trainer path, cross_entropy selects SFT.
    loss_fn: Literal["cross_entropy", "importance_sampling", "ppo", "cispo", "dro"] = (
        "cross_entropy"
    )
    loss_fn_config: Optional[dict[str, Any]] = None

    # Pipelining: submit next batch before awaiting previous result
    pipeline: bool = True

    # Sampling params (for RL flows)
    max_sample_tokens: int = 256
    sample_temperature: float = 1.0
    num_samples: int = 4

    # Reward functions (for RL) — list of fully qualified names
    reward_funcs: Optional[list[str]] = None

    # Checkpointing (None disables periodic remote checkpoint saves)
    save_steps: Optional[int] = None
    save_name_prefix: str = "checkpoint"

    # Timeout per future (seconds)
    future_timeout: float = 600.0
|
||||
|
||||
|
||||
class HatcheryArgs(BaseModel):
    """Top-level mixin that adds the nested `hatchery:` field.

    Merged into axolotl's config schema via the plugin's
    ``get_input_args`` hook; absent (None) means the plugin's defaults
    from ``HatcheryConfig`` apply.
    """

    hatchery: Optional[HatcheryConfig] = None
|
||||
160
src/axolotl/integrations/hatchery/data.py
Normal file
160
src/axolotl/integrations/hatchery/data.py
Normal file
@@ -0,0 +1,160 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# Copyright (c) Axolotl AI
|
||||
# Licensed under the Apache License, Version 2.0
|
||||
|
||||
"""Convert axolotl batch tensors to Tinker/Hatchery Datum format.
|
||||
|
||||
Both Tinker and Hatchery expect the client to apply the causal LM shift:
|
||||
|
||||
Original tokens: [t0, t1, t2, ..., t_{L-1}]
|
||||
model_input: [t0, t1, ..., t_{L-2}] (last token dropped)
|
||||
target_tokens: [t1, t2, ..., t_{L-1}] (first token dropped)
|
||||
weights: [w1, w2, ..., w_{L-1}] (aligned to targets)
|
||||
|
||||
At position i, the model sees t_i and predicts target_tokens[i] = t_{i+1}.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
def _tensor_to_wire(t: torch.Tensor) -> dict[str, Any]:
|
||||
"""Serialize a tensor to the TensorData wire dict."""
|
||||
flat = t.detach().cpu().flatten()
|
||||
dtype_map = {
|
||||
torch.float32: "float32",
|
||||
torch.float16: "float16",
|
||||
torch.bfloat16: "bfloat16",
|
||||
torch.int64: "int64",
|
||||
torch.int32: "int32",
|
||||
}
|
||||
return {
|
||||
"dtype": dtype_map.get(flat.dtype, "float32"),
|
||||
"shape": list(t.shape),
|
||||
"data": flat.tolist(),
|
||||
}
|
||||
|
||||
|
||||
def _make_datum(
    tokens: list[int],
    loss_fn_inputs: dict[str, torch.Tensor],
) -> dict[str, Any]:
    """Assemble one Datum as a plain dict (wire-compatible with both Tinker and Hatchery)."""
    wire_inputs = {
        name: _tensor_to_wire(tensor) for name, tensor in loss_fn_inputs.items()
    }
    model_input = {"chunks": [{"type": "encoded_text", "tokens": tokens}]}
    return {
        "model_input": model_input,
        "loss_fn_inputs": wire_inputs,
    }
|
||||
|
||||
|
||||
def datums_to_tinker(datums: list[dict[str, Any]]):
    """Wrap plain-dict datums into ``tinker.types.Datum`` objects.

    Both the Tinker SDK and updated Hatchery client accept these.
    """
    import tinker.types as tt

    def _wrap(datum: dict[str, Any]):
        # Tokens come from the single encoded_text chunk built by _make_datum.
        token_ids = datum["model_input"]["chunks"][0]["tokens"]
        inputs = {
            name: tt.TensorData(
                data=wire["data"],
                dtype=wire["dtype"],
                shape=wire["shape"],
            )
            for name, wire in datum["loss_fn_inputs"].items()
        }
        return tt.Datum(
            model_input=tt.ModelInput.from_ints(token_ids),
            loss_fn_inputs=inputs,
        )

    return [_wrap(d) for d in datums]
|
||||
|
||||
|
||||
def batch_to_datums_sft(
    input_ids: torch.Tensor,
    labels: torch.Tensor,
    attention_mask: torch.Tensor | None = None,
) -> list[dict[str, Any]]:
    """Convert an axolotl SFT batch to Datum dicts with causal shift.

    Positions labeled -100 get weight 0 (and a dummy target of 0); the
    attention mask, when given, truncates each row to its valid length
    (assumes right-padding — TODO confirm against the collator).
    """
    datums: list[dict[str, Any]] = []

    for row in range(input_ids.size(0)):
        row_ids = input_ids[row]
        row_labels = labels[row]

        if attention_mask is not None:
            valid = int(attention_mask[row].sum().item())
            row_ids = row_ids[:valid]
            row_labels = row_labels[:valid]

        # Causal shift: model sees tokens[:-1], predicts labels[1:].
        shifted = row_labels[1:]
        weights = (shifted != -100).float()
        targets = shifted.clone()
        targets[targets == -100] = 0

        datums.append(
            _make_datum(
                row_ids[:-1].tolist(),
                {
                    "target_tokens": targets,
                    "weights": weights,
                },
            )
        )

    return datums
|
||||
|
||||
|
||||
def batch_to_datums_rl(
    input_ids: torch.Tensor,
    labels: torch.Tensor,
    logprobs: torch.Tensor,
    advantages: torch.Tensor,
    attention_mask: torch.Tensor | None = None,
) -> list[dict[str, Any]]:
    """Convert an RL batch to importance_sampling/ppo Datum dicts with causal shift.

    ``logprobs`` and ``advantages`` are per-token and are shifted by one
    alongside the targets so they stay aligned with the predicted tokens.
    """
    datums: list[dict[str, Any]] = []

    for row in range(input_ids.size(0)):
        row_ids = input_ids[row]

        if attention_mask is not None:
            valid = int(attention_mask[row].sum().item())
        else:
            valid = row_ids.size(0)

        row_ids = row_ids[:valid]
        row_labels = labels[row][:valid]
        row_lp = logprobs[row, :valid]
        row_adv = advantages[row, :valid]

        # Causal shift: model sees tokens[:-1], predicts labels[1:].
        targets = row_labels[1:].clone()
        targets[targets == -100] = 0

        datums.append(
            _make_datum(
                row_ids[:-1].tolist(),
                {
                    "target_tokens": targets,
                    "logprobs": row_lp[1:],
                    "advantages": row_adv[1:],
                },
            )
        )

    return datums
|
||||
87
src/axolotl/integrations/hatchery/examples/prep_math_rl.py
Normal file
87
src/axolotl/integrations/hatchery/examples/prep_math_rl.py
Normal file
@@ -0,0 +1,87 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# Copyright (c) Axolotl AI
|
||||
# Licensed under the Apache License, Version 2.0
|
||||
|
||||
"""Prepare hendrycks_math for RL training with Hatchery/Tinker.
|
||||
|
||||
Creates a dataset with chat-formatted prompts that include
|
||||
a hidden gold answer tag for the reward function.
|
||||
|
||||
Run:
|
||||
python src/axolotl/integrations/hatchery/examples/prep_math_rl.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
|
||||
from datasets import Dataset, load_dataset
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
|
||||
def extract_boxed(text: str) -> str:
    """Return the content of the first \\boxed{...}, handling nested braces.

    Returns "" when there is no \\boxed{ or the braces never balance.
    """
    opener = re.search(r"\\boxed\{", text)
    if opener is None:
        return ""
    content_start = opener.end()
    open_braces = 1
    cursor = content_start
    while cursor < len(text):
        ch = text[cursor]
        cursor += 1
        if ch == "{":
            open_braces += 1
        elif ch == "}":
            open_braces -= 1
            if open_braces == 0:
                # cursor is just past the matching close brace.
                return text[content_start : cursor - 1]
    return ""
|
||||
|
||||
|
||||
def main():
    """Build and save an RL prompt dataset from hendrycks_math (algebra).

    Keeps only problems at the difficulty level given by the MATH_LEVEL
    env var (default "Level 1"), embeds each problem's gold answer in the
    prompt as a hidden <|gold|>...<|/gold|> tag for the reward function,
    renders the Qwen3-8B chat template, and saves the tokenized dataset
    to ./data/math_rl_<level>.
    """
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B", trust_remote_code=True)

    ds = load_dataset("EleutherAI/hendrycks_math", "algebra", split="test")
    level = os.environ.get("MATH_LEVEL", "Level 1")
    filtered_rows = [x for x in ds if x["level"] == level]
    print(f"{level} algebra: {len(filtered_rows)} problems")

    rows = []
    for prob in filtered_rows:
        # Skip problems whose solution has no parseable \boxed{} answer —
        # without a gold answer the reward function cannot score them.
        gold = extract_boxed(prob["solution"])
        if not gold:
            continue

        # Format as chat prompt with hidden gold tag
        prompt = (
            f"Solve the following math problem. "
            f"Show your work and put your final answer in \\boxed{{}}.\n\n"
            f"{prob['problem']}"
            f"<|gold|>{gold}<|/gold|>"
        )

        # Tokenize the prompt
        text = tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            tokenize=False,
            add_generation_prompt=True,
        )
        prompt_ids = tokenizer.encode(text, add_special_tokens=False)

        rows.append(
            {
                "input_ids": prompt_ids,
                # All labels are -100: the prompt carries no supervised
                # targets; completions are sampled at RL time.
                "labels": [-100] * len(prompt_ids),
                "attention_mask": [1] * len(prompt_ids),
            }
        )

    out = Dataset.from_list(rows)
    # e.g. "Level 1" -> ./data/math_rl_level1
    out_dir = f"./data/math_rl_{level.lower().replace(' ', '')}"
    out.save_to_disk(out_dir)
    print(f"Saved {len(out)} examples to {out_dir}")
    if rows:
        print(
            f"Prompt length range: {min(len(r['input_ids']) for r in rows)}"
            f"-{max(len(r['input_ids']) for r in rows)}"
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
47
src/axolotl/integrations/hatchery/examples/tinker_rl.yaml
Normal file
47
src/axolotl/integrations/hatchery/examples/tinker_rl.yaml
Normal file
@@ -0,0 +1,47 @@
|
||||
# RL (GRPO): hendrycks_math Level 1 via Tinker with Qwen3-8B
|
||||
#
|
||||
# Prep:
|
||||
# python src/axolotl/integrations/hatchery/examples/prep_math_rl.py
|
||||
#
|
||||
# Run:
|
||||
# export TINKER_API_KEY="your-key"
|
||||
# axolotl train src/axolotl/integrations/hatchery/examples/tinker_rl.yaml
|
||||
|
||||
base_model: Qwen/Qwen3-8B
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.hatchery.HatcheryPlugin
|
||||
|
||||
hatchery:
|
||||
backend: tinker
|
||||
lora_rank: 16
|
||||
loss_fn: importance_sampling
|
||||
max_sample_tokens: 2048
|
||||
sample_temperature: 0.7
|
||||
num_samples: 4
|
||||
pipeline: true
|
||||
save_steps: 5
|
||||
reward_funcs:
|
||||
- axolotl.integrations.hatchery.rewards.math_reward.math_reward
|
||||
|
||||
datasets:
|
||||
- path: ./data/math_rl_level1
|
||||
ds_type: arrow
|
||||
type: completion
|
||||
|
||||
sequence_len: 2048
|
||||
|
||||
learning_rate: 5.0e-5
|
||||
optimizer: adamw_torch
|
||||
adam_beta1: 0.9
|
||||
adam_beta2: 0.95
|
||||
weight_decay: 0.01
|
||||
max_grad_norm: 1.0
|
||||
|
||||
max_steps: 10
|
||||
num_epochs: 1
|
||||
micro_batch_size: 1
|
||||
gradient_accumulation_steps: 1
|
||||
logging_steps: 1
|
||||
|
||||
output_dir: ./outputs/tinker-rl-math
|
||||
42
src/axolotl/integrations/hatchery/examples/tinker_sft.yaml
Normal file
42
src/axolotl/integrations/hatchery/examples/tinker_sft.yaml
Normal file
@@ -0,0 +1,42 @@
|
||||
# SFT: KIMI-K2 thinking data via Tinker remote API with Qwen3-8B
|
||||
#
|
||||
# Usage:
|
||||
# export TINKER_API_KEY="your-key"
|
||||
# axolotl train src/axolotl/integrations/hatchery/examples/tinker_sft.yaml
|
||||
|
||||
base_model: Qwen/Qwen3-8B
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.hatchery.HatcheryPlugin
|
||||
|
||||
hatchery:
|
||||
backend: tinker
|
||||
lora_rank: 16
|
||||
loss_fn: cross_entropy
|
||||
pipeline: true
|
||||
save_steps: 10
|
||||
|
||||
datasets:
|
||||
- path: TeichAI/kimi-k2-thinking-1000x
|
||||
split: train[:50]
|
||||
type: chat_template
|
||||
chat_template: qwen3
|
||||
split_thinking: true
|
||||
|
||||
chat_template: qwen3
|
||||
sequence_len: 2048
|
||||
|
||||
learning_rate: 3.0e-4
|
||||
optimizer: adamw_torch
|
||||
adam_beta1: 0.9
|
||||
adam_beta2: 0.95
|
||||
weight_decay: 0.01
|
||||
max_grad_norm: 1.0
|
||||
|
||||
num_epochs: 1
|
||||
max_steps: 20
|
||||
micro_batch_size: 2
|
||||
gradient_accumulation_steps: 1
|
||||
logging_steps: 1
|
||||
|
||||
output_dir: ./outputs/tinker-sft
|
||||
147
src/axolotl/integrations/hatchery/plugin.py
Normal file
147
src/axolotl/integrations/hatchery/plugin.py
Normal file
@@ -0,0 +1,147 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# Copyright (c) Axolotl AI
|
||||
# Licensed under the Apache License, Version 2.0
|
||||
|
||||
"""Axolotl plugin that routes training to a remote Hatchery/Tinker API."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import torch
|
||||
from peft import PeftModel
|
||||
from transformers import AutoConfig, PreTrainedModel, Trainer
|
||||
|
||||
from axolotl.integrations.base import BasePlugin
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.logging import get_logger
|
||||
|
||||
LOG = get_logger(__name__)
|
||||
|
||||
|
||||
class HatcheryPlugin(BasePlugin):
    """Plugin that replaces local training with remote API calls.

    Activated by adding to the axolotl YAML:

    plugins:
      - axolotl.integrations.hatchery.HatcheryPlugin

    hatchery:
      backend: tinker  # or "hatchery"
      lora_rank: 32
      loss_fn: cross_entropy
      # ... see HatcheryConfig for full options
    """

    def get_input_args(self) -> str:
        # Dotted path to the pydantic mixin that contributes the
        # `hatchery:` field to axolotl's config schema.
        return "axolotl.integrations.hatchery.args.HatcheryArgs"

    def register(self, cfg: dict):
        """Auto-set config values needed for remote training."""
        # The remote trainers need the raw dataset columns intact.
        if cfg.get("remove_unused_columns") is None:
            cfg["remove_unused_columns"] = False

    def pre_model_load(self, cfg: DictDefault):
        """Replace model loading with a tiny stub.

        Weights live on the remote service, so loading them locally would
        waste time and memory. NOTE(review): this monkey-patches
        ``ModelLoader._build_model`` process-wide, so every subsequent
        model load in this process gets the stub.
        """
        # cfg.hatchery may be a dict (raw YAML) or a parsed pydantic
        # object; handle both when reading the backend name.
        hcfg = cfg.hatchery or {}
        backend = (
            hcfg.get("backend", "tinker")
            if isinstance(hcfg, dict)
            else getattr(hcfg, "backend", "tinker")
        )
        LOG.info(
            f"Hatchery plugin active: training dispatched to remote "
            f"{backend} API. Skipping local model weight loading."
        )

        from axolotl.loaders import ModelLoader

        def _stub_build_model(loader_self) -> bool:
            # Replacement for ModelLoader._build_model: fetch only the
            # config (no weights) and install a minimal stand-in model.
            base_model = loader_self.cfg.base_model
            LOG.info(f"Skipping model weight loading for: {base_model}")

            config = AutoConfig.from_pretrained(
                base_model,
                trust_remote_code=loader_self.cfg.get("trust_remote_code", False),
            )

            class _Stub(PreTrainedModel):
                # Minimal PreTrainedModel so downstream code that touches
                # the model object (embeddings, config) keeps working.
                config_class = type(config)
                _no_split_modules: list[str] = []
                supports_gradient_checkpointing = False

                def __init__(self, cfg):
                    super().__init__(cfg)
                    # Embedding dim of 1 keeps the stub's memory footprint
                    # negligible while preserving the vocab size.
                    vocab_size = getattr(cfg, "vocab_size", 32000)
                    self.embed_tokens = torch.nn.Embedding(vocab_size, 1)

                def get_input_embeddings(self):
                    return self.embed_tokens

                def set_input_embeddings(self, value):
                    pass

                def get_output_embeddings(self):
                    return None

            loader_self.model = _Stub(config)
            return True

        ModelLoader._build_model = _stub_build_model  # type: ignore[method-assign,assignment]

    def get_trainer_cls(self, cfg: DictDefault) -> type[Trainer] | None:
        """Return the appropriate remote trainer class.

        RL-style losses select HatcheryRLTrainer; everything else
        (cross_entropy) gets the SFT HatcheryTrainer.
        """
        hcfg = cfg.hatchery
        loss_fn = getattr(hcfg, "loss_fn", "cross_entropy") if hcfg else "cross_entropy"

        if loss_fn in ("importance_sampling", "ppo", "cispo", "dro"):
            from .rl_trainer import HatcheryRLTrainer

            return HatcheryRLTrainer

        from .trainer import HatcheryTrainer

        return HatcheryTrainer

    def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
        # Marker attribute so other components can detect remote mode.
        model._hatchery_remote = True

    def post_train(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
        LOG.info(
            "Hatchery: skipping local model save (weights are on remote API). "
            "Use `tinker checkpoint download` or hatchery CLI to retrieve."
        )

    def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
        """Inject hatchery config + axolotl training params into the trainer."""
        from .args import HatcheryConfig
        from .rl_trainer import HatcheryRLTrainer
        from .trainer import HatcheryTrainer

        if not isinstance(trainer, (HatcheryTrainer, HatcheryRLTrainer)):
            return

        # Normalize cfg.hatchery (raw dict / None / already-parsed) into
        # a HatcheryConfig instance.
        hcfg = cfg.hatchery
        if isinstance(hcfg, dict):
            hatchery_config = HatcheryConfig(**hcfg)
        elif hcfg is None:
            hatchery_config = HatcheryConfig()
        else:
            hatchery_config = hcfg

        trainer.hatchery_args = hatchery_config
        trainer._base_model_name = cfg.base_model

        # Pull standard training params from axolotl config so they
        # don't need to be duplicated under hatchery:
        trainer._optim_params = {
            "learning_rate": cfg.learning_rate
            if cfg.learning_rate is not None
            else 1e-4,
            "beta1": cfg.adam_beta1 if cfg.adam_beta1 is not None else 0.9,
            "beta2": cfg.adam_beta2 if cfg.adam_beta2 is not None else 0.95,
            "eps": cfg.adam_epsilon if cfg.adam_epsilon is not None else 1e-12,
            "weight_decay": cfg.weight_decay if cfg.weight_decay is not None else 0.0,
            "grad_clip_norm": cfg.max_grad_norm
            if cfg.max_grad_norm is not None
            else 0.0,
        }
|
||||
3
src/axolotl/integrations/hatchery/rewards/__init__.py
Normal file
3
src/axolotl/integrations/hatchery/rewards/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# Copyright (c) Axolotl AI
|
||||
# Licensed under the Apache License, Version 2.0
|
||||
78
src/axolotl/integrations/hatchery/rewards/math_reward.py
Normal file
78
src/axolotl/integrations/hatchery/rewards/math_reward.py
Normal file
@@ -0,0 +1,78 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# Copyright (c) Axolotl AI
|
||||
# Licensed under the Apache License, Version 2.0
|
||||
|
||||
"""Math reward function for hendrycks_math GRPO training.
|
||||
|
||||
Uses math_verify for robust answer comparison. Falls back to
|
||||
exact string match of \\boxed{} content only when math_verify
|
||||
is unavailable.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def extract_boxed(text: str) -> str | None:
|
||||
"""Extract \\boxed{...} answer handling nested braces."""
|
||||
match = re.search(r"\\boxed\{", text)
|
||||
if not match:
|
||||
return None
|
||||
start = match.end()
|
||||
depth = 1
|
||||
i = start
|
||||
while i < len(text) and depth > 0:
|
||||
if text[i] == "{":
|
||||
depth += 1
|
||||
elif text[i] == "}":
|
||||
depth -= 1
|
||||
i += 1
|
||||
return text[start : i - 1] if depth == 0 else None
|
||||
|
||||
|
||||
def math_reward(prompts: list[str], completions: list[str], **kwargs) -> list[float]:
    """Score completions by checking if \\boxed{} answer matches the gold answer.

    The gold answer is extracted from the prompt (appended as a hidden
    tag by the dataset preprocessing). Format:
        ... <|gold|>ANSWER<|/gold|>
    """
    scores: list[float] = []
    for prompt, completion in zip(prompts, completions, strict=True):
        # No gold tag in the prompt -> nothing to grade against.
        found = re.search(r"<\|gold\|>(.*?)<\|/gold\|>", prompt)
        if found is None:
            scores.append(0.0)
            continue

        gold = found.group(1).strip()
        predicted = extract_boxed(completion)
        if predicted is None:
            scores.append(0.0)
            continue

        # Prefer symbolic comparison via math_verify; fall back to a
        # plain string match when it is missing or raises.
        outcome = None
        try:
            from math_verify import parse, verify

            outcome = verify(parse(gold), parse(predicted))
        except Exception:
            LOG.debug(
                "math_verify unavailable or failed, using string fallback",
                exc_info=True,
            )

        if outcome is not None:
            scores.append(1.0 if outcome else 0.0)
        elif predicted.strip() == gold:
            scores.append(1.0)
        else:
            scores.append(0.0)

    return scores
|
||||
409
src/axolotl/integrations/hatchery/rl_trainer.py
Normal file
409
src/axolotl/integrations/hatchery/rl_trainer.py
Normal file
@@ -0,0 +1,409 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# Copyright (c) Axolotl AI
|
||||
# Licensed under the Apache License, Version 2.0
|
||||
|
||||
"""Remote RL trainer (GRPO/PPO) using Tinker or Hatchery API.
|
||||
|
||||
Full RL loop per step:
|
||||
1. Extract prompts from dataset batch
|
||||
2. Sample N completions per prompt via remote SamplingClient
|
||||
3. Score completions with local reward functions
|
||||
4. Compute GRPO-style advantages (per-group normalization)
|
||||
5. Send (prompt+completion, logprobs, advantages) as forward_backward
|
||||
6. Optimizer step
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import inspect
|
||||
import re
|
||||
import time
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
import torch
|
||||
from transformers.trainer_utils import TrainOutput
|
||||
|
||||
from axolotl.core.trainers.base import AxolotlTrainer
|
||||
from axolotl.utils.logging import get_logger
|
||||
|
||||
from .args import HatcheryConfig
|
||||
from .data import batch_to_datums_rl, datums_to_tinker
|
||||
from .trainer import _create_training_client
|
||||
|
||||
LOG = get_logger(__name__)
|
||||
|
||||
|
||||
def _load_reward_func(fqn: str) -> Callable:
|
||||
"""Load a reward function from a fully qualified name like 'module.func'."""
|
||||
module_path = ".".join(fqn.split(".")[:-1])
|
||||
func_name = fqn.split(".")[-1]
|
||||
mod = importlib.import_module(module_path)
|
||||
func = getattr(mod, func_name)
|
||||
if len(inspect.signature(func).parameters) < 2:
|
||||
raise ValueError(f"Reward function {fqn} must accept (prompts, completions)")
|
||||
return func
|
||||
|
||||
|
||||
class HatcheryRLTrainer(AxolotlTrainer):
    """Remote RL trainer using Tinker/Hatchery for sampling and training."""

    # Populated after construction (presumably by the hatchery plugin) —
    # __init__ only initializes them to None/empty.
    hatchery_args: Optional[HatcheryConfig]
    _base_model_name: Optional[str]
    _training_client: Any
    _reward_functions: list[Callable]

    def __init__(self, *args, **kwargs):
        """Initialize the trainer; hatchery-specific state starts unset."""
        super().__init__(*args, **kwargs)
        self.hatchery_args = None
        self._base_model_name = None
        self._training_client = None
        self._reward_functions = []

    def _ensure_reward_functions(self):
        """Lazily load the configured reward functions (idempotent).

        Raises:
            ValueError: If no reward functions are configured, or a
                configured name cannot be resolved/validated by
                ``_load_reward_func``.
        """
        if self._reward_functions:
            return
        args = self.hatchery_args
        if not args or not args.reward_funcs:
            raise ValueError(
                "No reward functions configured. Set hatchery.reward_funcs "
                "in YAML, e.g. reward_funcs: ['my_module.my_reward']"
            )
        for fqn in args.reward_funcs:
            self._reward_functions.append(_load_reward_func(fqn))
        LOG.info(f"Loaded {len(self._reward_functions)} reward function(s)")

    def _get_training_client(self):
        """Lazily create (and cache) the remote training client."""
        if self._training_client is not None:
            return self._training_client

        self._training_client = _create_training_client(
            self.hatchery_args, self._base_model_name
        )
        LOG.info(
            f"Remote RL session created: backend={self.hatchery_args.backend}, "
            f"model={self._base_model_name}, rank={self.hatchery_args.lora_rank}"
        )
        return self._training_client

    def _sample_completions(self, prompt_ids_list: list[list[int]]):
        """Sample completions for prompts via remote API.

        Returns a flat list of dicts (``num_samples`` entries per prompt,
        in prompt order) with keys: ``tokens`` (prompt + completion ids),
        ``completion_tokens``, ``logprobs``, ``prompt_len``.
        """
        import tinker.types as tt

        tc = self._get_training_client()
        args = self.hatchery_args
        assert args is not None  # validated by _get_training_client
        results = []

        # Snapshot current weights into a sampling client so rollouts use
        # the latest policy.
        sc = tc.save_weights_and_get_sampling_client()

        for prompt_ids in prompt_ids_list:
            # Duck-type the backend: Hatchery-style clients expose
            # `sampling_session_id` and take raw ids; Tinker-style clients
            # take ModelInput/SamplingParams objects.
            if hasattr(sc, "sampling_session_id"):
                sample_result = sc.sample(
                    prompt_ids,
                    max_tokens=args.max_sample_tokens,
                    temperature=args.sample_temperature,
                    n=args.num_samples,
                ).result(timeout=args.future_timeout)
            else:
                mi = tt.ModelInput.from_ints(prompt_ids)
                sp = tt.SamplingParams(
                    max_tokens=args.max_sample_tokens,
                    temperature=args.sample_temperature,
                    top_p=0.95,
                    top_k=-1,
                )
                sample_result = sc.sample(
                    prompt=mi,
                    num_samples=args.num_samples,
                    sampling_params=sp,
                ).result(timeout=args.future_timeout)

            # Results may be attribute-style objects or plain dicts.
            sequences = (
                sample_result.sequences
                if hasattr(sample_result, "sequences")
                else sample_result.get("sequences", [])
            )
            for seq in sequences:
                tokens = (
                    list(seq.tokens)
                    if hasattr(seq, "tokens")
                    else seq.get("tokens", [])
                )
                # NOTE(review): if `seq` is an object whose `.logprobs` is
                # empty/None, this falls through to `seq.get(...)`, which
                # raises on non-dict objects — confirm object-style
                # sequences always carry non-empty logprobs.
                logprobs = (
                    list(seq.logprobs)
                    if hasattr(seq, "logprobs") and seq.logprobs
                    else seq.get("logprobs", [])
                )
                results.append(
                    {
                        "tokens": list(prompt_ids) + tokens,
                        "completion_tokens": tokens,
                        "logprobs": logprobs,
                        "prompt_len": len(prompt_ids),
                    }
                )

        return results

    def _compute_rewards(
        self, prompts: list[str], completions: list[str]
    ) -> list[float]:
        """Sum rewards from every configured reward function, element-wise."""
        total_rewards = [0.0] * len(completions)
        for reward_fn in self._reward_functions:
            rewards = reward_fn(prompts, completions)
            for i, r in enumerate(rewards):
                total_rewards[i] += r
        return total_rewards

    @staticmethod
    def _compute_advantages(rewards: list[float], group_size: int) -> list[float]:
        """GRPO-style advantages: normalize each group by its mean/std.

        Rewards are chunked into consecutive groups of ``group_size``
        (all samples for one prompt). Each reward is centered on its
        group mean and divided by the group std; near-zero variance
        falls back to std=1.0 to avoid dividing by ~0.
        """
        advantages = []
        for i in range(0, len(rewards), group_size):
            group = rewards[i : i + group_size]
            mean = sum(group) / len(group)
            # Population variance of the group (len(group) >= 1 here).
            var = sum((r - mean) ** 2 for r in group) / max(len(group), 1)
            std = var**0.5 if var > 1e-8 else 1.0
            advantages.extend([(r - mean) / std for r in group])
        return advantages

    def _do_optim_step(self):
        """Send an optimizer step to the remote; returns a future."""
        import tinker.types as tt

        tc = self._get_training_client()
        return tc.optim_step(tt.AdamParams(**self._optim_params))

    def train(
        self,
        resume_from_checkpoint: Optional[str] = None,
        trial: Any = None,
        ignore_keys_for_eval: Optional[list[str]] = None,
        **kwargs,
    ) -> TrainOutput:
        """Run the full remote GRPO loop (sample → score → advantages →
        forward_backward → optim_step) and return HF-style TrainOutput.

        ``resume_from_checkpoint``/``trial``/``ignore_keys_for_eval`` are
        accepted for signature compatibility with the HF Trainer but are
        not used by the remote loop.
        """
        args = self.hatchery_args
        if args is None:
            raise RuntimeError("hatchery_args not configured")

        self._ensure_reward_functions()

        train_dataloader = self.get_train_dataloader()
        num_train_epochs = int(self.args.num_train_epochs)
        # Fallback cap of 1000 steps when max_steps is unset/non-positive.
        max_steps = self.args.max_steps if self.args.max_steps > 0 else 1000

        LOG.info(
            f"Remote RL training: max_steps={max_steps}, "
            f"loss_fn={args.loss_fn}, samples/prompt={args.num_samples}"
        )

        # Single-process loop: mark this process as rank zero for callbacks.
        self.state.max_steps = max_steps
        self.state.num_train_epochs = num_train_epochs
        self.state.is_local_process_zero = True
        self.state.is_world_process_zero = True

        self.control = self.callback_handler.on_train_begin(
            self.args,
            self.state,
            self.control,  # type: ignore[has-type]
        )

        tokenizer = self.processing_class
        global_step = 0
        total_loss = 0.0
        total_reward = 0.0
        start_time = time.time()

        for _epoch in range(num_train_epochs):
            if global_step >= max_steps:
                break

            for batch in train_dataloader:
                if global_step >= max_steps:
                    break

                self.control = self.callback_handler.on_step_begin(
                    self.args, self.state, self.control
                )

                prompt_ids_batch = batch["input_ids"]
                # Full prompt text (with gold tag) for reward scoring
                prompt_texts = tokenizer.batch_decode(
                    prompt_ids_batch, skip_special_tokens=False
                )

                # Strip <|gold|>...<|/gold|> from token ids before
                # sending to the model for sampling — the gold answer
                # must only be visible to the local reward function.
                sampling_prompts = []
                for prompt_text in prompt_texts:
                    clean = re.sub(r"<\|gold\|>.*?<\|/gold\|>", "", prompt_text)
                    clean_ids = tokenizer.encode(clean, add_special_tokens=False)
                    sampling_prompts.append(clean_ids)

                # 1. Sample completions (without gold answer)
                t0 = time.time()
                samples = self._sample_completions(sampling_prompts)
                t_sample = time.time() - t0

                if not samples:
                    LOG.warning("No samples generated, skipping step")
                    continue
                LOG.info(
                    f"Sampled {len(samples)} completions, "
                    f"avg_len={sum(len(s['completion_tokens']) for s in samples) / len(samples):.0f}tok"
                )

                # 2. Decode and score
                completion_texts = [
                    tokenizer.decode(s["completion_tokens"], skip_special_tokens=False)
                    for s in samples
                ]
                # Repeat each prompt num_samples times so prompts and
                # completions align one-to-one for the reward functions.
                sample_prompts = []
                for prompt_text in prompt_texts:
                    sample_prompts.extend([prompt_text] * args.num_samples)

                rewards = self._compute_rewards(sample_prompts, completion_texts)

                # 3. GRPO advantages
                advantages_list = self._compute_advantages(
                    rewards, group_size=args.num_samples
                )

                # 4. Build training data
                all_datums = []
                for i, sample in enumerate(samples):
                    full_tokens = sample["tokens"]
                    prompt_len = sample["prompt_len"]
                    seq_len = len(full_tokens)

                    # Labels: -100 (ignored) on the prompt, token ids on
                    # the completion.
                    input_ids = torch.tensor([full_tokens], dtype=torch.long)
                    labels = torch.full((1, seq_len), -100, dtype=torch.long)
                    labels[0, prompt_len:] = torch.tensor(full_tokens[prompt_len:])

                    # Sampler logprobs aligned to completion positions;
                    # prompt positions stay zero.
                    logprobs_t = torch.zeros(1, seq_len)
                    if sample["logprobs"]:
                        lp = sample["logprobs"][: seq_len - prompt_len]
                        logprobs_t[0, prompt_len : prompt_len + len(lp)] = torch.tensor(
                            lp
                        )

                    # One scalar advantage broadcast over the completion.
                    adv_t = torch.zeros(1, seq_len)
                    adv_t[0, prompt_len:] = advantages_list[i]

                    all_datums.extend(
                        batch_to_datums_rl(input_ids, labels, logprobs_t, adv_t)
                    )

                # 5. Forward backward (one datum at a time for memory) + optim
                t0 = time.time()
                tc = self._get_training_client()
                step_loss = 0.0
                for datum in all_datums:
                    fb_future = tc.forward_backward(
                        datums_to_tinker([datum]),
                        loss_fn=args.loss_fn,
                        loss_fn_config=args.loss_fn_config,
                    )
                    fb_result = fb_future.result(timeout=args.future_timeout)
                    # Result may be attribute-style or a plain dict.
                    if hasattr(fb_result, "metrics"):
                        step_loss += float(
                            (fb_result.metrics or {}).get("loss:sum", 0.0)
                        )
                    elif isinstance(fb_result, dict):
                        step_loss += float(
                            fb_result.get("metrics", {}).get("loss:sum", 0.0)
                        )
                optim_future = self._do_optim_step()
                # When pipelining, don't block on the optimizer future.
                if not args.pipeline:
                    optim_future.result(timeout=args.future_timeout)
                t_train = time.time() - t0

                mean_reward = sum(rewards) / len(rewards)
                # "Accuracy" here = fraction of rollouts with positive reward.
                accuracy = sum(1 for r in rewards if r > 0) / len(rewards)
                mean_adv = sum(abs(a) for a in advantages_list) / len(advantages_list)
                global_step += 1
                total_loss += step_loss
                total_reward += mean_reward
                self.state.global_step = global_step

                log_interval = self.args.logging_steps or 1
                if global_step % log_interval == 0:
                    elapsed = time.time() - start_time
                    LOG.info(
                        f"[step {global_step}/{max_steps}] "
                        f"acc={accuracy:.2f} reward={mean_reward:.3f} "
                        f"|adv|={mean_adv:.3f} loss:sum={step_loss:.1f} "
                        f"sample={t_sample:.1f}s train={t_train:.1f}s "
                        f"{elapsed / global_step:.1f}s/step"
                    )
                    self.log(
                        {
                            "loss": step_loss,
                            "reward": mean_reward,
                            "accuracy": accuracy,
                            "mean_abs_advantage": mean_adv,
                            "learning_rate": self._optim_params["learning_rate"],
                        }
                    )

                if args.save_steps and global_step % args.save_steps == 0:
                    self._save_remote_checkpoint(global_step)

                self.control = self.callback_handler.on_step_end(
                    self.args, self.state, self.control
                )
                if self.control.should_training_stop:
                    break

            if self.control.should_training_stop:
                break

        if global_step > 0:
            self._save_remote_checkpoint(global_step, name="final")

        elapsed = time.time() - start_time
        avg_loss = total_loss / max(global_step, 1)
        avg_reward = total_reward / max(global_step, 1)

        LOG.info(
            f"RL training complete: {global_step} steps, {elapsed:.1f}s, "
            f"avg_reward={avg_reward:.4f}"
        )

        self.control = self.callback_handler.on_train_end(
            self.args, self.state, self.control
        )

        return TrainOutput(
            global_step=global_step,
            training_loss=avg_loss,
            metrics={
                "train_loss": avg_loss,
                "train_reward": avg_reward,
                "train_runtime": elapsed,
            },
        )

    def _save_remote_checkpoint(self, step: int, name: Optional[str] = None):
        """Save a checkpoint on the remote service.

        Failures are logged and swallowed for intermediate saves (best
        effort), but re-raised for the "final" checkpoint.
        """
        tc = self._get_training_client()
        args = self.hatchery_args
        assert args is not None  # validated by _get_training_client
        ckpt_name = name or f"{args.save_name_prefix}-{step:06d}"
        try:
            future = tc.save_state(ckpt_name)
            future.result(timeout=args.future_timeout)
            LOG.info(f"Remote checkpoint saved: {ckpt_name}")
        except Exception:
            LOG.exception(f"Failed to save checkpoint {ckpt_name}")
            if name == "final":
                raise

    def save_model(self, output_dir=None, _internal_call=False):
        """Delegate HF-style saves to a remote checkpoint save."""
        self._save_remote_checkpoint(
            step=self.state.global_step,
            name=output_dir or "hf-save",
        )

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Loss is computed remotely; local compute_loss is unsupported."""
        raise NotImplementedError(
            "HatcheryRLTrainer uses remote API; compute_loss not called locally."
        )
|
||||
327
src/axolotl/integrations/hatchery/trainer.py
Normal file
327
src/axolotl/integrations/hatchery/trainer.py
Normal file
@@ -0,0 +1,327 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# Copyright (c) Axolotl AI
|
||||
# Licensed under the Apache License, Version 2.0
|
||||
|
||||
"""Remote trainer that dispatches to Tinker or Hatchery API."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import time
|
||||
from typing import Any, Optional
|
||||
|
||||
import torch
|
||||
from transformers.trainer_utils import TrainOutput
|
||||
|
||||
from axolotl.core.trainers.base import AxolotlTrainer
|
||||
from axolotl.utils.logging import get_logger
|
||||
|
||||
from .args import HatcheryConfig
|
||||
from .data import batch_to_datums_sft, datums_to_tinker
|
||||
|
||||
LOG = get_logger(__name__)
|
||||
|
||||
|
||||
def _extract_loss(result) -> float:
|
||||
"""Extract loss:sum from a forward_backward result.
|
||||
|
||||
Tinker's cross_entropy (and other losses) return the SUM of per-token
|
||||
losses, not the mean. This is by design — it lets users control
|
||||
normalization via the weights tensor. The trainer logs this raw sum;
|
||||
users who want per-token loss should divide by number of active tokens.
|
||||
"""
|
||||
if hasattr(result, "metrics"):
|
||||
metrics = result.metrics or {}
|
||||
return float(metrics.get("loss:sum", metrics.get("loss", 0.0)))
|
||||
if isinstance(result, dict):
|
||||
metrics = result.get("metrics", {})
|
||||
return float(metrics.get("loss:sum", metrics.get("loss", 0.0)))
|
||||
return 0.0
|
||||
|
||||
|
||||
def _create_training_client(args: HatcheryConfig, base_model: str):
    """Create a training client for either Tinker or Hatchery backend.

    Args:
        args: Hatchery plugin configuration (backend selection, API
            credentials, LoRA rank and which weight groups to train).
        base_model: Remote model identifier to attach the LoRA to.

    Returns:
        A backend-specific LoRA training client.

    Raises:
        ValueError: If the Tinker backend is selected but no API key is
            available from config or environment.
    """
    if args.backend == "tinker":
        import tinker

        # Config takes precedence over the environment variable.
        api_key = args.api_key or os.environ.get("TINKER_API_KEY")
        if not api_key:
            raise ValueError(
                "Tinker API key required. Set `hatchery.api_key` in config "
                "or TINKER_API_KEY env var."
            )
        # Export so the tinker SDK picks the key up itself.
        os.environ["TINKER_API_KEY"] = api_key

        service = tinker.ServiceClient(project_id=args.project_id)
        return service.create_lora_training_client(
            base_model=base_model,
            rank=args.lora_rank,
            train_mlp=args.train_mlp,
            train_attn=args.train_attn,
            train_unembed=args.train_unembed,
        )

    # Any other backend value falls through to Hatchery.
    from hatchery.core.client import HatcheryClient

    # Defaults target a local dev server with a dummy token.
    base_url = args.base_url or os.environ.get("HATCHERY_URL", "http://127.0.0.1:8420")
    token = args.api_key or os.environ.get("HATCHERY_API_KEY", "dev")

    client = HatcheryClient(base_url=base_url, token=token, timeout=args.future_timeout)
    return client.create_lora_training_client(
        base_model=base_model,
        rank=args.lora_rank,
        train_attn=args.train_attn,
        train_mlp=args.train_mlp,
        train_unembed=args.train_unembed,
    )
|
||||
|
||||
|
||||
class HatcheryTrainer(AxolotlTrainer):
    """Trainer that sends preprocessed batches to a remote training API.

    Replaces local forward/backward with remote API calls to Tinker or
    Hatchery. Uses axolotl's full data preprocessing pipeline (tokenization,
    chat templates, packing, etc.) but offloads compute to remote GPUs.
    """

    # Populated after construction (presumably by the hatchery plugin) —
    # __init__ only initializes them to None.
    hatchery_args: Optional[HatcheryConfig]
    _base_model_name: Optional[str]
    _training_client: Any

    def __init__(self, *args, **kwargs):
        """Initialize the trainer; hatchery-specific state starts unset."""
        super().__init__(*args, **kwargs)
        self.hatchery_args = None
        self._base_model_name = None
        self._training_client = None

    def _get_training_client(self):
        """Lazily create the remote training session."""
        if self._training_client is not None:
            return self._training_client

        args = self.hatchery_args
        if args is None:
            raise RuntimeError(
                "HatcheryTrainer.hatchery_args not set. "
                "Ensure the HatcheryPlugin is registered."
            )

        base_model = self._base_model_name
        if not base_model:
            raise RuntimeError("HatcheryTrainer._base_model_name not set.")

        self._training_client = _create_training_client(args, base_model)

        LOG.info(
            f"Remote training session created: backend={args.backend}, "
            f"model={base_model}, rank={args.lora_rank}"
        )
        return self._training_client

    def _send_batch(self, batch: dict[str, torch.Tensor]):
        """Convert batch to datums and send forward_backward to remote.

        Returns (future, n_active_tokens) where n_active_tokens counts
        the completion tokens in this batch (for loss normalization).
        """
        input_ids = batch["input_ids"]
        labels = batch["labels"]
        attention_mask = batch.get("attention_mask")

        # Count non-ignored label positions, skipping position 0 to match
        # the next-token (shifted) loss alignment.
        n_active = int((labels[:, 1:] != -100).sum().item())
        datums = batch_to_datums_sft(input_ids, labels, attention_mask)

        tc = self._get_training_client()
        args = self.hatchery_args
        assert args is not None  # validated by _get_training_client
        send_datums = datums_to_tinker(datums)

        future = tc.forward_backward(
            send_datums,
            loss_fn=args.loss_fn,
            loss_fn_config=args.loss_fn_config,
        )
        return future, n_active

    def _do_optim_step(self):
        """Send optimizer step to remote using axolotl's training params."""
        import tinker.types as tt

        tc = self._get_training_client()
        return tc.optim_step(tt.AdamParams(**self._optim_params))

    def train(
        self,
        resume_from_checkpoint: Optional[str] = None,
        trial: Any = None,
        ignore_keys_for_eval: Optional[list[str]] = None,
        **kwargs,
    ) -> TrainOutput:
        """Main training loop — sends batches to remote API."""
        args = self.hatchery_args
        if args is None:
            raise RuntimeError("hatchery_args not configured")

        train_dataloader = self.get_train_dataloader()
        num_batches = len(train_dataloader)

        grad_accum = self.args.gradient_accumulation_steps
        num_train_epochs = int(self.args.num_train_epochs)
        steps_per_epoch = max(num_batches // grad_accum, 1)
        max_steps = (
            self.args.max_steps
            if self.args.max_steps > 0
            else steps_per_epoch * num_train_epochs
        )

        LOG.info(
            f"Remote training: {num_batches} batches/epoch, "
            f"{grad_accum} grad_accum, {max_steps} max steps, "
            f"{num_train_epochs} epochs"
        )

        # Single-process loop: mark this process as rank zero for callbacks.
        self.state.max_steps = max_steps
        self.state.num_train_epochs = num_train_epochs
        self.state.is_local_process_zero = True
        self.state.is_world_process_zero = True

        self.control = self.callback_handler.on_train_begin(
            self.args,
            self.state,
            self.control,  # type: ignore[has-type]
        )

        global_step = 0
        total_loss = 0.0
        start_time = time.time()

        for _epoch in range(num_train_epochs):
            if global_step >= max_steps:
                break

            self.control = self.callback_handler.on_epoch_begin(
                self.args, self.state, self.control
            )

            # Futures accumulated until grad_accum micro-batches have
            # been dispatched, then resolved together before optim_step.
            pending_fb_futures = []
            accum_count = 0

            for batch_idx, batch in enumerate(train_dataloader):
                if global_step >= max_steps:
                    break

                self.control = self.callback_handler.on_step_begin(
                    self.args, self.state, self.control
                )

                fb_future, n_active = self._send_batch(batch)
                pending_fb_futures.append((fb_future, n_active))
                accum_count += 1

                if accum_count >= grad_accum:
                    # Resolve every pending micro-batch and sum loss/tokens.
                    step_loss_sum = 0.0
                    step_active = 0
                    for fut, n_act in pending_fb_futures:
                        result = fut.result(timeout=args.future_timeout)
                        step_loss_sum += _extract_loss(result)
                        step_active += n_act

                    optim_future = self._do_optim_step()
                    # When pipelining, don't block on the optimizer future.
                    if not args.pipeline:
                        optim_future.result(timeout=args.future_timeout)

                    # Per-token loss when we counted any active tokens.
                    step_loss = (
                        step_loss_sum / step_active
                        if step_active > 0
                        else step_loss_sum
                    )

                    global_step += 1
                    total_loss += step_loss
                    self.state.global_step = global_step
                    # Fractional epoch progress for HF-style reporting.
                    self.state.epoch = _epoch + (batch_idx + 1) / num_batches

                    log_interval = self.args.logging_steps or 1
                    if global_step % log_interval == 0:
                        elapsed = time.time() - start_time
                        avg_loss = total_loss / global_step
                        LOG.info(
                            f"[step {global_step}/{max_steps}] "
                            f"loss/tok={step_loss:.4f} avg={avg_loss:.4f} "
                            f"active={step_active} "
                            f"{elapsed / global_step:.2f}s/step"
                        )
                        self.log(
                            {
                                "loss": step_loss,
                                "learning_rate": self._optim_params["learning_rate"],
                                "epoch": self.state.epoch,
                            }
                        )

                    if args.save_steps and global_step % args.save_steps == 0:
                        self._save_remote_checkpoint(global_step)

                    self.control = self.callback_handler.on_step_end(
                        self.args, self.state, self.control
                    )

                    # Reset the accumulation window for the next step.
                    pending_fb_futures = []
                    accum_count = 0

                if self.control.should_training_stop:
                    break

            self.control = self.callback_handler.on_epoch_end(
                self.args, self.state, self.control
            )
            if self.control.should_training_stop:
                break

        if global_step > 0:
            self._save_remote_checkpoint(global_step, name="final")

        elapsed = time.time() - start_time
        avg_loss = total_loss / max(global_step, 1)

        LOG.info(
            f"Training complete: {global_step} steps, {elapsed:.1f}s total, "
            f"{elapsed / max(global_step, 1):.2f}s/step, avg_loss={avg_loss:.4f}"
        )

        self.control = self.callback_handler.on_train_end(
            self.args, self.state, self.control
        )

        return TrainOutput(
            global_step=global_step,
            training_loss=avg_loss,
            metrics={"train_loss": avg_loss, "train_runtime": elapsed},
        )

    def _save_remote_checkpoint(self, step: int, name: Optional[str] = None):
        """Save a checkpoint on the remote service.

        Failures are logged and swallowed for intermediate saves (best
        effort), but re-raised for the "final" checkpoint.
        """
        tc = self._get_training_client()
        args = self.hatchery_args
        assert args is not None  # validated by _get_training_client
        ckpt_name = name or f"{args.save_name_prefix}-{step:06d}"
        try:
            future = tc.save_state(ckpt_name)
            future.result(timeout=args.future_timeout)
            LOG.info(f"Remote checkpoint saved: {ckpt_name}")
        except Exception:
            LOG.exception(f"Failed to save checkpoint {ckpt_name}")
            if name == "final":
                raise

    def save_model(self, output_dir=None, _internal_call=False):
        """Delegate to remote checkpoint save so HF callbacks create checkpoints."""
        self._save_remote_checkpoint(
            step=self.state.global_step,
            name=output_dir or "hf-save",
        )

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Loss is computed remotely; local compute_loss is unsupported."""
        raise NotImplementedError(
            "HatcheryTrainer uses remote API; compute_loss should not be called."
        )
|
||||
@@ -146,10 +146,6 @@ Gemma 4 (e.g. `google/gemma-4-26B-A4B`) has a unique hybrid MoE architecture:
|
||||
|
||||
Because there is no SparseMoeBlock class to patch, Gemma 4 uses a different integration path: we register `"scattermoe"` as a custom implementation in the transformers `ExpertsInterface`, and set `experts_implementation: scattermoe` in the config. The `@use_experts_implementation` decorator on `Gemma4TextExperts` then dispatches to our ScatterMoE kernel automatically. The router is untouched — it runs as-is.
|
||||
|
||||
**Important limitations:**
|
||||
- **Flash Attention 2 is not supported** — Gemma 4 uses `global_head_dim: 512` for full attention layers, which exceeds FA2's maximum head dimension of 256. Use `sdp_attention: true` instead.
|
||||
- **Multimodal model**: Gemma 4 includes vision and audio encoders. For text-only SFT, use `lora_target_linear_modules` with a regex to restrict LoRA to the text backbone (e.g. `language_model\.model\.layers\.\d+\.self_attn\.(q|k|v|o)_proj`).
|
||||
|
||||
## Limitations
|
||||
|
||||
- **ScatterMoE + GLM4-MoE Lite**: ScatterMoE does not work reliably for GLM 4.7 Flash (`glm4_moe_lite`).
|
||||
|
||||
@@ -53,28 +53,6 @@ class KernelsArgs(BaseModel):
|
||||
|
||||
return data
|
||||
|
||||
    @model_validator(mode="before")
    @classmethod
    def warn_sonicmoe_lora_overhead(cls, data):
        """Log an informational note when SonicMoE is combined with LoRA
        targets that include expert weights; returns ``data`` unchanged.

        Triggers only when ``use_sonicmoe`` is exactly True, the adapter
        is lora/qlora, and any configured LoRA target contains one of the
        expert-module keywords.
        """
        if data.get("use_sonicmoe") is True and data.get("adapter") in (
            "lora",
            "qlora",
        ):
            lora_target = data.get("lora_target_modules") or []
            lora_linear = data.get("lora_target_linear_modules") or []
            # Both fields may be a single string or a list; normalize to
            # one flat list before keyword matching.
            targets = (
                lora_target if isinstance(lora_target, list) else [lora_target]
            ) + (lora_linear if isinstance(lora_linear, list) else [lora_linear])
            expert_keywords = ("gate_up_proj", "down_proj", "experts")
            if any(kw in t for t in targets for kw in expert_keywords):
                LOG.info(
                    "SonicMoE + LoRA on expert modules uses runtime weight materialization "
                    "(W_eff = W + scaling*B@A per forward). This has slightly higher overhead "
                    "than ScatterMoE's fused Triton LoRA kernels but works with any CUTLASS kernel."
                )

        return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def disable_mlp_kernel(cls, data):
|
||||
|
||||
@@ -60,49 +60,14 @@ def peft_lora_B_to_scattermoe(peft_B, num_experts, rank):
|
||||
|
||||
|
||||
def peft_lora_to_scattermoe(peft_A, peft_B, num_experts, rank):
|
||||
"""Convert peft LoRA weights to scattermoe layout (with A<->B swap).
|
||||
"""Convert peft LoRA weights to scattermoe layout.
|
||||
|
||||
peft operates on the parameter in its native storage layout ``[E, dim1, dim2]``
|
||||
where ``in_features=dim1, out_features=dim2``. ScatterMoE transposes the
|
||||
parameter (``W = param.transpose(2, 1)``) giving ``[E, dim2, dim1]`` with
|
||||
``K=dim2, N=dim1``. Because of this transposition, peft's A and B roles
|
||||
are swapped relative to scattermoe's convention.
|
||||
|
||||
peft gives:
|
||||
lora_A ``[r*E, dim1]``, lora_B ``[dim2, r*E]``
|
||||
|
||||
scattermoe needs:
|
||||
lora_A ``[r*E, K=dim2]``, lora_B ``[N=dim1, r*E]``
|
||||
|
||||
This function swaps A<->B and converts B from rank-major to expert-major.
|
||||
Uses vectorized tensor operations (no Python loop over experts).
|
||||
|
||||
Works for **both** gate_up_proj and down_proj since the transposition
|
||||
issue is the same for any parameter.
|
||||
peft >=0.19.1 assigns in/out features for 3D params such that
|
||||
A and B already align with scattermoe's convention (no A<->B swap).
|
||||
Only B needs rank-major → expert-major layout conversion.
|
||||
"""
|
||||
peft_B_em = peft_lora_B_to_scattermoe(peft_B, num_experts, rank)
|
||||
|
||||
dim1 = peft_A.shape[1] # peft in_features -> scattermoe N
|
||||
dim2 = peft_B_em.shape[0] # peft out_features -> scattermoe K
|
||||
|
||||
# smoe_A: per expert, transpose B_e [dim2, r] -> [r, dim2]
|
||||
# [dim2, E*r] -> [dim2, E, r] -> [E, r, dim2] -> [E*r, dim2]
|
||||
smoe_A = (
|
||||
peft_B_em.reshape(dim2, num_experts, rank)
|
||||
.permute(1, 2, 0)
|
||||
.contiguous()
|
||||
.reshape(rank * num_experts, dim2)
|
||||
)
|
||||
|
||||
# smoe_B: per expert, transpose A_e [r, dim1] -> [dim1, r]
|
||||
# [E*r, dim1] -> [E, r, dim1] -> [dim1, E, r] -> [dim1, E*r]
|
||||
smoe_B = (
|
||||
peft_A.reshape(num_experts, rank, dim1)
|
||||
.permute(2, 0, 1)
|
||||
.contiguous()
|
||||
.reshape(dim1, num_experts * rank)
|
||||
)
|
||||
|
||||
smoe_A = peft_A
|
||||
smoe_B = peft_lora_B_to_scattermoe(peft_B, num_experts, rank)
|
||||
return smoe_A, smoe_B
|
||||
|
||||
|
||||
|
||||
@@ -222,6 +222,56 @@ class LigerPlugin(BasePlugin):
|
||||
rms_norm=cfg.liger_rms_norm,
|
||||
swiglu=cfg.liger_glu_activation,
|
||||
)
|
||||
elif cfg.model_config_type in ("gemma4", "gemma4_text"):
|
||||
# Gemma4: offset=0 (NOT 1 like Gemma3), in_place=False required for
|
||||
# gradient checkpointing compatibility, RoPE incompatible (separate q/k).
|
||||
from liger_kernel.transformers.geglu import LigerGEGLUMLP
|
||||
from transformers.models.gemma4 import modeling_gemma4
|
||||
|
||||
if cfg.liger_rms_norm:
|
||||
_OrigGemma4RMSNorm = modeling_gemma4.Gemma4RMSNorm
|
||||
|
||||
class _LigerGemma4RMSNorm(LigerRMSNorm):
|
||||
"""LigerRMSNorm for Gemma4 with in_place=False and with_scale support."""
|
||||
|
||||
def __new__(cls, dim, eps=1e-6, with_scale=True):
|
||||
if not with_scale:
|
||||
return _OrigGemma4RMSNorm(dim, eps, with_scale=False)
|
||||
return super().__new__(cls)
|
||||
|
||||
def __init__(self, dim, eps=1e-6, with_scale=True):
|
||||
if not with_scale:
|
||||
return
|
||||
# offset=0.0 (standard), in_place=False (gradient checkpointing safe)
|
||||
super().__init__(
|
||||
dim, eps, offset=0.0, casting_mode="llama", in_place=False
|
||||
)
|
||||
|
||||
modeling_gemma4.Gemma4RMSNorm = _LigerGemma4RMSNorm
|
||||
if cfg.liger_glu_activation:
|
||||
|
||||
class _LigerGemma4MLP(LigerGEGLUMLP):
|
||||
def __init__(self, config, layer_idx=None):
|
||||
super().__init__(config)
|
||||
|
||||
modeling_gemma4.Gemma4TextMLP = _LigerGemma4MLP
|
||||
if cfg.liger_rope:
|
||||
LOG.warning(
|
||||
"Liger RoPE is not compatible with Gemma4 (separate q/k application). Skipping."
|
||||
)
|
||||
if cfg.liger_layer_norm:
|
||||
modeling_gemma4.nn.LayerNorm = LigerLayerNorm
|
||||
if cfg.liger_cross_entropy:
|
||||
modeling_gemma4.nn.CrossEntropyLoss = LigerCrossEntropyLoss
|
||||
if cfg.liger_fused_linear_cross_entropy:
|
||||
LOG.warning(
|
||||
"Liger fused linear cross entropy is not compatible with Gemma4. Skipping."
|
||||
)
|
||||
LOG.info(
|
||||
f"Applied Liger kernels for gemma4: "
|
||||
f"rms_norm={cfg.liger_rms_norm}, glu={cfg.liger_glu_activation}, "
|
||||
f"rope=False (incompatible), layer_norm={cfg.liger_layer_norm}"
|
||||
)
|
||||
elif cfg.liger_fused_linear_cross_entropy:
|
||||
try:
|
||||
from .models.base import patch_lce_forward
|
||||
|
||||
@@ -110,11 +110,36 @@ class NemoGymDataProducer(GRPODataProducer):
|
||||
item["agent_ref"] = full_item["agent_ref"]
|
||||
dataset_items.append(item)
|
||||
|
||||
# Expand by num_generations (agent produces one rollout per call)
|
||||
expanded_items = []
|
||||
for item in dataset_items:
|
||||
for _ in range(self._num_generations):
|
||||
expanded_items.append(item)
|
||||
# NOTE: do NOT re-expand by num_generations here.
|
||||
# ``RepeatSampler(mini_repeat_count=num_generations)`` already
|
||||
# yields ``num_generations`` consecutive copies of each unique
|
||||
# prompt, so ``inputs`` is a list of ``(unique_prompts_per_rank *
|
||||
# num_generations)`` items — one entry per rollout. Expanding
|
||||
# again here would fire ``num_generations^2`` rollouts per
|
||||
# prompt per rank and make every step dogpile on a handful of
|
||||
# tasks.
|
||||
expanded_items = dataset_items
|
||||
|
||||
# Diagnostic: log what this rank is about to fire.
|
||||
try:
|
||||
import collections
|
||||
|
||||
iid_counts: collections.Counter[str | None] = collections.Counter()
|
||||
for it in dataset_items:
|
||||
iid_counts[
|
||||
(it.get("responses_create_params", {}).get("metadata") or {}).get(
|
||||
"instance_id"
|
||||
)
|
||||
] += 1
|
||||
LOG.info(
|
||||
"[RANK:%d] produce(): firing %d agent /run calls covering %d unique prompts: %s",
|
||||
trainer.accelerator.process_index,
|
||||
len(dataset_items),
|
||||
len(iid_counts),
|
||||
list(iid_counts.most_common(5)),
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Call NeMo Gym agents
|
||||
loop = asyncio.new_event_loop()
|
||||
@@ -140,6 +165,7 @@ class NemoGymDataProducer(GRPODataProducer):
|
||||
logprobs_list = []
|
||||
rewards_list = []
|
||||
|
||||
num_turns_list: list[int] = []
|
||||
for resp in responses:
|
||||
parsed = _parse_agent_response(resp, eos_token_id)
|
||||
prompt_ids_list.append(parsed["prompt_ids"])
|
||||
@@ -147,6 +173,7 @@ class NemoGymDataProducer(GRPODataProducer):
|
||||
env_mask_list.append(parsed["env_mask"])
|
||||
logprobs_list.append(parsed["logprobs"])
|
||||
rewards_list.append(parsed["reward"])
|
||||
num_turns_list.append(parsed.get("num_turns", 0))
|
||||
|
||||
# Pad to tensors
|
||||
prompt_ids = [torch.tensor(ids, device=device) for ids in prompt_ids_list]
|
||||
@@ -179,22 +206,48 @@ class NemoGymDataProducer(GRPODataProducer):
|
||||
tool_mask = [torch.tensor(m, device=device) for m in env_mask_list]
|
||||
tool_mask = pad(tool_mask, padding_value=1, padding_side="right")
|
||||
|
||||
# Inject rewards into inputs so _compute_deferred_scores can use them
|
||||
# The deferred scoring path calls _calculate_rewards which reads reward_funcs.
|
||||
# Our passthrough reward_fn reads "env_reward" from kwargs.
|
||||
# Inject per-rollout reward + num_turns into each input. Since
|
||||
# ``RepeatSampler`` already yields ``num_generations`` copies of
|
||||
# each prompt, ``inputs`` has ONE entry per rollout (matching
|
||||
# ``rewards_list`` 1:1). No per-prompt grouping happens here —
|
||||
# GRPO advantage normalization is the trainer's job downstream.
|
||||
assert len(inputs) == len(rewards_list), (
|
||||
f"rewards/inputs length mismatch: "
|
||||
f"{len(rewards_list)} rewards vs {len(inputs)} inputs"
|
||||
)
|
||||
for i, inp in enumerate(inputs):
|
||||
# Each input gets rewards for its num_generations rollouts
|
||||
start = i * self._num_generations
|
||||
end = start + self._num_generations
|
||||
inp["env_reward"] = rewards_list[start:end]
|
||||
inp["env_reward"] = rewards_list[i]
|
||||
inp["num_turns"] = num_turns_list[i]
|
||||
|
||||
# Expand inputs to match expanded rollouts (num_generations copies)
|
||||
expanded_inputs = []
|
||||
for inp in inputs:
|
||||
for g in range(self._num_generations):
|
||||
expanded_inp = dict(inp)
|
||||
expanded_inp["env_reward"] = inp["env_reward"][g]
|
||||
expanded_inputs.append(expanded_inp)
|
||||
# One expanded_input per rollout (already correct count because
|
||||
# inputs has num_generations copies baked in by the sampler).
|
||||
expanded_inputs = [dict(inp) for inp in inputs]
|
||||
|
||||
# Log rollout-level stats to wandb from rank 0. These are the
|
||||
# true agent-side metrics (not the tokenized TRL view) — so
|
||||
# num_turns reflects how many /run iterations each rollout
|
||||
# actually took before finishing or hitting max_turns.
|
||||
if is_main and num_turns_list:
|
||||
try:
|
||||
import wandb
|
||||
|
||||
if wandb.run is not None:
|
||||
import statistics as _stats
|
||||
|
||||
nonzero = sum(1 for r in rewards_list if r > 0)
|
||||
log_payload = {
|
||||
"rollout/num_turns/mean": float(_stats.mean(num_turns_list)),
|
||||
"rollout/num_turns/min": float(min(num_turns_list)),
|
||||
"rollout/num_turns/max": float(max(num_turns_list)),
|
||||
"rollout/reward/mean": float(_stats.mean(rewards_list)),
|
||||
"rollout/reward/nonzero_frac": (
|
||||
nonzero / len(rewards_list) if rewards_list else 0.0
|
||||
),
|
||||
"rollout/n_samples": float(len(rewards_list)),
|
||||
}
|
||||
wandb.log(log_payload, commit=False)
|
||||
except Exception as exc: # never let metric logging break training
|
||||
LOG.warning("rollout wandb log failed: %s", exc)
|
||||
|
||||
# Decode completions for reward functions
|
||||
completions = trainer.processing_class.batch_decode(
|
||||
|
||||
@@ -19,6 +19,7 @@ Supports two modes:
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, Union
|
||||
|
||||
from axolotl.integrations.base import BasePlugin
|
||||
@@ -30,6 +31,107 @@ if TYPE_CHECKING:
|
||||
LOG = get_logger(__name__)
|
||||
|
||||
|
||||
# ---- vLLM weight-sync transport probe ------------------------------------
|
||||
|
||||
|
||||
@dataclass
|
||||
class VLLMWeightSyncCapabilities:
|
||||
"""What weight-sync routes a vLLM server actually exposes.
|
||||
|
||||
Discovered once at ``pre_model_load`` time by fetching the server's
|
||||
``/openapi.json``. Drives the transport-selection table below.
|
||||
"""
|
||||
|
||||
nccl: bool = False # /init_communicator/ + /update_named_param/
|
||||
lora_filesystem: bool = False # /v1/load_lora_adapter (vLLM native)
|
||||
lora_axolotl: bool = False # /set_lora_adapter/ (axolotl serve_lora extension)
|
||||
http_full: bool = False # /http_update_weights/ (axolotl serve_lora extension)
|
||||
probed: bool = False
|
||||
probe_error: str | None = None
|
||||
routes: list[str] = field(default_factory=list)
|
||||
|
||||
@property
|
||||
def any_full_param_sync(self) -> bool:
|
||||
"""True if at least one transport can push full-model weights."""
|
||||
return self.nccl or self.http_full
|
||||
|
||||
@property
|
||||
def any_lora_sync(self) -> bool:
|
||||
"""True if at least one transport can push LoRA adapters."""
|
||||
return self.lora_filesystem or self.lora_axolotl or self.nccl
|
||||
|
||||
|
||||
def probe_vllm_weight_sync(
|
||||
base_url: str, timeout: float = 5.0
|
||||
) -> VLLMWeightSyncCapabilities:
|
||||
"""Detect which weight-sync routes the configured vLLM server exposes.
|
||||
|
||||
Uses the server's FastAPI ``/openapi.json`` — every weight-sync transport
|
||||
we care about is mounted as a POST route there. Falls back to all-False
|
||||
on any error so the caller can still decide what to do (typically: raise
|
||||
a clear error rather than silently no-op).
|
||||
"""
|
||||
import requests
|
||||
|
||||
caps = VLLMWeightSyncCapabilities()
|
||||
try:
|
||||
r = requests.get(f"{base_url.rstrip('/')}/openapi.json", timeout=timeout)
|
||||
r.raise_for_status()
|
||||
spec = r.json()
|
||||
routes = sorted((spec.get("paths") or {}).keys())
|
||||
caps.routes = routes
|
||||
caps.nccl = "/init_communicator/" in routes and "/update_named_param/" in routes
|
||||
caps.lora_filesystem = "/v1/load_lora_adapter" in routes
|
||||
caps.lora_axolotl = "/set_lora_adapter/" in routes
|
||||
caps.http_full = "/http_update_weights/" in routes
|
||||
caps.probed = True
|
||||
except Exception as exc:
|
||||
caps.probe_error = f"{type(exc).__name__}: {exc}"
|
||||
LOG.warning(
|
||||
"NeMo Gym: failed to probe vLLM /openapi.json at %s — %s. "
|
||||
"Will fall back to LoRA-only behavior.",
|
||||
base_url,
|
||||
caps.probe_error,
|
||||
)
|
||||
return caps
|
||||
|
||||
|
||||
def select_weight_sync_transport(
|
||||
caps: VLLMWeightSyncCapabilities,
|
||||
*,
|
||||
has_lora: bool,
|
||||
vllm_lora_sync_pref: bool,
|
||||
) -> str:
|
||||
"""Pick the right transport for a (server caps, model type) combo.
|
||||
|
||||
Returns one of: ``"lora_filesystem"``, ``"nccl"``, ``"http_full"``, or
|
||||
``"none"``. The caller decides what to do with ``"none"`` (typically:
|
||||
raise an error explaining the misconfiguration).
|
||||
|
||||
Selection table:
|
||||
LoRA model + lora endpoint + lora-sync pref → lora_filesystem
|
||||
LoRA model + lora endpoint → lora_filesystem
|
||||
LoRA model + nccl endpoint → nccl (broadcast merged adapter)
|
||||
Full model + nccl endpoint → nccl
|
||||
Full model + http endpoint → http_full
|
||||
anything else → none
|
||||
"""
|
||||
if has_lora:
|
||||
if (caps.lora_filesystem or caps.lora_axolotl) and vllm_lora_sync_pref:
|
||||
return "lora_filesystem"
|
||||
if caps.lora_filesystem or caps.lora_axolotl:
|
||||
return "lora_filesystem"
|
||||
if caps.nccl:
|
||||
return "nccl"
|
||||
return "none"
|
||||
# Full-parameter model
|
||||
if caps.nccl:
|
||||
return "nccl"
|
||||
if caps.http_full:
|
||||
return "http_full"
|
||||
return "none"
|
||||
|
||||
|
||||
class NemoGymPlugin(BasePlugin):
|
||||
"""Plugin for NVIDIA NeMo Gym integration with Axolotl.
|
||||
|
||||
@@ -50,37 +152,69 @@ class NemoGymPlugin(BasePlugin):
|
||||
self._reward_fn = None
|
||||
self._dataset_lookup = None
|
||||
self._agent_servers = {}
|
||||
self._vllm_caps: VLLMWeightSyncCapabilities | None = None
|
||||
|
||||
def get_input_args(self):
|
||||
return "axolotl.integrations.nemo_gym.NemoGymArgs"
|
||||
|
||||
def pre_model_load(self, cfg):
|
||||
"""Apply monkeypatches before trainer creation."""
|
||||
"""Probe vLLM weight-sync routes and conditionally bypass NCCL init.
|
||||
|
||||
Replaces the previous unconditional ``init_communicator`` monkey-patch
|
||||
with a probe of the configured vLLM server's ``/openapi.json``. We only
|
||||
bypass NCCL init when the server we're talking to actually lacks the
|
||||
``/init_communicator/`` route (i.e. stock ``vllm serve``); against
|
||||
TRL/axolotl serve modules that DO expose NCCL routes, we leave the
|
||||
standard TRL flow alone so full-finetune training can sync weights.
|
||||
"""
|
||||
if not cfg.nemo_gym_enabled:
|
||||
return
|
||||
|
||||
# Always skip NCCL communicator init in NeMo Gym mode.
|
||||
# NeMo Gym uses its own vLLM server (standard OpenAI API), not the TRL
|
||||
# colocate/NCCL path. The NCCL init fails with vLLM V1 and standard servers.
|
||||
trl_cfg = getattr(cfg, "trl", None)
|
||||
if trl_cfg and getattr(trl_cfg, "vllm_mode", "server") == "server":
|
||||
if not (trl_cfg and getattr(trl_cfg, "vllm_mode", "server") == "server"):
|
||||
return
|
||||
|
||||
host = getattr(trl_cfg, "vllm_server_host", None) or "127.0.0.1"
|
||||
port = getattr(trl_cfg, "vllm_server_port", None) or 8000
|
||||
base_url = f"http://{host}:{port}"
|
||||
self._vllm_caps = probe_vllm_weight_sync(base_url)
|
||||
|
||||
if self._vllm_caps.probed:
|
||||
LOG.info(
|
||||
"NeMo Gym: vLLM weight-sync probe @ %s — nccl=%s lora_native=%s "
|
||||
"lora_axolotl=%s http_full=%s",
|
||||
base_url,
|
||||
self._vllm_caps.nccl,
|
||||
self._vllm_caps.lora_filesystem,
|
||||
self._vllm_caps.lora_axolotl,
|
||||
self._vllm_caps.http_full,
|
||||
)
|
||||
|
||||
# Only bypass NCCL init when the server doesn't speak it. If NCCL is
|
||||
# available we leave VLLMClient.init_communicator alone so the
|
||||
# standard TRL sync flow can run for full-parameter training.
|
||||
if not self._vllm_caps.nccl:
|
||||
self._patch_skip_nccl_init()
|
||||
|
||||
def _patch_skip_nccl_init(self):
|
||||
"""Monkeypatch VLLMClient.init_communicator to no-op.
|
||||
|
||||
NeMo Gym uses its own vLLM server (standard OpenAI API or custom LoRA
|
||||
serve script). The NCCL communicator is not needed and fails with both
|
||||
vLLM V1 engine and standard OpenAI server mode.
|
||||
Only called when the configured vLLM server doesn't expose
|
||||
``/init_communicator/`` (e.g. stock ``vllm serve``). In that case
|
||||
TRL's standard ``init_communicator`` would 404 inside trainer
|
||||
construction; we no-op it so the LoRA filesystem path can install
|
||||
its own sync in ``post_trainer_create``.
|
||||
"""
|
||||
try:
|
||||
from trl.generation.vllm_client import VLLMClient
|
||||
|
||||
VLLMClient._original_init_communicator = VLLMClient.init_communicator
|
||||
VLLMClient.init_communicator = lambda self, **kwargs: LOG.info(
|
||||
"Skipping NCCL init_communicator (LoRA sync mode)"
|
||||
"Skipping NCCL init_communicator (server has no /init_communicator/)"
|
||||
)
|
||||
LOG.info(
|
||||
"Patched VLLMClient.init_communicator to no-op (server has no NCCL routes)"
|
||||
)
|
||||
LOG.info("Patched VLLMClient.init_communicator to no-op for LoRA sync")
|
||||
except Exception as exc:
|
||||
LOG.warning(f"Failed to patch VLLMClient: {exc}")
|
||||
|
||||
@@ -234,30 +368,80 @@ class NemoGymPlugin(BasePlugin):
|
||||
verify_timeout = cfg.nemo_gym_verify_timeout or 30
|
||||
multi_turn = cfg.nemo_gym_multi_turn or False
|
||||
|
||||
# Handle weight sync. NeMo Gym skips NCCL init, so we need to either:
|
||||
# - Install LoRA sync (when vllm_lora_sync=True)
|
||||
# - Or no-op sync_weights (when using standard vLLM server)
|
||||
# Pick a weight-sync transport based on what the configured vLLM
|
||||
# server actually exposes (see ``pre_model_load`` probe) and what
|
||||
# kind of model we're training. The selection table is documented
|
||||
# in ``select_weight_sync_transport``.
|
||||
trl_cfg = getattr(cfg, "trl", None)
|
||||
if hasattr(trainer, "vllm_generation") and trainer.vllm_generation:
|
||||
vllm_gen = trainer.vllm_generation
|
||||
if trl_cfg and getattr(trl_cfg, "vllm_lora_sync", False):
|
||||
adapter = getattr(cfg, "adapter", None)
|
||||
has_lora = adapter in ("lora", "qlora")
|
||||
vllm_lora_sync_pref = bool(
|
||||
trl_cfg and getattr(trl_cfg, "vllm_lora_sync", False)
|
||||
)
|
||||
caps = self._vllm_caps or VLLMWeightSyncCapabilities()
|
||||
transport = select_weight_sync_transport(
|
||||
caps,
|
||||
has_lora=has_lora,
|
||||
vllm_lora_sync_pref=vllm_lora_sync_pref,
|
||||
)
|
||||
|
||||
if transport == "lora_filesystem":
|
||||
self._setup_lora_sync(trainer)
|
||||
# Verify the vLLM server supports runtime LoRA loading
|
||||
self._check_lora_endpoint(vllm_gen)
|
||||
else:
|
||||
# No NCCL, no LoRA sync — skip all weight sync paths
|
||||
vllm_gen.sync_weights = lambda: LOG.debug(
|
||||
"Weight sync skipped (NeMo Gym mode)"
|
||||
LOG.info("NeMo Gym weight sync: LoRA filesystem")
|
||||
elif transport == "nccl":
|
||||
# Standard TRL NCCL path. We leave ``VLLMClient.init_communicator``
|
||||
# alone (pre_model_load only patched it when the probe found no
|
||||
# NCCL route) so the trainer's normal weight-sync flow runs.
|
||||
LOG.info(
|
||||
"NeMo Gym weight sync: NCCL (server exposes /init_communicator/)"
|
||||
)
|
||||
type(vllm_gen).sync_weights = lambda self: LOG.debug(
|
||||
"Weight sync skipped (NeMo Gym mode)"
|
||||
elif transport == "http_full":
|
||||
# Full-parameter HTTP sync — implementation lands in step 3.
|
||||
# For now, fail loudly so users know the path is detected but
|
||||
# not yet wired up, instead of silently no-oping like before.
|
||||
raise NotImplementedError(
|
||||
"NeMo Gym + full fine-tune + HTTP weight sync is detected "
|
||||
"but the client-side sync helper is not yet implemented "
|
||||
"(planned). Use `adapter: lora|qlora` for now, or use a "
|
||||
"vLLM serve module that exposes /init_communicator/ for "
|
||||
"NCCL sync."
|
||||
)
|
||||
# Also patch the async trainer's internal sync method
|
||||
if hasattr(trainer, "_maybe_sync_vllm_weights"):
|
||||
trainer._maybe_sync_vllm_weights = lambda: LOG.debug(
|
||||
"Async weight sync skipped (NeMo Gym mode)"
|
||||
else: # transport == "none"
|
||||
# No viable sync path. Build a precise error so the user knows
|
||||
# exactly what's missing and how to fix it.
|
||||
if not caps.probed:
|
||||
msg = (
|
||||
"could not probe the vLLM server's "
|
||||
f"/openapi.json: {caps.probe_error}. "
|
||||
"Verify that vLLM is reachable at "
|
||||
f"{getattr(trl_cfg, 'vllm_server_host', '?')}:"
|
||||
f"{getattr(trl_cfg, 'vllm_server_port', '?')}."
|
||||
)
|
||||
LOG.info("Disabled weight sync (NeMo Gym mode, no LoRA sync)")
|
||||
elif has_lora:
|
||||
msg = (
|
||||
"the vLLM server has neither NCCL routes "
|
||||
"(/init_communicator/) nor a LoRA-loading route "
|
||||
"(/v1/load_lora_adapter or /set_lora_adapter/). "
|
||||
"Restart vLLM with `--enable-lora --max-lora-rank N "
|
||||
"VLLM_ALLOW_RUNTIME_LORA_UPDATING=1` for the stock "
|
||||
"server, or use `axolotl vllm-serve` for the "
|
||||
"NCCL-capable serve module."
|
||||
)
|
||||
else:
|
||||
msg = (
|
||||
"the vLLM server exposes no full-parameter sync route "
|
||||
"(/init_communicator/ for NCCL or /http_update_weights/ "
|
||||
"for HTTP). Use `axolotl vllm-serve` (which has both) "
|
||||
"or set `adapter: lora|qlora`."
|
||||
)
|
||||
raise ValueError(
|
||||
f"NeMo Gym: no usable weight-sync transport — {msg} Without "
|
||||
"weight sync the trainer's gradient updates never reach the "
|
||||
"rollout policy (functionally a no-op trainer)."
|
||||
)
|
||||
|
||||
if multi_turn:
|
||||
self._wire_multi_turn(cfg, trainer, model_name, verify_timeout)
|
||||
|
||||
@@ -130,21 +130,41 @@ def start_servers(
|
||||
)
|
||||
|
||||
|
||||
def get_server_configs(head_port: int = 11000) -> dict:
|
||||
def get_server_configs(head_port: int = 11000, timeout: float = 30.0) -> dict:
|
||||
"""Fetch the global config from the NeMo Gym head server.
|
||||
|
||||
Retries up to 3 times with exponential backoff. The default per-attempt
|
||||
timeout is 30s (raised from the original 5s) because head servers can
|
||||
be slow to respond when they're concurrently serving rollouts from a
|
||||
prior training run. A 5s timeout was empirically too tight to survive
|
||||
a kill-and-relaunch cycle.
|
||||
|
||||
Returns:
|
||||
Dict mapping server_name -> server config.
|
||||
"""
|
||||
response = requests.get(
|
||||
f"http://127.0.0.1:{head_port}/global_config_dict_yaml", timeout=5
|
||||
url = f"http://127.0.0.1:{head_port}/global_config_dict_yaml"
|
||||
last_exc: Exception | None = None
|
||||
for attempt in (1, 2, 3):
|
||||
try:
|
||||
response = requests.get(url, timeout=timeout)
|
||||
response.raise_for_status()
|
||||
result = yaml.safe_load(response.text)
|
||||
# NeMo Gym head server double-encodes: YAML string inside a YAML string
|
||||
if isinstance(result, str):
|
||||
result = yaml.safe_load(result)
|
||||
return result
|
||||
except (requests.exceptions.RequestException, OSError) as exc:
|
||||
last_exc = exc
|
||||
LOG.warning(
|
||||
"NeMo Gym head probe attempt %d/3 failed: %s. Retrying...",
|
||||
attempt,
|
||||
type(exc).__name__,
|
||||
)
|
||||
if attempt < 3:
|
||||
time.sleep(2.0 * attempt)
|
||||
raise RuntimeError(
|
||||
f"NeMo Gym head server at {url} did not respond after 3 attempts: {last_exc}"
|
||||
)
|
||||
response.raise_for_status()
|
||||
result = yaml.safe_load(response.text)
|
||||
# NeMo Gym head server double-encodes: YAML string inside a YAML string
|
||||
if isinstance(result, str):
|
||||
result = yaml.safe_load(result)
|
||||
return result
|
||||
|
||||
|
||||
def get_agent_servers(
|
||||
|
||||
593
src/axolotl/kernels/gemma4_fused_rope.py
Normal file
593
src/axolotl/kernels/gemma4_fused_rope.py
Normal file
@@ -0,0 +1,593 @@
|
||||
"""
|
||||
Fused RMSNorm + RoPE Triton kernel for Gemma 4.
|
||||
|
||||
Fuses three operations into one kernel launch:
|
||||
1. RMSNorm: x_norm = (x / sqrt(mean(x^2) + eps)) * weight
|
||||
2. RoPE: y = x_norm * cos + rotate_half(x_norm) * sin
|
||||
3. (optional) RMSNorm without scale (for v_norm)
|
||||
|
||||
This eliminates two intermediate tensor materializations per Q/K path;
|
||||
churn from rotate_half / apply_rotary_pos_emb.
|
||||
|
||||
Shapes:
|
||||
X: (rows, head_dim) — flattened from (batch, seq_len, num_heads, head_dim)
|
||||
W: (head_dim,) — RMSNorm weight (None for with_scale=False)
|
||||
cos: (rows, head_dim) — flattened from (batch, seq_len, 1, head_dim) after broadcast
|
||||
sin: (rows, head_dim) — same as cos
|
||||
"""
|
||||
|
||||
import math
|
||||
import operator
|
||||
|
||||
import torch
|
||||
import triton
|
||||
import triton.language as tl
|
||||
from liger_kernel.ops.utils import (
|
||||
calculate_settings,
|
||||
compare_version,
|
||||
ensure_contiguous,
|
||||
torch_to_triton_dtype,
|
||||
)
|
||||
from liger_kernel.utils import is_npu_available
|
||||
|
||||
if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
|
||||
try:
|
||||
from triton.language.extra.libdevice import rsqrt
|
||||
except ModuleNotFoundError:
|
||||
from triton.language.extra.cuda.libdevice import rsqrt
|
||||
else:
|
||||
from triton.language.math import rsqrt
|
||||
|
||||
|
||||
@triton.jit
|
||||
def _rms_norm_rope_forward_kernel(
|
||||
Y_ptr,
|
||||
Y_row_stride,
|
||||
X_ptr,
|
||||
X_row_stride,
|
||||
W_ptr,
|
||||
COS_ptr,
|
||||
COS_row_stride,
|
||||
SIN_ptr,
|
||||
SIN_row_stride,
|
||||
RSTD_ptr,
|
||||
RSTD_row_stride,
|
||||
n_cols,
|
||||
n_rot,
|
||||
n_heads,
|
||||
eps,
|
||||
HAS_WEIGHT: tl.constexpr,
|
||||
BLOCK_SIZE: tl.constexpr,
|
||||
):
|
||||
"""
|
||||
Fused forward:
|
||||
x_norm = x / rms(x) [* weight] (RMSNorm, full n_cols)
|
||||
y[..., :n_rot] = rope(x_norm[..., :n_rot])
|
||||
y[..., n_rot:] = x_norm[..., n_rot:] (pass-through for partial rotary)
|
||||
|
||||
rotate_half swaps first/second halves and negates the first, restricted
|
||||
to the rotary span [0, n_rot):
|
||||
rotate_half([a, b]) = [-b, a] where len(a) = len(b) = n_rot/2
|
||||
|
||||
For the partial-rotary pass-through region we load cos with default 1.0
|
||||
and sin with default 0.0 outside [0, n_rot), so the same formula
|
||||
`Y = X_norm * cos + X_rot_norm * sin` collapses to `Y = X_norm`.
|
||||
|
||||
cos/sin are indexed by row_idx // n_heads to handle per-head broadcast
|
||||
(cos/sin have shape (B*S, n_rot) while X has shape (B*S*H, n_cols)).
|
||||
"""
|
||||
row_idx = tl.program_id(0).to(tl.int64)
|
||||
# cos/sin row: divide by n_heads since cos/sin are (B*S, n_rot)
|
||||
cs_row_idx = row_idx // n_heads
|
||||
col_offsets = tl.arange(0, BLOCK_SIZE)
|
||||
mask = col_offsets < n_cols
|
||||
rot_mask_col = col_offsets < n_rot
|
||||
half_rot = n_rot // 2
|
||||
|
||||
# Load input row
|
||||
X_row = tl.load(X_ptr + row_idx * X_row_stride + col_offsets, mask=mask, other=0)
|
||||
X_dtype = X_row.dtype
|
||||
X_fp32 = X_row.to(tl.float32)
|
||||
|
||||
# RMSNorm: compute 1/rms over the full row (rotary + pass-through)
|
||||
mean_sq = tl.sum(X_fp32 * X_fp32, axis=0) / n_cols
|
||||
rstd = rsqrt(mean_sq + eps)
|
||||
tl.store(RSTD_ptr + row_idx * RSTD_row_stride, rstd)
|
||||
|
||||
# Normalize
|
||||
X_norm = X_fp32 * rstd
|
||||
|
||||
# Apply weight if present (with_scale=True)
|
||||
if HAS_WEIGHT:
|
||||
W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0).to(tl.float32)
|
||||
X_norm = X_norm * W_row
|
||||
|
||||
# RoPE: load cos/sin (broadcast across heads). For col >= n_rot we get
|
||||
# cos=1, sin=0 so the formula leaves X_norm untouched.
|
||||
cos_row = tl.load(
|
||||
COS_ptr + cs_row_idx * COS_row_stride + col_offsets,
|
||||
mask=rot_mask_col,
|
||||
other=1.0,
|
||||
).to(tl.float32)
|
||||
sin_row = tl.load(
|
||||
SIN_ptr + cs_row_idx * SIN_row_stride + col_offsets,
|
||||
mask=rot_mask_col,
|
||||
other=0.0,
|
||||
).to(tl.float32)
|
||||
|
||||
# rotate_half within [0, n_rot):
|
||||
# for col < half_rot: take -X_norm[col + half_rot]
|
||||
# for col in [half_rot, n_rot): take X_norm[col - half_rot]
|
||||
# For col >= n_rot the rotation is irrelevant (sin = 0 zeros it out).
|
||||
rot_offsets = tl.where(
|
||||
col_offsets < half_rot, col_offsets + half_rot, col_offsets - half_rot
|
||||
)
|
||||
rot_load_mask = (rot_offsets < n_cols) & rot_mask_col
|
||||
X_rot = tl.load(
|
||||
X_ptr + row_idx * X_row_stride + rot_offsets, mask=rot_load_mask, other=0
|
||||
).to(tl.float32)
|
||||
# Re-normalize the rotated values
|
||||
X_rot_norm = X_rot * rstd
|
||||
if HAS_WEIGHT:
|
||||
W_rot = tl.load(W_ptr + rot_offsets, mask=rot_load_mask, other=0).to(tl.float32)
|
||||
X_rot_norm = X_rot_norm * W_rot
|
||||
|
||||
# Negate the first half (rotate_half negates x2, which becomes the first half)
|
||||
sign = tl.where(col_offsets < half_rot, -1.0, 1.0)
|
||||
X_rot_norm = X_rot_norm * sign
|
||||
|
||||
# Final RoPE: y = x_norm * cos + rotate_half(x_norm) * sin
|
||||
Y_row = X_norm * cos_row + X_rot_norm * sin_row
|
||||
|
||||
tl.store(
|
||||
Y_ptr + row_idx * Y_row_stride + col_offsets,
|
||||
Y_row.to(X_dtype),
|
||||
mask=mask,
|
||||
)
|
||||
|
||||
|
||||
@triton.jit
|
||||
def _rms_norm_rope_backward_kernel(
|
||||
dY_ptr,
|
||||
dY_row_stride,
|
||||
dX_ptr,
|
||||
dX_row_stride,
|
||||
X_ptr,
|
||||
X_row_stride,
|
||||
X_dtype: tl.constexpr,
|
||||
W_ptr,
|
||||
COS_ptr,
|
||||
COS_row_stride,
|
||||
SIN_ptr,
|
||||
SIN_row_stride,
|
||||
RSTD_ptr,
|
||||
RSTD_row_stride,
|
||||
dW_ptr,
|
||||
dW_row_stride,
|
||||
n_rows,
|
||||
n_cols,
|
||||
n_rot,
|
||||
n_heads,
|
||||
rows_per_program,
|
||||
HAS_WEIGHT: tl.constexpr,
|
||||
BLOCK_SIZE: tl.constexpr,
|
||||
):
|
||||
"""
|
||||
Backward for Y = RoPE(RMSNorm(X, W)) with optional partial rotary
|
||||
(`n_rot <= n_cols`).
|
||||
|
||||
For col < n_rot the standard RoPE adjoint applies. For col >= n_rot the
|
||||
output is just the normalized row, so dN[col] = dY[col] (achieved by
|
||||
loading cos with default 1.0 and forcing the rotate-half contribution
|
||||
to zero outside the rotary span).
|
||||
|
||||
cos/sin indexed by row_idx // n_heads for per-head broadcast.
|
||||
"""
|
||||
row_block_id = tl.program_id(0).to(tl.int64)
|
||||
row_start = row_block_id * rows_per_program
|
||||
row_end = min((row_block_id + 1) * rows_per_program, n_rows)
|
||||
col_offsets = tl.arange(0, BLOCK_SIZE)
|
||||
mask = col_offsets < n_cols
|
||||
rot_mask_col = col_offsets < n_rot
|
||||
half_rot = n_rot // 2
|
||||
|
||||
dW_acc = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
|
||||
|
||||
if HAS_WEIGHT:
|
||||
W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0).to(tl.float32)
|
||||
|
||||
for row_idx in range(row_start, row_end):
|
||||
cs_row_idx = row_idx // n_heads
|
||||
|
||||
dY_row = tl.load(
|
||||
dY_ptr + row_idx * dY_row_stride + col_offsets, mask=mask, other=0
|
||||
).to(tl.float32)
|
||||
X_row = tl.load(
|
||||
X_ptr + row_idx * X_row_stride + col_offsets, mask=mask, other=0
|
||||
).to(tl.float32)
|
||||
rstd = tl.load(RSTD_ptr + row_idx * RSTD_row_stride)
|
||||
|
||||
cos_row = tl.load(
|
||||
COS_ptr + cs_row_idx * COS_row_stride + col_offsets,
|
||||
mask=rot_mask_col,
|
||||
other=1.0,
|
||||
).to(tl.float32)
|
||||
|
||||
# dN = dY * cos + rotate_half^T(dY * sin) (within the rotary span)
|
||||
# rotate_half^T([a, b]) = [b, -a] (adjoint of rotate_half)
|
||||
#
|
||||
# For col >= n_rot the formula must collapse to dN = dY (since the
|
||||
# forward is just a pass-through). cos defaults to 1.0 above; the
|
||||
# rotate-half contribution is masked to zero below.
|
||||
rot_offsets = tl.where(
|
||||
col_offsets < half_rot, col_offsets + half_rot, col_offsets - half_rot
|
||||
)
|
||||
rot_load_mask = (rot_offsets < n_cols) & rot_mask_col
|
||||
dY_rot = tl.load(
|
||||
dY_ptr + row_idx * dY_row_stride + rot_offsets,
|
||||
mask=rot_load_mask,
|
||||
other=0,
|
||||
).to(tl.float32)
|
||||
sin_rot = tl.load(
|
||||
SIN_ptr + cs_row_idx * SIN_row_stride + rot_offsets,
|
||||
mask=rot_load_mask,
|
||||
other=0,
|
||||
).to(tl.float32)
|
||||
|
||||
adj_sign = tl.where(col_offsets < half_rot, 1.0, -1.0)
|
||||
rotate_term = dY_rot * sin_rot * adj_sign
|
||||
# Zero out rotate-half contribution outside the rotary span.
|
||||
rotate_term = tl.where(rot_mask_col, rotate_term, 0.0)
|
||||
dN = dY_row * cos_row + rotate_term
|
||||
|
||||
# Pre-weight normalized: n = rstd * x
|
||||
n = X_row * rstd
|
||||
|
||||
if HAS_WEIGHT:
|
||||
dW_acc += dN * n
|
||||
dm = dN * W_row
|
||||
else:
|
||||
dm = dN
|
||||
|
||||
# RMSNorm backward: dX = rstd * (dm - (1/n_cols) * rstd^2 * dot(dm, X) * X)
|
||||
dot_dm_x = tl.sum(dm * X_row, axis=0)
|
||||
dX_row = rstd * (dm - (1.0 / n_cols) * rstd * rstd * dot_dm_x * X_row)
|
||||
|
||||
tl.store(
|
||||
dX_ptr + row_idx * dX_row_stride + col_offsets,
|
||||
dX_row.to(X_dtype),
|
||||
mask=mask,
|
||||
)
|
||||
|
||||
if HAS_WEIGHT:
|
||||
tl.store(
|
||||
dW_ptr + row_block_id * dW_row_stride + col_offsets,
|
||||
dW_acc,
|
||||
mask=mask,
|
||||
)
|
||||
|
||||
|
||||
def rms_norm_rope_forward(X, W, cos, sin, eps, n_heads, n_rot):
|
||||
"""
|
||||
Args:
|
||||
X: (B*S*H, head_dim) — contiguous, flattened from (B, S, H, D)
|
||||
W: (head_dim,) or None — RMSNorm weight
|
||||
cos: (B*S, n_rot) — position embeddings (broadcast across heads)
|
||||
sin: (B*S, n_rot) — position embeddings (broadcast across heads)
|
||||
eps: float
|
||||
n_heads: int — number of attention heads (for cos/sin indexing)
|
||||
n_rot: int — rotary dim (== head_dim for full rotary, < head_dim for
|
||||
partial rotary). Must be even and ``<= head_dim``.
|
||||
Returns:
|
||||
Y, X_saved, RSTD, BLOCK_SIZE, num_warps
|
||||
"""
|
||||
n_rows, n_cols = X.shape
|
||||
BLOCK_SIZE, num_warps = calculate_settings(n_cols)
|
||||
has_weight = W is not None
|
||||
|
||||
Y = torch.empty_like(X)
|
||||
RSTD = torch.empty(n_rows, dtype=torch.float32, device=X.device)
|
||||
|
||||
_rms_norm_rope_forward_kernel[(n_rows,)](
|
||||
Y,
|
||||
Y.stride(0),
|
||||
X,
|
||||
X.stride(0),
|
||||
W if has_weight else X, # dummy pointer when no weight
|
||||
cos,
|
||||
cos.stride(0),
|
||||
sin,
|
||||
sin.stride(0),
|
||||
RSTD,
|
||||
RSTD.stride(0),
|
||||
n_cols,
|
||||
n_rot,
|
||||
n_heads,
|
||||
eps,
|
||||
HAS_WEIGHT=has_weight,
|
||||
BLOCK_SIZE=BLOCK_SIZE,
|
||||
num_warps=num_warps,
|
||||
)
|
||||
return Y, X, RSTD, BLOCK_SIZE, num_warps
|
||||
|
||||
|
||||
def rms_norm_rope_backward(
|
||||
dY, X, W, cos, sin, RSTD, n_heads, n_rot, BLOCK_SIZE, num_warps
|
||||
):
|
||||
n_rows, n_cols = dY.shape
|
||||
has_weight = W is not None
|
||||
|
||||
sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
|
||||
rows_per_program = math.ceil(n_rows / sm_count)
|
||||
|
||||
dX = torch.empty_like(X)
|
||||
|
||||
if has_weight:
|
||||
_dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=X.device)
|
||||
else:
|
||||
_dW = torch.empty((1, n_cols), dtype=torch.float32, device=X.device)
|
||||
|
||||
_rms_norm_rope_backward_kernel[(sm_count,)](
|
||||
dY,
|
||||
dY.stride(0),
|
||||
dX,
|
||||
dX.stride(0),
|
||||
X,
|
||||
X.stride(0),
|
||||
torch_to_triton_dtype[X.dtype],
|
||||
W if has_weight else X, # dummy
|
||||
cos,
|
||||
cos.stride(0),
|
||||
sin,
|
||||
sin.stride(0),
|
||||
RSTD,
|
||||
RSTD.stride(0),
|
||||
_dW,
|
||||
_dW.stride(0),
|
||||
n_rows,
|
||||
n_cols,
|
||||
n_rot,
|
||||
n_heads,
|
||||
rows_per_program,
|
||||
HAS_WEIGHT=has_weight,
|
||||
BLOCK_SIZE=BLOCK_SIZE,
|
||||
num_warps=num_warps,
|
||||
)
|
||||
|
||||
dW = _dW.sum(dim=0).to(W.dtype) if has_weight else None
|
||||
return dX, dW
|
||||
|
||||
|
||||
class FusedRMSNormRoPEFunction(torch.autograd.Function):
    """Autograd wrapper around the fused RMSNorm + RoPE Triton kernels."""

    @staticmethod
    @ensure_contiguous
    def forward(ctx, X, W, cos, sin, eps, n_heads, n_rot):
        """
        X: (B*S*H, head_dim)
        W: (head_dim,) or None
        cos: (B*S, n_rot) — broadcast across heads
        sin: (B*S, n_rot) — broadcast across heads
        n_heads: int
        n_rot: int — rotary dim (<= head_dim)
        """
        Y, X_saved, RSTD, BLOCK_SIZE, num_warps = rms_norm_rope_forward(
            X,
            W,
            cos,
            sin,
            eps,
            n_heads,
            n_rot,
        )
        # Stash shape metadata and the launch config so backward can reuse
        # the exact same kernel configuration.
        ctx.eps = eps
        ctx.BLOCK_SIZE = BLOCK_SIZE
        ctx.num_warps = num_warps
        ctx.n_heads = n_heads
        ctx.n_rot = n_rot
        ctx.has_weight = W is not None
        ctx.save_for_backward(X_saved, W, cos, sin, RSTD)
        return Y

    @staticmethod
    @ensure_contiguous
    def backward(ctx, dY):
        # Gradients are produced only for X and W; cos, sin, eps, n_heads
        # and n_rot are non-differentiable inputs (hence the trailing Nones).
        X, W, cos, sin, RSTD = ctx.saved_tensors
        dX, dW = rms_norm_rope_backward(
            dY,
            X,
            W,
            cos,
            sin,
            RSTD,
            ctx.n_heads,
            ctx.n_rot,
            ctx.BLOCK_SIZE,
            ctx.num_warps,
        )
        return dX, dW, None, None, None, None, None
|
||||
|
||||
|
||||
def fused_rms_norm_rope(x, weight, cos, sin, eps=1e-6):
    """
    Apply fused RMSNorm + (partial) RoPE.

    Args:
        x: (batch, seq_len, num_heads, head_dim) — after projection + view
        weight: (head_dim,) — RMSNorm weight, or None for no-scale norm
        cos: (batch, seq_len, n_rot) — from RotaryEmbedding. ``n_rot``
            must be even and ``<= head_dim``. When ``n_rot < head_dim``
            the trailing ``head_dim - n_rot`` columns are RMSNorm-only
            (partial-rotary pass-through), matching stock Gemma 4 with
            ``partial_rotary_factor < 1.0``.
        sin: (batch, seq_len, n_rot) — same shape as ``cos``
        eps: float — RMSNorm epsilon

    Returns:
        y: (batch, seq_len, num_heads, head_dim) — normalized + rotated

    Raises:
        ValueError: if ``cos`` and ``sin`` shapes disagree, the rotary dim
            is odd or exceeds ``head_dim``, or the cos/sin batch dim is
            neither 1 nor equal to ``x``'s batch dim.
    """
    shape = x.shape  # (B, S, H, D)
    B, S, H, D = shape
    n_rot = cos.shape[-1]
    if sin.shape[-1] != n_rot:
        raise ValueError(
            f"cos and sin must have the same last dim, got cos={cos.shape[-1]} "
            f"sin={sin.shape[-1]}"
        )
    # cos and sin must agree in *all* dims, not just the last: a batch/seq
    # mismatch would otherwise surface as an opaque reshape error below, or
    # (when element counts coincide) silently mis-pair rotary rows.
    if sin.shape != cos.shape:
        raise ValueError(
            f"cos and sin must have the same shape, got cos={tuple(cos.shape)} "
            f"sin={tuple(sin.shape)}"
        )
    if n_rot > D:
        raise ValueError(f"rotary dim ({n_rot}) cannot exceed head_dim ({D})")
    if n_rot % 2 != 0:
        raise ValueError(f"rotary dim must be even, got {n_rot}")

    # Flatten to 2D: (B*S*H, D)
    x_flat = x.reshape(-1, D).contiguous()
    # cos/sin may broadcast over the batch dim (e.g. (1, S, n_rot) when
    # all sequences share the same rotary positions). The kernel needs a
    # dense (B*S, n_rot) buffer so that row_idx // n_heads maps cleanly
    # onto a single (b, s) pair, so expand-then-contiguous to materialize
    # the per-batch broadcast. Expand is a no-op when B == cos.shape[0].
    if cos.shape[0] != B:
        if cos.shape[0] != 1:
            raise ValueError(
                f"cos/sin batch dim ({cos.shape[0]}) must be 1 or equal "
                f"to x batch dim ({B})"
            )
        cos = cos.expand(B, S, n_rot)
        sin = sin.expand(B, S, n_rot)
    cos_flat = cos.reshape(B * S, n_rot).contiguous()
    sin_flat = sin.reshape(B * S, n_rot).contiguous()

    y_flat = FusedRMSNormRoPEFunction.apply(
        x_flat, weight, cos_flat, sin_flat, eps, H, n_rot
    )
    return y_flat.view(shape)
|
||||
|
||||
|
||||
@triton.jit
def _rms_norm_forward_kernel(
    Y_ptr,
    Y_row_stride,
    X_ptr,
    X_row_stride,
    RSTD_ptr,
    RSTD_row_stride,
    n_cols,
    eps,
    BLOCK_SIZE: tl.constexpr,
):
    """RMSNorm without scale weight: y = x / rms(x)

    One program per row; a single BLOCK_SIZE tile must cover all n_cols
    (out-of-range lanes are masked). The per-row reciprocal RMS is stored
    in RSTD for reuse by the backward kernel.
    """
    row_idx = tl.program_id(0).to(tl.int64)
    col_offsets = tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < n_cols

    X_row = tl.load(X_ptr + row_idx * X_row_stride + col_offsets, mask=mask, other=0)
    X_dtype = X_row.dtype
    # Accumulate in fp32 for numerical stability regardless of input dtype.
    X_fp32 = X_row.to(tl.float32)

    mean_sq = tl.sum(X_fp32 * X_fp32, axis=0) / n_cols
    rstd = rsqrt(mean_sq + eps)
    tl.store(RSTD_ptr + row_idx * RSTD_row_stride, rstd)

    # Normalize in fp32, then cast back to the input dtype on store.
    Y_row = X_fp32 * rstd
    tl.store(Y_ptr + row_idx * Y_row_stride + col_offsets, Y_row.to(X_dtype), mask=mask)
|
||||
|
||||
|
||||
@triton.jit
def _rms_norm_noscale_backward_kernel(
    dY_ptr,
    dY_row_stride,
    dX_ptr,
    dX_row_stride,
    X_ptr,
    X_row_stride,
    X_dtype: tl.constexpr,
    RSTD_ptr,
    RSTD_row_stride,
    n_cols,
    BLOCK_SIZE: tl.constexpr,
):
    """Backward for y = x * rstd (no weight).

    Computes dX = rstd * (dY - (dY·x) / n_cols * rstd^2 * x), the analytic
    gradient of the weightless RMSNorm, in fp32; the result is cast back to
    X_dtype on store. One program per row, single masked tile per row.
    """
    row_idx = tl.program_id(0).to(tl.int64)
    col_offsets = tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < n_cols

    dY_row = tl.load(
        dY_ptr + row_idx * dY_row_stride + col_offsets, mask=mask, other=0
    ).to(tl.float32)
    X_row = tl.load(
        X_ptr + row_idx * X_row_stride + col_offsets, mask=mask, other=0
    ).to(tl.float32)
    rstd = tl.load(RSTD_ptr + row_idx * RSTD_row_stride)

    # dot(dY, x) feeds the correction term that comes from d(rstd)/dx.
    dot_dy_x = tl.sum(dY_row * X_row, axis=0)
    dX_row = rstd * (dY_row - (1.0 / n_cols) * rstd * rstd * dot_dy_x * X_row)

    tl.store(
        dX_ptr + row_idx * dX_row_stride + col_offsets, dX_row.to(X_dtype), mask=mask
    )
|
||||
|
||||
|
||||
class FusedRMSNormNoScaleFunction(torch.autograd.Function):
    """RMSNorm without learnable scale — used for Gemma4's v_norm."""

    @staticmethod
    @ensure_contiguous
    def forward(ctx, X, eps):
        # X: (n_rows, n_cols); one kernel program per row.
        n_rows, n_cols = X.shape
        BLOCK_SIZE, num_warps = calculate_settings(n_cols)
        Y = torch.empty_like(X)
        # Per-row reciprocal RMS in fp32, saved for the backward pass.
        RSTD = torch.empty(n_rows, dtype=torch.float32, device=X.device)

        _rms_norm_forward_kernel[(n_rows,)](
            Y,
            Y.stride(0),
            X,
            X.stride(0),
            RSTD,
            RSTD.stride(0),
            n_cols,
            eps,
            BLOCK_SIZE=BLOCK_SIZE,
            num_warps=num_warps,
        )
        # Reuse the exact same launch configuration in backward.
        ctx.BLOCK_SIZE = BLOCK_SIZE
        ctx.num_warps = num_warps
        ctx.save_for_backward(X, RSTD)
        ctx.n_cols = n_cols
        return Y

    @staticmethod
    @ensure_contiguous
    def backward(ctx, dY):
        # Gradient only for X; eps is non-differentiable (hence the None).
        X, RSTD = ctx.saved_tensors
        n_rows = X.shape[0]
        dX = torch.empty_like(X)
        _rms_norm_noscale_backward_kernel[(n_rows,)](
            dY,
            dY.stride(0),
            dX,
            dX.stride(0),
            X,
            X.stride(0),
            torch_to_triton_dtype[X.dtype],
            RSTD,
            RSTD.stride(0),
            ctx.n_cols,
            BLOCK_SIZE=ctx.BLOCK_SIZE,
            num_warps=ctx.num_warps,
        )
        return dX, None
|
||||
|
||||
|
||||
def fused_rms_norm_noscale(x, eps=1e-6):
    """
    Apply RMSNorm without a learnable scale (used for v_norm).

    Args:
        x: (batch, seq_len, num_heads, head_dim)
        eps: RMSNorm epsilon
    Returns:
        y: tensor of the same shape as ``x``, normalized along head_dim
    """
    orig_shape = x.shape
    head_dim = orig_shape[-1]
    # Collapse all leading dims so the kernel sees one row per head slot.
    flattened = x.reshape(-1, head_dim)
    normalized = FusedRMSNormNoScaleFunction.apply(flattened, eps)
    return normalized.view(orig_shape)
|
||||
@@ -1297,6 +1297,339 @@ def apply_lora_qkv(
|
||||
return Q, K, V
|
||||
|
||||
|
||||
class LoRA_QK(torch.autograd.Function):
    """Optimized LoRA QK implementation for models where v_proj is None.

    Used by models like Gemma4 with attention_k_eq_v=True, where key states are
    reused as value states. Only Q and K projections are fused; the caller
    returns K a second time as V so that autograd accumulates key+value gradients
    into a single dK.

    Supports bias, dropout, and DoRA (Weight-Decomposed Low-Rank Adaptation).
    """

    @staticmethod
    @torch_amp_custom_fwd
    def forward(
        ctx: torch.autograd.function.FunctionCtx,
        X: torch.Tensor,
        X_drop: torch.Tensor | None,
        # Q params
        q_weight: torch.Tensor,
        q_bias: torch.Tensor | None,
        q_quant: QuantState | None,
        q_A: torch.Tensor | None,
        q_B: torch.Tensor | None,
        q_scale: float,
        q_lora_bias: torch.Tensor | None,
        q_magnitude: torch.Tensor | None,
        # K params
        k_weight: torch.Tensor,
        k_bias: torch.Tensor | None,
        k_quant: QuantState | None,
        k_A: torch.Tensor | None,
        k_B: torch.Tensor | None,
        k_scale: float,
        k_lora_bias: torch.Tensor | None,
        k_magnitude: torch.Tensor | None,
        # Flags
        inplace: bool = True,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Compute the Q and K projections in one fused autograd node.

        X_drop is the already-dropped-out input shared by both LoRA
        branches (None when dropout is disabled). DoRA is enabled when
        q_magnitude is not None; q_magnitude/k_magnitude are assumed to be
        both-set or both-None — only q_magnitude is checked.
        """
        has_dropout = X_drop is not None
        has_dora = q_magnitude is not None

        if has_dora:
            dtype = X.dtype
            # LoRA branch consumes the dropped input; base branch uses raw X.
            X_lora = X_drop if has_dropout else X

            # Compute Q with DoRA
            Q_base = matmul_lora(X, q_weight, None, q_quant, None, None, None)
            Q_lora = _lora_only(X_lora, q_A, q_B, q_scale, q_lora_bias, dtype)
            q_mag_scale = _compute_dora_scale(
                q_weight, q_quant, q_A, q_B, q_scale, q_magnitude, dtype
            )
            # DoRA: rescale (base + lora) per output column; bias is added
            # after the magnitude rescale.
            Q = q_mag_scale.unsqueeze(0) * (Q_base + Q_lora)
            if q_bias is not None:
                Q = Q + q_bias

            # Compute K with DoRA
            K_base = matmul_lora(X, k_weight, None, k_quant, None, None, None)
            K_lora = _lora_only(X_lora, k_A, k_B, k_scale, k_lora_bias, dtype)
            k_mag_scale = _compute_dora_scale(
                k_weight, k_quant, k_A, k_B, k_scale, k_magnitude, dtype
            )
            K = k_mag_scale.unsqueeze(0) * (K_base + K_lora)
            if k_bias is not None:
                K = K + k_bias

            # Pre-rescale activations, saved for the magnitude gradient.
            Q_combined = Q_base + Q_lora
            K_combined = K_base + K_lora

            # NOTE: ordering here must match the DoRA unpack in backward().
            ctx.save_for_backward(
                X,
                X_drop if has_dropout else X,
                q_A.to(dtype) if q_A is not None else q_A,
                q_B.to(dtype) if q_B is not None else q_B,
                k_A.to(dtype) if k_A is not None else k_A,
                k_B.to(dtype) if k_B is not None else k_B,
                q_magnitude,
                k_magnitude,
                q_mag_scale,
                k_mag_scale,
                Q_combined,
                K_combined,
                q_lora_bias,
                k_lora_bias,
            )
        else:
            # Standard LoRA (with optional dropout and bias)
            Q = matmul_lora(
                X,
                q_weight,
                q_bias,
                q_quant,
                q_A,
                q_B,
                q_scale,
                X_drop=X_drop,
                lora_bias=q_lora_bias,
            )
            K = matmul_lora(
                X,
                k_weight,
                k_bias,
                k_quant,
                k_A,
                k_B,
                k_scale,
                X_drop=X_drop,
                lora_bias=k_lora_bias,
            )

            dtype = X.dtype
            # NOTE: ordering here must match the non-DoRA unpack in backward().
            ctx.save_for_backward(
                X,
                X_drop if has_dropout else X,
                q_A.to(dtype) if q_A is not None else q_A,
                q_B.to(dtype) if q_B is not None else q_B,
                k_A.to(dtype) if k_A is not None else k_A,
                k_B.to(dtype) if k_B is not None else k_B,
                q_lora_bias,
                k_lora_bias,
            )

        # Non-tensor state kept directly on ctx.
        ctx.scales = (q_scale, k_scale)
        ctx.quants = (q_quant, k_quant)
        ctx.weights = (q_weight, k_weight)
        ctx.inplace = inplace
        ctx.has_dropout = has_dropout
        ctx.has_dora = has_dora

        return Q, K

    @staticmethod
    @torch_amp_custom_bwd
    def backward(
        ctx: torch.autograd.function.FunctionCtx,
        q_grad: torch.Tensor,
        k_grad: torch.Tensor,
    ):
        """Gradients for the fused Q/K projection.

        k_grad already contains the sum of the key- and value-path
        gradients, because the caller returns K twice (once as V).
        """
        q_weight, k_weight = ctx.weights
        q_quant, k_quant = ctx.quants
        q_scale, k_scale = ctx.scales
        has_dropout = ctx.has_dropout
        has_dora = ctx.has_dora

        if has_dora:
            (
                X,
                X_lora,
                A_q,
                B_q,
                A_k,
                B_k,
                q_magnitude,
                k_magnitude,
                q_mag_scale,
                k_mag_scale,
                Q_combined,
                K_combined,
                q_lora_bias,
                k_lora_bias,
            ) = ctx.saved_tensors
        else:
            (
                X,
                X_lora,
                A_q,
                B_q,
                A_k,
                B_k,
                q_lora_bias,
                k_lora_bias,
            ) = ctx.saved_tensors
            q_magnitude = k_magnitude = None
            q_mag_scale = k_mag_scale = None
            Q_combined = K_combined = None

        # Flatten (batch, seq, dim) -> (batch*seq, dim) for the matmuls.
        batch, seq_len = X.shape[:2]
        q_grad = q_grad.view(-1, q_grad.shape[-1])
        # reshape (not view): k_grad is the accumulated key+value gradient
        # and may be non-contiguous — TODO confirm.
        k_grad = k_grad.reshape(-1, k_grad.shape[-1])
        X = X.view(-1, X.shape[-1])
        X_lora = X_lora.view(-1, X_lora.shape[-1])

        d_q_mag = d_k_mag = None
        d_q_lora_bias = d_k_lora_bias = None

        if has_dora:
            Q_combined = Q_combined.view(-1, Q_combined.shape[-1])
            K_combined = K_combined.view(-1, K_combined.shape[-1])

            # Magnitude gradient — uses mag_scale / magnitude as the
            # derivative factor; assumes mag_scale is linear in magnitude
            # (mag_scale = magnitude / column-norm) — NOTE(review): confirm
            # against _compute_dora_scale.
            d_q_mag = (q_grad * Q_combined).sum(dim=0) * q_mag_scale / q_magnitude
            d_k_mag = (k_grad * K_combined).sum(dim=0) * k_mag_scale / k_magnitude

            # Fold the magnitude rescale into the upstream grads so the
            # remaining LoRA/base math is identical to the non-DoRA path.
            q_grad = q_grad * q_mag_scale.unsqueeze(0)
            k_grad = k_grad * k_mag_scale.unsqueeze(0)

        # LoRA bias gradients
        if q_lora_bias is not None:
            d_q_lora_bias = q_scale * q_grad.sum(dim=0)
        if k_lora_bias is not None:
            d_k_lora_bias = k_scale * k_grad.sum(dim=0)

        X_lora_t = X_lora.t()

        d_A_q = d_B_q = d_A_k = d_B_k = None
        grad_B_q = grad_B_k = None

        # LoRA adapter gradients, built transposed via addmm_ (beta=0 means
        # the uninitialized empty buffers are overwritten, not accumulated).
        if A_q is not None and B_q is not None:
            grad_B_q = q_grad @ B_q
            d_A_q = torch.empty_like(A_q.t())
            d_B_q = torch.empty_like(B_q.t())
            d_A_q.addmm_(X_lora_t, grad_B_q, alpha=q_scale, beta=0)
            d_B_q.addmm_(A_q @ X_lora_t, q_grad, alpha=q_scale, beta=0)

        if A_k is not None and B_k is not None:
            grad_B_k = k_grad @ B_k
            d_A_k = torch.empty_like(A_k.t())
            d_B_k = torch.empty_like(B_k.t())
            d_A_k.addmm_(X_lora_t, grad_B_k, alpha=k_scale, beta=0)
            d_B_k.addmm_(A_k @ X_lora_t, k_grad, alpha=k_scale, beta=0)

        # Base path input gradient
        # When inplace, X's storage is reused as the output buffer — X must
        # not be read after this point.
        out_buffer = X if ctx.inplace else None

        q_weight_t = dequantize(q_weight, q_quant)
        grad_X = torch.mm(q_grad, q_weight_t, out=out_buffer)
        del q_weight_t

        k_weight_t = dequantize(k_weight, k_quant)
        grad_X.addmm_(k_grad, k_weight_t)
        del k_weight_t

        # LoRA path input gradient
        # With dropout the LoRA input differs from X, so its gradient flows
        # back separately through X_drop; without dropout it folds into grad_X.
        if has_dropout:
            grad_X_drop = torch.zeros_like(X_lora)
            if grad_B_q is not None:
                grad_X_drop.addmm_(grad_B_q, A_q, alpha=q_scale)
            if grad_B_k is not None:
                grad_X_drop.addmm_(grad_B_k, A_k, alpha=k_scale)
        else:
            grad_X_drop = None
            if grad_B_q is not None:
                grad_X.addmm_(grad_B_q, A_q, alpha=q_scale)
            if grad_B_k is not None:
                grad_X.addmm_(grad_B_k, A_k, alpha=k_scale)

        # Un-transpose the adapter grads back to the parameter layouts.
        if d_A_q is not None:
            d_A_q = d_A_q.t()
            d_B_q = d_B_q.t()  # type: ignore[union-attr]
        if d_A_k is not None:
            d_A_k = d_A_k.t()
            d_B_k = d_B_k.t()  # type: ignore[union-attr]

        grad_X = grad_X.view(batch, seq_len, -1)
        if grad_X_drop is not None:
            grad_X_drop = grad_X_drop.view(batch, seq_len, -1)

        # Return gradients for all forward inputs:
        # X, X_drop,
        # q: weight, bias, quant, A, B, scale, lora_bias, magnitude
        # k: weight, bias, quant, A, B, scale, lora_bias, magnitude
        # inplace
        return (
            grad_X,
            grad_X_drop,
            # Q
            None,
            None,
            None,
            d_A_q,
            d_B_q,
            None,
            d_q_lora_bias,
            d_q_mag,
            # K
            None,
            None,
            None,
            d_A_k,
            d_B_k,
            None,
            d_k_lora_bias,
            d_k_mag,
            # inplace
            None,
        )
|
||||
|
||||
|
||||
def apply_lora_qk(
    self, X: torch.Tensor, inplace: bool = True
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Compute LoRA Query and Key projections for models whose v_proj is None.

    When v_proj is None (e.g. Gemma4 attention_k_eq_v), key states double as
    value states, so this returns (Q, K, K): the caller's patched forward
    treats the second K as V. Because the identical K tensor is returned
    twice, autograd sums the key-path and value-path gradients into a single
    dK before LoRA_QK.backward runs.

    Supports bias, dropout, and DoRA.
    """
    (
        q_weight,
        q_bias,
        q_quant,
        q_A,
        q_B,
        q_scale,
        q_lora_bias,
        q_dropout,
        q_magnitude,
    ) = get_lora_parameters(self.q_proj)
    (
        k_weight,
        k_bias,
        k_quant,
        k_A,
        k_B,
        k_scale,
        k_lora_bias,
        _k_dropout,
        k_magnitude,
    ) = get_lora_parameters(self.k_proj)

    # Dropout is applied once, outside the autograd.Function, so the Q and K
    # LoRA branches share a single dropout mask (q_proj's dropout module).
    dropped_X = _apply_dropout(q_dropout, X, self.training)

    Q, K = LoRA_QK.apply(
        X,
        dropped_X,
        # Q
        q_weight,
        q_bias,
        q_quant,
        q_A,
        q_B,
        q_scale,
        q_lora_bias,
        q_magnitude,
        # K
        k_weight,
        k_bias,
        k_quant,
        k_A,
        k_B,
        k_scale,
        k_lora_bias,
        k_magnitude,
        # Flags
        inplace,
    )

    return Q, K, K
|
||||
|
||||
|
||||
class LoRA_O(torch.autograd.Function):
|
||||
"""Optimized LoRA implementation for output projection.
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user