Compare commits

..

15 Commits

Author SHA1 Message Date
Wing Lian
d17ed89a3c add missing file 2026-04-21 08:44:01 -04:00
Wing Lian
02e4f2350d fixes for scattermoe from latest peft upgrade 2026-04-21 08:00:16 -04:00
Wing Lian
4195605ab2 fix test dims 2026-04-21 00:44:26 +00:00
Wing Lian
37acb28d02 fix einsum dims 2026-04-20 23:09:47 +00:00
Wing Lian
4a5281e61a Fix shape 2026-04-19 01:53:05 +00:00
Wing Lian
a892d8cce1 chore: lint 2026-04-17 17:48:26 +00:00
Wing Lian
78de2919a6 tiled mlp fix for gemma4 2026-04-16 13:24:41 +00:00
Wing Lian
28283ff373 revert shared_kv_states workaround with transformers 5.5.4 2026-04-15 13:32:59 +00:00
Wing Lian
dc16859983 [gemma4] fix fused RMSNorm+RoPE on hybrid attention models
- Kernel: fused_rms_norm_rope crashed when cos.shape[-1] < x.shape[-1].
  The Triton forward/backward kernels now take an n_rot runtime arg that
  restricts rotate_half to [0, n_rot) and treat trailing columns as
  RMSNorm-only pass-through (cos=1, sin=0 defaults). The wrapper also
  expands cos/sin tensors that broadcast over the batch dimension.

- Forward: _make_fused_forward used a stale shared_kv_states kwarg that
  the current decoder layer no longer passes. It now mirrors stock
  attention, reading/writing past_key_values.shared_layers.
2026-04-15 13:27:31 +00:00
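A minimal plain-PyTorch sketch of the partial-rotation semantics described above (the function name, shapes, and eps are illustrative, not the Triton kernel's actual signature; cos/sin are assumed to broadcast against the first n_rot columns): the full row is RMS-normalized, only the first n_rot = cos.shape[-1] columns go through rotate_half, and the trailing columns pass through unchanged, as if cos=1 and sin=0.

```python
import torch

def fused_rms_norm_rope_reference(x, weight, cos, sin, eps=1e-6):
    """Reference semantics only (not the Triton kernel): RMSNorm the full
    row, then rotate only the first n_rot = cos.shape[-1] columns."""
    # RMSNorm over the full last dimension
    variance = x.pow(2).mean(-1, keepdim=True)
    x_normed = x * torch.rsqrt(variance + eps) * weight

    n_rot = cos.shape[-1]
    x_rot, x_pass = x_normed[..., :n_rot], x_normed[..., n_rot:]

    # rotate_half restricted to [0, n_rot)
    half = n_rot // 2
    rotated = torch.cat((-x_rot[..., half:], x_rot[..., :half]), dim=-1)
    x_rot = x_rot * cos + rotated * sin

    # trailing columns are RMSNorm-only pass-through
    return torch.cat((x_rot, x_pass), dim=-1)
```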
Wing Lian
d4e9cf2eec lint 2026-04-15 13:27:30 +00:00
Wing Lian
53391a10d7 vllm-serve-lora add /v1/completions route + worker pipe lock
The LoRA vllm-serve wrapper only exposed /v1/chat/completions, but
retrace's SWE agent server uses the token-id-aware /v1/completions
endpoint so it can feed raw prompt_token_ids + track per-token
logprobs across multi-turn rollouts. Add the route, mirroring the
shape of /v1/chat/completions but routing to the vLLM worker's
generate() method so prompt_token_ids are passed through as-is.

Also add a worker_pipe_lock around conn.send/conn.recv. The
multiprocessing.Connection to the vLLM worker is a single shared
full-duplex pipe; concurrent HTTP requests interleave pickle frames
on the wire and corrupt the stream (observed as
UnpicklingError: pickle data was truncated, surfacing as 500s).
The agent server fires ~8 rollout requests at once, so this was a
hard blocker for any concurrent workload. Serialize access to the
pipe for the duration of each request's round-trip.
2026-04-15 13:27:30 +00:00
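A minimal sketch of that locking pattern (names are illustrative; the real wrapper may use an asyncio lock and offload the blocking calls to a thread): one lock is held for the whole send/recv round-trip so two requests can never interleave their pickle frames on the shared pipe.

```python
import threading
from multiprocessing.connection import Connection

# One lock guards the single full-duplex pipe to the vLLM worker.
worker_pipe_lock = threading.Lock()

def call_worker(conn: Connection, request: dict):
    with worker_pipe_lock:   # hold the lock for the whole round-trip
        conn.send(request)   # one pickle frame out...
        return conn.recv()   # ...and the matching reply back
```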
Wing Lian
7617b951a8 make _maybe_sync_vllm_weights actually fire in sync mode
Two bugs in ``AsyncGRPOTrainer._maybe_sync_vllm_weights`` plus a
companion bug in the sync-hook patch site that together neutralized
LoRA weight sync entirely whenever ``async_prefetch=False`` was
combined with NeMo Gym's data-producer path:

1. ``_maybe_sync_vllm_weights`` had ``if not async_prefetch: return``
   at the top. The original design assumed sync mode would fall back
   to TRL's stock per-step ``sync_weights`` call inside
   ``_generate_single_turn`` — true for vanilla GRPO but FALSE in
   NeMo Gym multi-turn, where ``NemoGymDataProducer`` calls the agent
   server directly and ``_generate_single_turn`` is never invoked.
   Result: no sync ever happened in NeMo Gym sync mode.

2. ``step % vllm_sync_interval`` would raise a TypeError on the first
   call if ``vllm_sync_interval`` was unset (the default for any
   config that doesn't explicitly set it).

3. The ``_generate_single_turn`` patch installed
   ``vllm_generation.sync_weights = lambda: None`` unconditionally
   for vllm_lora_sync runs. That's correct in async-prefetch mode
   (the background thread can't safely sync) but wrong in sync mode:
   TRL's per-step auto-sync inside ``_generate_single_turn`` was the
   fallback the early return in (1) relied on, and the no-op patch
   was disabling it.

Fix:
  - Drop the ``not async_prefetch`` early return; ``_maybe_sync_vllm_weights``
    is now the canonical sync trigger and runs in both modes from
    ``_prepare_inputs_with_data_producer`` / ``_prepare_inputs_legacy_async``.
  - Default ``vllm_sync_interval`` to 1 when unset.
  - In the ``_generate_single_turn`` patch, route sync_weights to
    ``_sync_lora_adapter`` in sync mode (and keep the lambda no-op
    in async mode for background-thread safety).
2026-04-15 13:27:30 +00:00
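The resulting gating logic looks roughly like the sketch below (simplified and illustrative; attribute and method names follow the commit message, not necessarily the actual code):

```python
class AsyncGRPOTrainerSketch:
    """Illustrative only: shows the interval defaulting and per-step gate."""

    def __init__(self, vllm_sync_interval=None):
        self.vllm_sync_interval = vllm_sync_interval

    def _sync_lora_adapter(self):
        print("syncing LoRA adapter weights to vLLM")  # stand-in for the real sync

    def _maybe_sync_vllm_weights(self, step: int) -> None:
        # Default to syncing every step when vllm_sync_interval is unset,
        # avoiding a `step % None` TypeError on the first call.
        interval = self.vllm_sync_interval or 1
        if step % interval == 0:
            self._sync_lora_adapter()
```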
Wing Lian
e993ed5208 retry head-server probe with longer timeout
``get_server_configs`` was hardcoded to a 5s timeout with no retry.
That's empirically too tight to survive a kill-and-relaunch cycle:
when the agent server is finishing in-flight rollouts from a prior
run, it can take 10-30s to respond to /global_config_dict_yaml, and
the trainer would crash at startup with a ReadTimeoutError.

Bump the per-attempt timeout to 30s and retry up to 3 times with a
2s/4s backoff. The retry intentionally raises a RuntimeError after
the third failure rather than returning an empty config: silent
failure here would let training proceed with no agent servers
discovered, which again leaves the trainer a no-op.
2026-04-15 13:27:30 +00:00
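A sketch of that retry policy under the stated numbers (helper name from the commit; the real implementation parses the YAML payload and discovers multiple agent servers):

```python
import time
import requests

def get_server_configs(base_url: str, attempts: int = 3, timeout: float = 30.0) -> str:
    """Probe the head server, retrying with backoff instead of failing fast."""
    for attempt in range(1, attempts + 1):
        try:
            resp = requests.get(f"{base_url}/global_config_dict_yaml", timeout=timeout)
            resp.raise_for_status()
            return resp.text  # YAML config; parsing omitted in this sketch
        except requests.RequestException as err:
            if attempt == attempts:
                # Raise rather than return an empty config: a silent failure
                # would let training proceed with no agent servers discovered.
                raise RuntimeError(
                    f"head server at {base_url} did not respond after {attempts} attempts"
                ) from err
            time.sleep(2 * attempt)  # 2s, then 4s backoff
```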
Wing Lian
69f165b39b probe vLLM weight-sync routes and select transport per server
The plugin used to unconditionally monkey-patch
VLLMClient.init_communicator to a no-op AND silently no-op
sync_weights when vllm_lora_sync was off. Combined, this turned the
trainer into a functional no-op whenever (a) the user ran NeMo Gym
+ LoRA without remembering to set vllm_lora_sync=true or (b) the
user ran NeMo Gym + full fine-tune (which had no working sync path
under the old code).

Replace both patches with:

1. A probe of the configured vLLM server's /openapi.json at
   pre_model_load. Three transports are recognized:
     - NCCL (/init_communicator/ + /update_named_param/) — TRL serve
       and axolotl vllm-serve both expose this
     - LoRA filesystem (/v1/load_lora_adapter or /set_lora_adapter/)
     - HTTP base64 full-weight (/http_update_weights/) — axolotl
       vllm-serve only

2. A pure-logic ``select_weight_sync_transport`` that picks the
   right one for (server caps × adapter type).

3. ``init_communicator`` is only patched out when the server has no
   NCCL routes; against TRL/axolotl serve modules it stays live so
   full-finetune NCCL sync works.

4. ``post_trainer_create`` uses the selection table to install LoRA
   filesystem sync OR leave the standard NCCL flow alone OR raise
   NotImplementedError (HTTP — pending) OR raise a precise diagnosis
   when no transport is viable. No more silent no-op trainers.
2026-04-15 13:27:30 +00:00
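A pure-logic sketch of the selection table (capability labels and the exact precedence are assumptions for illustration; the real probe reads the routes exposed in the server's /openapi.json):

```python
def select_weight_sync_transport(server_caps: set, is_lora: bool) -> str:
    """Pick a weight-sync transport from (server caps x adapter type)."""
    if is_lora:
        if "lora_filesystem" in server_caps:   # /v1/load_lora_adapter or /set_lora_adapter/
            return "lora_filesystem"
        raise RuntimeError("server exposes no LoRA-compatible weight-sync route")
    # full fine-tune
    if "nccl" in server_caps:                  # /init_communicator/ + /update_named_param/
        return "nccl"
    if "http_base64" in server_caps:           # /http_update_weights/
        raise NotImplementedError("HTTP base64 full-weight sync is pending")
    raise RuntimeError("server exposes no full-weight sync route")
```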
Wing Lian
80a97f192b validate batch shape against num_generations at config time
Surfaces a class of GRPO config errors at axolotl-train startup instead
of letting them bubble out of GRPOTrainer.__init__ after the model loads.
Three checks under RLValidationMixin.check_grpo_batch_size_divisibility:

  - effective generation_batch_size (or mb*GA fallback) must be divisible
    by trl.num_generations, with a hint pointing at the smallest GA bump
    that fixes the violation
  - num_generations >= 2 (group-relative advantage needs variance; with
    num_gen=1 the policy never updates)
  - When world_size > 1, effective gbs >= num_generations * world_size

11 unit tests cover the table: divisible/non-divisible, explicit and
implicit gbs, multi-rank constraint, GRPO-disabled passthrough, and
unset num_generations.
2026-04-15 13:27:30 +00:00
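An illustrative version of the three checks (field names approximate; the real mixin reads these values from the axolotl/TRL config objects and also hints at the smallest gradient-accumulation bump that restores divisibility):

```python
def check_grpo_batch_size_divisibility(
    num_generations,
    generation_batch_size,
    micro_batch_size,
    gradient_accumulation_steps,
    world_size=1,
):
    """Validate GRPO batch shape at config time instead of inside GRPOTrainer.__init__."""
    if not num_generations:
        return  # GRPO disabled or num_generations unset: nothing to validate
    if num_generations < 2:
        raise ValueError("num_generations must be >= 2: group-relative advantage needs variance")
    # effective generation batch size, falling back to micro_batch * grad_accum
    gbs = generation_batch_size or micro_batch_size * gradient_accumulation_steps
    if gbs % num_generations != 0:
        raise ValueError(
            f"generation batch size {gbs} must be divisible by num_generations {num_generations}"
        )
    if world_size > 1 and gbs < num_generations * world_size:
        raise ValueError("effective generation batch size must be >= num_generations * world_size")
```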
80 changed files with 4021 additions and 1392 deletions

View File

@@ -31,10 +31,7 @@ PRs are **greatly welcome**!
Please run below to setup env
```bash
# Install axolotl + dev and test dependencies from lockfile
export UV_TORCH_BACKEND=cu128 # or cu130
uv sync --extra flash-attn --extra deepspeed --group dev --group test
source .venv/bin/activate
pip3 install -r requirements-dev.txt -r requirements-tests.txt
pre-commit install
# test

View File

@@ -6,7 +6,7 @@ on:
types: [opened, synchronize, reopened, ready_for_review]
paths:
- '**.py'
- 'pyproject.toml'
- 'requirements.txt'
- '.github/workflows/*.yml'
- "*.[q]md"
- "examples/**/*.y[a]?ml"

View File

@@ -3,15 +3,17 @@ name: docker-multigpu-tests-biweekly
on:
pull_request:
paths:
- "tests/e2e/multigpu/**.py"
- "pyproject.toml"
- ".github/workflows/multi-gpu-e2e.yml"
- "scripts/cutcrossentropy_install.py"
- "src/axolotl/core/trainers/mixins/sequence_parallel.py"
- "src/axolotl/utils/distributed.py"
- 'tests/e2e/multigpu/**.py'
- 'requirements.txt'
- 'setup.py'
- 'pyproject.toml'
- '.github/workflows/multi-gpu-e2e.yml'
- 'scripts/cutcrossentropy_install.py'
- 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
- 'src/axolotl/utils/distributed.py'
workflow_dispatch:
schedule:
- cron: "0 0 * * 1,4" # Runs at 00:00 UTC every monday & thursday
- cron: '0 0 * * 1,4' # Runs at 00:00 UTC every monday & thursday
# Cancel jobs on the same ref if a new one is triggered
concurrency:
@@ -31,19 +33,19 @@ jobs:
fail-fast: false
matrix:
include:
# - cuda: 129
# cuda_version: 12.9.1
# python_version: "3.12"
# pytorch: 2.9.1
# axolotl_extras: "fbgemm-gpu"
# num_gpus: 2
# dockerfile: "Dockerfile-uv.jinja"
# - cuda: 129
# cuda_version: 12.9.1
# python_version: "3.12"
# pytorch: 2.9.1
# axolotl_extras: "fbgemm-gpu"
# num_gpus: 2
# dockerfile: "Dockerfile-uv.jinja"
- cuda: 130
cuda_version: 13.0.0
python_version: "3.11"
pytorch: 2.9.1
axolotl_extras:
# axolotl_extras: fbgemm-gpu
# axolotl_extras: fbgemm-gpu
num_gpus: 2
- cuda: 128
cuda_version: 12.8.1
@@ -51,6 +53,7 @@ jobs:
pytorch: 2.10.0
axolotl_extras: "fbgemm-gpu"
num_gpus: 2
dockerfile: "Dockerfile-uv.jinja"
runs-on: [self-hosted, modal]
timeout-minutes: 120
steps:
@@ -72,7 +75,7 @@ jobs:
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
- name: Run tests job on Modal
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

View File

@@ -8,9 +8,6 @@ on:
permissions: {}
env:
UV_SYSTEM_PYTHON: "1"
jobs:
setup_release:
name: Create Release
@@ -44,15 +41,11 @@ jobs:
with:
python-version: "3.11"
- name: Install uv
uses: astral-sh/setup-uv@v7
- name: Install dependencies
run: |
uv pip install wheel packaging
uv pip install --no-build-isolation -e .
uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
pip3 install wheel packaging==26.0
pip3 install --no-build-isolation -e .
pip3 install -r requirements-dev.txt -r requirements-tests.txt
- name: Extract tag name
id: tag

View File

@@ -2,18 +2,15 @@ name: Tests Nightly against upstream main
on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *" # Runs at 00:00 UTC every day
- cron: '0 0 * * *' # Runs at 00:00 UTC every day
pull_request:
types: [opened, synchronize, reopened, ready_for_review]
paths:
- ".github/workflows/tests-nightly.yml"
- '.github/workflows/tests-nightly.yml'
permissions:
contents: read
env:
UV_SYSTEM_PYTHON: "1"
jobs:
pre-commit:
name: pre-commit
@@ -23,7 +20,7 @@ jobs:
- uses: actions/setup-python@v5
with:
python-version: "3.11"
cache: "pip" # caching pip dependencies
cache: 'pip' # caching pip dependencies
- uses: pre-commit/action@v3.0.1
env:
SKIP: no-commit-to-branch
@@ -46,7 +43,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python_version: ["3.12"] # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
python_version: ["3.12"] # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
pytorch_version: ["2.9.1", "2.10.0"]
timeout-minutes: 20
@@ -64,34 +61,36 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python_version }}
cache: 'pip' # caching pip dependencies
- name: Install uv
uses: astral-sh/setup-uv@v7
- name: upgrade pip
run: |
pip3 install --upgrade pip
pip3 install --upgrade packaging==26.0 setuptools==78.1.1 wheel
- name: Install PyTorch
run: |
uv pip install torch==${{ matrix.pytorch_version }} torchvision
uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
pip3 install torch==${{ matrix.pytorch_version }} torchvision
- name: Update requirements.txt
run: |
sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt
sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt
sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt
sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt
sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt
- name: Install dependencies
run: |
uv pip install --no-build-isolation -e . --override /tmp/torch-pin.txt
python scripts/cutcrossentropy_install.py --uv | sh
uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
- name: Override with nightly HF packages
run: |
uv pip install --no-deps \
"transformers @ git+https://github.com/huggingface/transformers.git@main" \
"peft @ git+https://github.com/huggingface/peft.git@main" \
"accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
"trl @ git+https://github.com/huggingface/trl.git@main" \
"datasets @ git+https://github.com/huggingface/datasets.git@main"
pip3 show torch
pip3 install --no-build-isolation -U -e .
python scripts/unsloth_install.py | sh
python scripts/cutcrossentropy_install.py | sh
pip3 install -r requirements-dev.txt -r requirements-tests.txt
- name: Make sure PyTorch version wasn't clobbered
run: |
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
- name: Ensure axolotl CLI was installed
run: |
@@ -103,6 +102,9 @@ jobs:
pytest -v --durations=10 tests/patched/
pytest -v --durations=10 tests/cli/
- name: cleanup pip cache
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
docker-e2e-tests:
if: github.repository_owner == 'axolotl-ai-cloud'
@@ -134,6 +136,7 @@ jobs:
pytorch: 2.9.1
num_gpus: 1
axolotl_extras:
dockerfile: "Dockerfile-uv.jinja"
nightly_build: "true"
steps:
- name: Checkout
@@ -154,7 +157,7 @@ jobs:
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
- name: Run tests job on Modal
env:

View File

@@ -6,19 +6,21 @@ on:
branches:
- "main"
paths:
- "**.py"
- "pyproject.toml"
- ".github/workflows/*.yml"
- "cicd/cicd.sh"
- "cicd/Dockerfile-uv.jinja"
- '**.py'
- 'requirements.txt'
- '.github/workflows/*.yml'
- 'requirements-tests.txt'
- 'cicd/cicd.sh'
- 'cicd/Dockerfile.jinja'
pull_request:
types: [opened, synchronize, reopened, ready_for_review]
paths:
- "**.py"
- "pyproject.toml"
- ".github/workflows/*.yml"
- "cicd/cicd.sh"
- "cicd/Dockerfile-uv.jinja"
types: [opened, synchronize, reopened, ready_for_review]
paths:
- '**.py'
- 'requirements.txt'
- '.github/workflows/*.yml'
- 'requirements-tests.txt'
- 'cicd/cicd.sh'
- 'cicd/Dockerfile.jinja'
workflow_dispatch:
# Cancel jobs on the same ref if a new one is triggered
@@ -31,7 +33,6 @@ permissions:
env:
TRANSFORMERS_IS_CI: "yes"
UV_SYSTEM_PYTHON: "1"
jobs:
pre-commit:
@@ -43,7 +44,7 @@ jobs:
- uses: actions/setup-python@v5
with:
python-version: "3.11"
cache: "pip" # caching pip dependencies
cache: 'pip' # caching pip dependencies
- uses: pre-commit/action@v3.0.1
env:
SKIP: no-commit-to-branch
@@ -93,25 +94,32 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python_version }}
cache: 'pip' # caching pip dependencies
- name: Install uv
uses: astral-sh/setup-uv@v7
- name: upgrade pip
run: |
pip3 install --upgrade pip
pip3 install --upgrade packaging==26.0 setuptools==75.8.0 wheel
- name: Install PyTorch
run: |
uv pip install torch==${{ matrix.pytorch_version }} torchvision
uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision
- name: Install dependencies
run: |
uv pip install --no-build-isolation -e . --override /tmp/torch-pin.txt
python scripts/cutcrossentropy_install.py --uv | sh
uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
pip3 show torch
pip3 install --no-cache-dir --no-build-isolation -U -e .
python scripts/unsloth_install.py | sh
python scripts/cutcrossentropy_install.py | sh
pip3 install -r requirements-dev.txt -r requirements-tests.txt
- name: cleanup pip cache
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
- name: Make sure PyTorch version wasn't clobbered
run: |
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
- name: Ensure axolotl CLI was installed
run: |
@@ -180,27 +188,33 @@ jobs:
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python_version }}
cache: 'pip' # caching pip dependencies
- name: Install uv
uses: astral-sh/setup-uv@v7
- name: upgrade pip
run: |
pip3 install --upgrade pip
pip3 install --upgrade packaging==26.0 setuptools==75.8.0 setuptools_scm build wheel psutil
- name: Install PyTorch
run: |
uv pip install torch==${{ matrix.pytorch_version }} torchvision
uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision
- name: Install dependencies
run: |
uv pip install packaging setuptools_scm build wheel psutil
pip3 show torch
python -m build --no-isolation --sdist
uv pip install --no-build-isolation dist/axolotl*.tar.gz --override /tmp/torch-pin.txt
python scripts/cutcrossentropy_install.py --uv | sh
uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
pip3 install --no-cache-dir --no-build-isolation dist/axolotl*.tar.gz
python scripts/unsloth_install.py | sh
python scripts/cutcrossentropy_install.py | sh
pip3 install -r requirements-dev.txt -r requirements-tests.txt
- name: cleanup pip cache
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
- name: Make sure PyTorch version wasn't clobbered
run: |
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
- name: Ensure axolotl CLI was installed
run: |
@@ -277,6 +291,7 @@ jobs:
pytorch: 2.9.1
num_gpus: 1
axolotl_extras:
dockerfile: "Dockerfile-uv.jinja"
steps:
- name: Checkout
uses: actions/checkout@v4
@@ -297,7 +312,7 @@ jobs:
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
- name: Run tests job on Modal
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
@@ -359,7 +374,7 @@ jobs:
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
echo "GPU_TYPE=${{ matrix.gpu_type || 'L40S'}}" >> $GITHUB_ENV
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
- name: Run tests job on Modal
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

View File

@@ -1,6 +1,7 @@
include requirements.txt
include README.md
include LICENSE
include VERSION
include src/setuptools_axolotl_dynamic_dependencies.py
include src/axolotl/utils/chat_templates/templates/*.jinja
include AGENTS.md
recursive-include docs/agents *.md

View File

@@ -95,11 +95,14 @@ Features:
### Installation
```bash
# install uv if you don't already have it installed (restart shell after)
curl -LsSf https://astral.sh/uv/install.sh | sh
#### Using uv (recommended)
# change depending on system
```bash
# install uv if you don't already have it installed
curl -LsSf https://astral.sh/uv/install.sh | sh
source $HOME/.local/bin/env
# CUDA 12.8.1 tends to have better package compatibility
export UV_TORCH_BACKEND=cu128
# create a new virtual environment
@@ -109,6 +112,23 @@ source .venv/bin/activate
uv pip install torch==2.10.0 torchvision
uv pip install --no-build-isolation axolotl[deepspeed]
# recommended - install cut-cross-entropy
uv pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@main"
# (optional) - prefetch flash-attn2 and causal-conv1d kernels
uv run --python 3.12 python -c "from kernels import get_kernel; get_kernel('kernels-community/flash-attn2'); get_kernel('kernels-community/causal-conv1d')"
# Download example axolotl configs, deepspeed configs
axolotl fetch examples
axolotl fetch deepspeed_configs # OPTIONAL
```
#### Using pip
```bash
pip3 install -U packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
# Download example axolotl configs, deepspeed configs
axolotl fetch examples
axolotl fetch deepspeed_configs # OPTIONAL
@@ -118,7 +138,7 @@ axolotl fetch deepspeed_configs # OPTIONAL
Installing with Docker can be less error prone than installing in your own environment.
```bash
docker run --gpus '"all"' --ipc=host --rm -it axolotlai/axolotl:main-latest
docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
```
Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).

View File

@@ -134,6 +134,7 @@ quartodoc:
- monkeypatch.stablelm_attn_hijack_flash
- monkeypatch.trainer_fsdp_optim
- monkeypatch.transformers_fa_utils
- monkeypatch.unsloth_
- monkeypatch.data.batch_dataset_fetcher
- monkeypatch.mixtral
- monkeypatch.gradient_checkpointing.offload_cpu
@@ -326,6 +327,7 @@ website:
- section: "Advanced Features"
contents:
- docs/fsdp_qlora.qmd
- docs/unsloth.qmd
- docs/torchao.qmd
- docs/custom_integrations.qmd
- docs/sequence_parallelism.qmd

View File

@@ -22,6 +22,15 @@ WORKDIR /workspace/axolotl
RUN git fetch origin +$GITHUB_REF && \
git checkout FETCH_HEAD
# If AXOLOTL_EXTRAS is set, append it in brackets
RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
fi
RUN uv pip install packaging==26.0 setuptools==78.1.1
RUN uv pip install torchvision
RUN uv pip uninstall causal_conv1d
@@ -31,21 +40,11 @@ RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
fi
# Override with nightly HF packages for nightly builds
RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
uv pip install --no-deps \
"transformers @ git+https://github.com/huggingface/transformers.git@main" \
"peft @ git+https://github.com/huggingface/peft.git@main" \
"accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
"trl @ git+https://github.com/huggingface/trl.git@main" \
"datasets @ git+https://github.com/huggingface/datasets.git@main"; \
fi
RUN python scripts/unsloth_install.py --uv | sh
RUN python scripts/cutcrossentropy_install.py --uv | sh
# So we can test the Docker image
RUN uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
RUN uv pip install -r requirements-dev.txt -r requirements-tests.txt
# fix so that git fetch/pull from remote works
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \

cicd/Dockerfile.jinja Normal file
View File

@@ -0,0 +1,54 @@
FROM axolotlai/axolotl-base:{{ BASE_TAG }}
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
ENV CUDA="{{ CUDA }}"
ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
ENV GITHUB_REF="{{ GITHUB_REF }}"
ENV GITHUB_SHA="{{ GITHUB_SHA }}"
ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
ENV HF_HOME="{{ HF_HOME }}"
ENV AXOLOTL_DATASET_NUM_PROC="8"
RUN apt-get update && \
apt-get install -y --allow-change-held-packages vim curl nano zstd libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm
WORKDIR /workspace
RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
WORKDIR /workspace/axolotl
RUN git fetch origin +$GITHUB_REF && \
git checkout FETCH_HEAD
# If AXOLOTL_EXTRAS is set, append it in brackets
RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
fi
RUN pip install packaging==26.0 setuptools==78.1.1 psutil
RUN pip uninstall -y causal_conv1d
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
else \
pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
fi
RUN python scripts/unsloth_install.py | sh
RUN python scripts/cutcrossentropy_install.py | sh
# So we can test the Docker image
RUN pip install -r requirements-dev.txt -r requirements-tests.txt
# fix so that git fetch/pull from remote works
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
git config --get remote.origin.fetch
# helper for huggingface-login cli
RUN git config --global credential.helper store

View File

@@ -1,7 +1,7 @@
#!/bin/bash
set -e
python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__, f'Expected torch $PYTORCH_VERSION but got {torch.__version__}'"
python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
set -o pipefail
for i in 1 2 3; do

View File

@@ -17,7 +17,7 @@ template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
template_env = jinja2.Environment(
loader=template_loader, autoescape=select_autoescape()
)
dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile-uv.jinja")
dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
df_template = template_env.get_template(dockerfile)
df_args = {

View File

@@ -16,7 +16,7 @@ template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
template_env = jinja2.Environment(
loader=template_loader, autoescape=select_autoescape()
)
dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile-uv.jinja")
dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
df_template = template_env.get_template(dockerfile)
df_args = {

View File

@@ -32,7 +32,7 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \
pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
else \
pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
fi && \
fi && \
python scripts/unsloth_install.py | sh && \
python scripts/cutcrossentropy_install.py | sh && \
pip install pytest && \
pip cache purge

View File

@@ -33,6 +33,7 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \
else \
uv pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
fi && \
python scripts/unsloth_install.py --uv | sh && \
python scripts/cutcrossentropy_install.py --uv | sh && \
uv pip install pytest && \
uv cache clean

View File

@@ -76,9 +76,8 @@ datasets:
Make sure you have an [editable install](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) of Axolotl, which ensures that changes you make to the code are reflected at runtime. Run the following commands from the root of this project:
```bash
export UV_TORCH_BACKEND=cu128 # or cu130
uv sync --extra flash-attn --extra deepspeed --group dev --group test
source .venv/bin/activate
pip3 install packaging
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
```
#### Remote Hosts
@@ -209,17 +208,17 @@ cd axolotl
Next, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:[^2]
```bash
docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl-uv:main-latest
docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-py3.10-cu118-2.0.1
```
>[!Tip]
> To understand which containers are available, see the [Docker section of the README](../README.md#docker) and the [DockerHub repo](https://hub.docker.com/r/axolotlai/axolotl/tags). For details of how the Docker containers are built, see axolotl's [Docker CI builds](../.github/workflows/main.yml).
You will now be in the container. Next, install Axolotl with dev dependencies:
You will now be in the container. Next, perform an editable install of Axolotl:
```bash
uv sync --extra flash-attn --extra deepspeed --group dev --group test
source .venv/bin/activate
pip3 install packaging
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
```
### Attach To Container

View File

@@ -6,30 +6,23 @@ format:
toc-depth: 4
---
This section describes the different Docker images that are released by AxolotlAI at
[Docker Hub](https://hub.docker.com/u/axolotlai).
This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).
::: {.callout-important}
For Blackwell GPUs, please use the tags with PyTorch 2.9.1 and CUDA 12.8.
:::
::: {.callout-tip}
Each image below is available in a **uv variant** that uses [uv](https://docs.astral.sh/uv/) with
a relocatable venv (`/workspace/axolotl-venv`) instead of Miniconda + pip. Append `-uv` to the image name
(e.g. `axolotlai/axolotl-base-uv`). Tags follow the same format. We recommend the uv images for new deployments.
For Blackwell GPUs, please use the tags with PyTorch 2.7.1 and CUDA 12.8.
:::
## Base
The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image.
It includes python, torch, git, git-lfs, awscli, pydantic, and more.
The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.
#### Image
| Variant | Image | Docker Hub |
|---------|-------|------------|
| pip | `axolotlai/axolotl-base` | [Link](https://hub.docker.com/r/axolotlai/axolotl-base) |
| uv | `axolotlai/axolotl-base-uv` | [Link](https://hub.docker.com/r/axolotlai/axolotl-base-uv) |
```
axolotlai/axolotl-base
```
Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-base)
#### Tags format
@@ -39,10 +32,8 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}
Tags examples:
- `main-base-py3.11-cu128-2.8.0`
- `main-base-py3.11-cu128-2.9.1`
- `main-base-py3.12-cu128-2.10.0`
- `main-base-py3.12-cu130-2.9.1`
- `main-base-py3.12-cu130-2.10.0`
## Main
@@ -50,10 +41,11 @@ The main image is the image that is used to run Axolotl. It is based on the `axo
#### Image
| Variant | Image | Docker Hub |
|---------|-------|------------|
| pip | `axolotlai/axolotl` | [Link](https://hub.docker.com/r/axolotlai/axolotl) |
| uv | `axolotlai/axolotl-uv` | [Link](https://hub.docker.com/r/axolotlai/axolotl-uv) |
```
axolotlai/axolotl
```
Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl)
#### Tags format {#sec-main-tags}
@@ -61,7 +53,7 @@ The main image is the image that is used to run Axolotl. It is based on the `axo
# on push to main
main-py{python_version}-cu{cuda_version}-{pytorch_version}
# latest main (currently torch 2.9.1, python 3.11, cuda 12.8)
# latest main (currently torch 2.6.0, python 3.11, cuda 12.4)
main-latest
# nightly build
@@ -79,12 +71,11 @@ There may be some extra tags appended to the image, like `-vllm` which installs
Tags examples:
- `main-py3.11-cu128-2.8.0`
- `main-py3.11-cu128-2.9.1`
- `main-py3.12-cu128-2.10.0`
- `main-py3.12-cu130-2.9.1`
- `main-py3.12-cu130-2.10.0`
- `main-latest`
- `main-20260315-py3.11-cu128-2.9.1`
- `main-20250303-py3.11-cu124-2.6.0`
- `main-20250303-py3.11-cu126-2.6.0`
- `0.12.0`
## Cloud
@@ -99,10 +90,11 @@ Jupyter lab is run by default. Set `JUPYTER_DISABLE=1` in the environment variab
#### Image
| Variant | Image | Docker Hub |
|---------|-------|------------|
| pip | `axolotlai/axolotl-cloud` | [Link](https://hub.docker.com/r/axolotlai/axolotl-cloud) |
| uv | `axolotlai/axolotl-cloud-uv` | [Link](https://hub.docker.com/r/axolotlai/axolotl-cloud-uv) |
```
axolotlai/axolotl-cloud
```
Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-cloud)
#### Tags format

View File

@@ -15,30 +15,64 @@ This guide covers all the ways you can install and set up Axolotl for your envir
- NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
- Python ≥3.11
- PyTorch ≥2.9.0
- PyTorch ≥2.6.0
## Installation {#sec-installation}
## Installation Methods {#sec-installation-methods}
::: {.callout-important}
Please make sure to have Pytorch installed before installing Axolotl in your local environment.
Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
:::
::: {.callout-important}
For Blackwell GPUs, please use Pytorch 2.9.1 and CUDA 12.8.
:::
### Quick Install {#sec-uv}
### PyPI Installation (Recommended) {#sec-pypi}
Axolotl uses [uv](https://docs.astral.sh/uv/) as its package manager. uv is a fast, reliable Python package installer and resolver built in Rust.
```{.bash}
pip3 install -U packaging setuptools wheel ninja
pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
```
Install uv if not already installed:
We use `--no-build-isolation` so the build can detect an already-installed PyTorch
version and avoid clobbering it, and so that dependencies which are specific to the
PyTorch version (or to other installed co-dependencies) resolve to the correct
versions.
### uv Installation {#sec-uv}
uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments.
Install uv if not already installed
```{.bash}
curl -LsSf https://astral.sh/uv/install.sh | sh
source $HOME/.local/bin/env
```
Choose your CUDA version (e.g. `cu128`, `cu130`), create a venv, and install:
Choose your CUDA version to use with PyTorch; e.g. `cu124`, `cu126`, `cu128`,
then create the venv and activate
```{.bash}
export UV_TORCH_BACKEND=cu128 # or cu130
export UV_TORCH_BACKEND=cu126
uv venv --no-project --relocatable
source .venv/bin/activate
uv pip install --no-build-isolation axolotl[flash-attn,deepspeed]
```
Install PyTorch
- PyTorch 2.6.0 recommended
```{.bash}
uv pip install packaging setuptools wheel
uv pip install torch==2.6.0
uv pip install awscli pydantic
```
Install axolotl from PyPi
```{.bash}
uv pip install --no-build-isolation axolotl[deepspeed,flash-attn]
# optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO
uv pip install --no-build-isolation axolotl[deepspeed,flash-attn,vllm]
```
### Edge/Development Build {#sec-edge-build}
@@ -48,17 +82,14 @@ For the latest features between releases:
```{.bash}
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl
export UV_TORCH_BACKEND=cu128 # or cu130
uv sync --extra flash-attn --extra deepspeed
source .venv/bin/activate
pip3 install -U packaging setuptools wheel ninja
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
```
`uv sync` creates a `.venv`, installs exact pinned versions from `uv.lock`, and sets up an editable install automatically.
### Docker {#sec-docker}
```{.bash}
docker run --gpus '"all"' --rm -it --ipc=host axolotlai/axolotl-uv:main-latest
docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
```
For development with Docker:
@@ -75,12 +106,12 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
--ulimit memlock=-1 --ulimit stack=67108864 \
--mount type=bind,src="${PWD}",target=/workspace/axolotl \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
axolotlai/axolotl-uv:main-latest
axolotlai/axolotl:main-latest
```
:::
::: {.callout-important}
For Blackwell GPUs, please use `axolotlai/axolotl-uv:main-py3.11-cu128-2.9.1` or the cloud variant `axolotlai/axolotl-cloud-uv:main-py3.11-cu128-2.9.1`.
For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.9.1` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.9.1`.
:::
Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.
@@ -91,7 +122,7 @@ Please refer to the [Docker documentation](docker.qmd) for more information on t
For providers supporting Docker:
- Use `axolotlai/axolotl-cloud-uv:main-latest`
- Use `axolotlai/axolotl-cloud:main-latest`
- Available on:
- [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
- [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=axolotl&utm_medium=partner&utm_campaign=template_launch_july2025&utm_content=docs_link)
@@ -110,7 +141,7 @@ For providers supporting Docker:
### macOS {#sec-macos}
```{.bash}
uv pip install --no-build-isolation -e '.'
pip3 install --no-build-isolation -e '.'
```
See @sec-troubleshooting for Mac-specific issues.
@@ -121,44 +152,21 @@ See @sec-troubleshooting for Mac-specific issues.
We recommend using WSL2 (Windows Subsystem for Linux) or Docker.
:::
## Migrating from pip to uv {#sec-migrating}
## Environment Managers {#sec-env-managers}
If you have an existing pip-based Axolotl installation, you can migrate to uv:
### Conda/Pip venv {#sec-conda}
```{.bash}
# Install uv
curl -LsSf https://astral.sh/uv/install.sh | sh
source $HOME/.local/bin/env
# Create a fresh venv (recommended for a clean start)
export UV_TORCH_BACKEND=cu128 # or cu130
uv venv --no-project --relocatable
source .venv/bin/activate
# Reinstall axolotl
uv pip install --no-build-isolation axolotl[flash-attn,deepspeed]
```
## Using pip (Alternative) {#sec-pip}
If you are unable to install uv, you can still use pip directly.
::: {.callout-important}
Please make sure to have PyTorch installed before installing Axolotl with pip.
Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
:::
```{.bash}
pip3 install -U packaging setuptools wheel ninja
pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
```
For editable/development installs:
```{.bash}
pip3 install -U packaging setuptools wheel ninja
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
```
1. Install Python ≥3.11
2. Install PyTorch: https://pytorch.org/get-started/locally/
3. Install Axolotl:
```{.bash}
pip3 install -U packaging setuptools wheel ninja
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
```
4. (Optional) Login to Hugging Face:
```{.bash}
hf auth login
```
## Troubleshooting {#sec-troubleshooting}

docs/unsloth.qmd Normal file
View File

@@ -0,0 +1,53 @@
---
title: "Unsloth"
description: "Hyper-optimized QLoRA finetuning for single GPUs"
---
### Overview
Unsloth provides hand-written, optimized kernels for LLM finetuning that slightly improve speed and reduce VRAM usage
relative to standard industry baselines.
::: {.callout-important}
Due to breaking changes in transformers `v4.48.0`, users will need to downgrade to `<=v4.47.1` to use this patch.
This will later be deprecated in favor of [LoRA Optimizations](lora_optims.qmd).
:::
### Installation
The following will install the correct unsloth and extras from source.
```bash
python scripts/unsloth_install.py | sh
```
### Usage
Axolotl exposes a few configuration options to try out unsloth and get most of the performance gains.
Our unsloth integration is currently limited to the following model architectures:
- llama
These options are specific to LoRA finetuning and cannot be used for multi-GPU finetuning
```yaml
unsloth_lora_mlp: true
unsloth_lora_qkv: true
unsloth_lora_o: true
```
These options are composable and can be used with multi-gpu finetuning
```yaml
unsloth_cross_entropy_loss: true
unsloth_rms_norm: true
unsloth_rope: true
```
### Limitations
- Single GPU only, i.e. no multi-GPU support
- No DeepSpeed or FSDP support (these require multi-GPU)
- LoRA + QLoRA support only. No full fine tunes or fp8 support.
- Limited model architecture support. Llama, Phi, Gemma, Mistral only
- No MoE support.

View File

@@ -15,7 +15,8 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r
Here is an example of how to install from pip:
```bash
# Ensure you have a compatible version of Pytorch installed
uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
pip3 install packaging setuptools wheel ninja
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
```
2. Run one of the finetuning examples below.
@@ -34,7 +35,7 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r
**LFM2-MoE**
```bash
uv pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
# LoRA SFT (1x48GB @ 16.2GiB)
axolotl train examples/LiquidAI/lfm2-8b-a1b-lora.yaml
@@ -44,7 +45,7 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r
- **Installation Error**: If you encounter `ImportError: ... undefined symbol ...` or `ModuleNotFoundError: No module named 'causal_conv1d_cuda'`, the `causal-conv1d` package may have been installed incorrectly. Try uninstalling it:
```bash
uv pip uninstall causal-conv1d
pip uninstall -y causal-conv1d
```
- **Dataset Loading**: Read more on how to load your own dataset in our [documentation](https://docs.axolotl.ai/docs/dataset_loading.html).

View File

@@ -15,7 +15,8 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl
uv pip install --no-build-isolation -e '.[flash-attn]'
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation -e '.[flash-attn]'
# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
python scripts/cutcrossentropy_install.py | sh
@@ -30,7 +31,7 @@ python scripts/cutcrossentropy_install.py | sh
# For those using our Docker image, use the below path.
export CUDA_HOME=/usr/local/cuda
uv pip install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
```
For any installation errors, see [XIELU Installation Issues](#xielu-installation-issues)
@@ -66,7 +67,7 @@ If those didn't help, please try the below solutions:
1. Pass env for CMAKE and try install again:
```bash
Python_EXECUTABLE=$(which python) uv pip install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
Python_EXECUTABLE=$(which python) pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
```
2. Git clone the repo and manually hardcode python path:
@@ -91,7 +92,7 @@ If those didn't help, please try the below solutions:
```
```bash
uv pip install . --no-build-isolation --no-deps
pip3 install . --no-build-isolation --no-deps
```
## Optimization Guides

View File

@@ -17,7 +17,8 @@ Thanks to the team at Arcee.ai for using Axolotl in supervised fine-tuning the A
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl
uv pip install --no-build-isolation -e '.[flash-attn]'
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation -e '.[flash-attn]'
# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
python scripts/cutcrossentropy_install.py | sh

View File

@@ -16,7 +16,8 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
```
2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage

View File

@@ -10,16 +10,17 @@ Gemma-3n is a family of multimodal models from Google found on [HuggingFace](htt
```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
```
2. In addition to Axolotl's requirements, Gemma-3n requires:
```bash
uv pip install timm==1.0.17
pip3 install timm==1.0.17
# for loading audio data
uv pip install librosa==0.11.0
pip3 install librosa==0.11.0
```
3. Download sample dataset files

View File

@@ -14,7 +14,8 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
```
2. Choose one of the following configs below for training the 20B model. (for 120B, see [below](#training-120b))
@@ -86,7 +87,7 @@ for more information about using a special vllm-openai docker image for inferenc
Optionally, vLLM can be installed from nightly:
```bash
uv pip install --no-build-isolation --pre -U vllm --extra-index-url https://wheels.vllm.ai/nightly
pip install --no-build-isolation --pre -U vllm --extra-index-url https://wheels.vllm.ai/nightly
```
and the vLLM server can be started with the following command (modify `--tensor-parallel-size 8` to match your environment):
```bash

View File

@@ -15,7 +15,8 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl
uv pip install --no-build-isolation -e '.[flash-attn]'
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation -e '.[flash-attn]'
# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
python scripts/cutcrossentropy_install.py | sh

View File

@@ -13,7 +13,8 @@ Tencent released a family of opensource models called HunYuan with varying param
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl
uv pip install --no-build-isolation -e '.[flash-attn]'
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation -e '.[flash-attn]'
# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
python scripts/cutcrossentropy_install.py | sh

View File

@@ -11,7 +11,7 @@ This guide shows how to fine-tune it with Axolotl.
2. Install `timm` for vision model support:
```bash
uv pip install timm==1.0.19
pip install timm==1.0.19
```
3. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

View File

@@ -14,7 +14,8 @@ Thanks to the team at MistralAI for giving us early access to prepare for these
```bash
# Ensure you have Pytorch installed (Pytorch 2.7.0 min)
uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
```
2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage

View File

@@ -12,7 +12,7 @@ Before starting, ensure you have:
1. Install the required vision lib:
```bash
uv pip install 'mistral-common[opencv]==1.8.5'
pip install 'mistral-common[opencv]==1.8.5'
```
2. Download the example dataset image:

View File

@@ -23,7 +23,7 @@ Note: This is still experimental given it is based on transformers v5 RC.
git checkout transformers-v5
# Install packages for transformers v5
uv pip install -e .
pip install -e .
```
4. Run the fine-tuning:

View File

@@ -12,7 +12,7 @@ Before starting, ensure you have:
1. Install the required vision lib:
```bash
uv pip install 'mistral-common[opencv]==1.8.6'
pip install 'mistral-common[opencv]==1.8.6'
```
2. Download the example dataset image:

View File

@@ -12,7 +12,7 @@ Before starting, ensure you have:
1. Install the required vision lib:
```bash
uv pip install 'mistral-common[opencv]==1.8.5'
pip install 'mistral-common[opencv]==1.8.5'
```
2. Download the example dataset image:

View File

@@ -13,7 +13,7 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
3. Install transformers from main
```bash
uv pip install git+https://github.com/huggingface/transformers.git
pip install git+https://github.com/huggingface/transformers.git
```
4. Run one of the example configs:

View File

@@ -12,7 +12,7 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
3. Install FLA for improved performance
```bash
uv pip uninstall causal-conv1d && uv pip install flash-linear-attention==0.4.1
pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1
```
4. Run the finetuning example:

View File

@@ -10,7 +10,7 @@
3. Install FLA for sample packing support with the Gated DeltaNet linear attention layers:
```bash
uv pip uninstall causal-conv1d && uv pip install flash-linear-attention==0.4.1
pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1
```
> FLA is required when `sample_packing: true`. Without it, training raises a `RuntimeError` on packed sequences. Vision configs use `sample_packing: false` so FLA is optional there.

View File

@@ -11,7 +11,8 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
Here is an example of how to install from pip:
```bash
# Ensure you have a compatible version of Pytorch installed
uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
pip3 install packaging setuptools wheel ninja
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
# Install Cut Cross Entropy
python scripts/cutcrossentropy_install.py | sh

View File

@@ -13,13 +13,14 @@ This guide shows how to fine-tune SmolVLM2 models with Axolotl.
Here is an example of how to install from pip:
```bash
# Ensure you have a compatible version of Pytorch installed
uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
pip3 install packaging setuptools wheel ninja
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
```
2. Install an extra dependency:
```bash
uv pip install num2words==0.5.14
pip3 install num2words==0.5.14
```
3. Run the finetuning example:

View File

@@ -12,15 +12,16 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
```
2. Please install the below.
```bash
# audio
uv pip install librosa==0.11.0
uv pip install 'mistral_common[audio]==1.8.3'
pip3 install librosa==0.11.0
pip3 install 'mistral_common[audio]==1.8.3'
# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
python scripts/cutcrossentropy_install.py | sh

View File

@@ -1,165 +1,15 @@
[build-system]
requires = ["setuptools>=64", "wheel", "setuptools_scm>=8"]
requires = ["setuptools>=64", "wheel", "setuptools_scm>=8", "packaging==26.0"]
build-backend = "setuptools.build_meta"
[project]
name = "axolotl"
dynamic = ["version"]
dynamic = ["version", "dependencies", "optional-dependencies"]
description = "LLM Trainer"
readme = "README.md"
requires-python = ">=3.10"
# license = "Apache-2.0"
dependencies = [
# Core ML stack
"torch>=2.6.0",
"packaging==26.0",
"huggingface_hub>=1.1.7",
"peft>=0.19.1,<0.20.0",
"tokenizers>=0.22.1",
"transformers==5.5.4",
"accelerate==1.13.0",
"datasets>=4.8.4,<4.9.0",
"trl==1.1.0",
"hf_xet==1.4.3",
"kernels==0.13.0",
"trackio>=0.16.1",
"typing-extensions>=4.15.0",
"optimum==1.16.2",
"hf_transfer",
"sentencepiece",
"gradio>=6.2.0,<7.0",
"modal==1.3.0.post1",
"pydantic>=2.10.6",
"addict",
"fire",
"PyYAML>=6.0",
"requests",
"wandb",
"einops",
"colorama",
"numba>=0.61.2",
"numpy>=2.2.6",
# Evaluation & metrics
"evaluate==0.4.1",
"scipy",
"nvidia-ml-py==12.560.30",
"art",
"tensorboard",
"python-dotenv==1.0.1",
# Remote filesystems
"s3fs>=2024.5.0",
"gcsfs>=2025.3.0",
"adlfs>=2024.5.0",
"ocifs==1.3.2",
"zstandard==0.22.0",
"fastcore",
# lm eval harness
"lm_eval==0.4.11",
"langdetect==1.0.9",
"immutabledict==4.2.0",
"antlr4-python3-runtime==4.13.2",
"schedulefree==1.4.1",
"openenv-core==0.1.0",
# Axolotl contribs
"axolotl-contribs-lgpl==0.0.7",
"axolotl-contribs-mit==0.0.6",
# Telemetry
"posthog==6.7.11",
"mistral-common==1.11.0",
# Platform-specific (Linux only)
"bitsandbytes==0.49.1 ; sys_platform != 'darwin'",
"triton>=3.4.0 ; sys_platform != 'darwin'",
"xformers>=0.0.23.post1 ; sys_platform != 'darwin'",
"liger-kernel==0.7.0 ; sys_platform != 'darwin'",
"torchao==0.17.0 ; sys_platform != 'darwin' and platform_machine != 'aarch64'",
# Architecture-specific
"fla-core==0.4.1 ; platform_machine != 'aarch64'",
"flash-linear-attention==0.4.1 ; platform_machine != 'aarch64'",
]
[project.optional-dependencies]
flash-attn = ["flash-attn==2.8.3"]
ring-flash-attn = [
"flash-attn==2.8.3",
"ring-flash-attn>=0.1.7",
]
deepspeed = [
"deepspeed>=0.18.6,<0.19.0",
"deepspeed-kernels",
]
mamba-ssm = [
"mamba-ssm==1.2.0.post1",
"causal_conv1d",
]
auto-gptq = [
"auto-gptq==0.5.1",
]
mlflow = [
"mlflow",
]
galore = [
"galore_torch",
]
apollo = [
"apollo-torch",
]
optimizers = [
"galore_torch",
"apollo-torch",
"lomo-optim==0.1.1",
"torch-optimi==0.2.1",
"came_pytorch==0.1.3",
]
ray = [
"ray[train]>=2.52.1",
]
vllm = [
"vllm>=0.15.0",
]
llmcompressor = [
"llmcompressor>=0.10.0",
]
fbgemm-gpu = ["fbgemm-gpu-genai>=1.3.0"]
opentelemetry = [
"opentelemetry-api",
"opentelemetry-sdk",
"opentelemetry-exporter-prometheus",
"prometheus-client",
]
[dependency-groups]
dev = [
"black",
"mypy",
"pre-commit",
"types-requests",
"quartodoc",
"jupyter",
"blobfile",
"tiktoken",
]
test = [
"codecov",
"codecov-cli",
"pytest",
"pytest-cov",
"pytest-retry",
"pytest-sugar",
"pytest-xdist",
"tbparse",
]
[project.scripts]
axolotl = "axolotl.cli.main:main"
@@ -168,15 +18,18 @@ Homepage = "https://axolotl.ai/"
Documentation = "https://docs.axolotl.ai/"
Repository = "https://github.com/axolotl-ai-cloud/axolotl.git"
[tool.setuptools]
include-package-data = true
[tool.setuptools_scm]
[tool.setuptools.packages.find]
where = ["src"]
[tool.setuptools]
py-modules = ["setuptools_axolotl_dynamic_dependencies"]
include-package-data = true
[tool.setuptools.dynamic]
version = { file = "VERSION" }
[tool.setuptools.cmdclass]
build_py = "setuptools_axolotl_dynamic_dependencies.BuildPyCommand"
[tool.ruff]
line-length = 88
target-version = "py310"
@@ -214,43 +67,5 @@ markers = [
"slow: marks tests as slow",
]
# UV specific configuration
[tool.uv]
prerelease = "allow"
conflicts = [
[
{ package = "axolotl" },
{ extra = "vllm" },
],
[
{ package = "axolotl" },
{ extra = "flash-attn" },
],
[
{ package = "axolotl" },
{ extra = "ring-flash-attn" },
],
[
{ package = "axolotl" },
{ extra = "mamba-ssm" },
],
[
{ package = "axolotl" },
{ extra = "auto-gptq" },
],
[
{ package = "axolotl" },
{ extra = "fbgemm-gpu" },
],
[
{ package = "axolotl" },
{ extra = "llmcompressor" },
],
]
[tool.uv.extra-build-dependencies]
mamba-ssm = [{ requirement = "torch", match-runtime = true }]
causal-conv1d = [{ requirement = "torch", match-runtime = true }]
flash-attn = [{ requirement = "torch", match-runtime = true }]
deepspeed = [{ requirement = "torch", match-runtime = true }]
auto-gptq = [{ requirement = "torch", match-runtime = true }]
axolotl = ["huggingface_hub"]

8
requirements-dev.txt Normal file
View File

@@ -0,0 +1,8 @@
black
mypy
pre-commit
types-requests
quartodoc
jupyter
blobfile
tiktoken

8
requirements-tests.txt Normal file
View File

@@ -0,0 +1,8 @@
codecov
codecov-cli
pytest
pytest-cov
pytest-retry
pytest-sugar
pytest-xdist
tbparse

78
requirements.txt Normal file
View File

@@ -0,0 +1,78 @@
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
# START section of dependencies that don't install on Darwin/MacOS
bitsandbytes==0.49.1
triton>=3.4.0
mamba-ssm==1.2.0.post1
xformers>=0.0.23.post1
liger-kernel==0.7.0
# END section
packaging==26.0
huggingface_hub>=1.1.7
peft>=0.19.0,<0.20.0
tokenizers>=0.22.1
transformers==5.5.4
accelerate==1.13.0
datasets>=4.8.4,<4.9.0
deepspeed>=0.18.6,<0.19.0
trl==1.1.0
hf_xet==1.4.3
kernels==0.13.0
fla-core==0.4.1
flash-linear-attention==0.4.1
trackio>=0.16.1
typing-extensions>=4.15.0
optimum==1.16.2
hf_transfer
sentencepiece
gradio>=6.2.0,<7.0
modal==1.3.0.post1
pydantic>=2.10.6
addict
fire
PyYAML>=6.0
requests
wandb
einops
colorama
numba>=0.61.2
numpy>=2.2.6
# qlora things
evaluate==0.4.1
scipy
nvidia-ml-py==12.560.30
art
tensorboard
python-dotenv==1.0.1
# remote filesystems
s3fs>=2024.5.0
gcsfs>=2025.3.0
adlfs>=2024.5.0
ocifs==1.3.2
zstandard==0.22.0
fastcore
# lm eval harness
lm_eval==0.4.11
langdetect==1.0.9
immutabledict==4.2.0
antlr4-python3-runtime==4.13.2
torchao==0.17.0
openenv-core==0.1.0
schedulefree==1.4.1
axolotl-contribs-lgpl==0.0.7
axolotl-contribs-mit==0.0.6
# telemetry
posthog==6.7.11
mistral-common==1.11.0

View File

@@ -1,479 +0,0 @@
#!/usr/bin/env python3
"""Build a disposable Hugging Face Kernel Hub package for ScatterMoE LoRA.
This script does not move or edit the in-tree Axolotl kernel sources. It copies
``src/axolotl/integrations/kernels/libs/scattermoe_lora`` into an ignored
build directory and emits a universal HF kernels project that can be pushed to
the Hub.
"""
from __future__ import annotations
import argparse
import fnmatch
import hashlib
import json
import os
import shutil
import subprocess
import sys
from importlib import metadata
from pathlib import Path
PACKAGE_NAME = "scattermoe_lora"
BUILD_VARIANT = "torch-universal"
DEFAULT_REPO_ID = "kernels-community/scattermoe-lora"
HF_REPO_TYPE = "kernel"
HF_KERNEL_URL_PREFIX = "https://hf.co/kernels"
REPO_ROOT = Path(__file__).resolve().parents[1]
DEFAULT_SOURCE_DIR = (
REPO_ROOT / "src" / "axolotl" / "integrations" / "kernels" / "libs" / PACKAGE_NAME
)
DEFAULT_OUTPUT_DIR = REPO_ROOT / "build" / "hf-kernels" / PACKAGE_NAME
EXCLUDED_DIRS = {
"__pycache__",
".mypy_cache",
".pytest_cache",
".ruff_cache",
}
EXCLUDED_FILE_PATTERNS = {
"*.pyc",
"*.pyo",
"*.so",
".DS_Store",
}
TEXT_REPLACEMENTS = {
"from axolotl.integrations.kernels.libs.scattermoe_lora.selective_dequant import": (
"from .selective_dequant import"
),
"from axolotl.integrations.kernels.libs.scattermoe_lora.selective_dequant_kernel import": (
"from .selective_dequant_kernel import"
),
"from axolotl.integrations.kernels.libs.scattermoe_lora.kernels.ops import": (
"from .ops import"
),
}
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description=(
"Copy Axolotl's ScatterMoE LoRA Triton kernels into a disposable "
"HF Kernel Hub universal package."
)
)
parser.add_argument(
"--source-dir",
type=Path,
default=DEFAULT_SOURCE_DIR,
help=f"ScatterMoE LoRA source package to copy. Default: {DEFAULT_SOURCE_DIR}",
)
parser.add_argument(
"--output-dir",
type=Path,
default=DEFAULT_OUTPUT_DIR,
help=f"Destination build/dist directory. Default: {DEFAULT_OUTPUT_DIR}",
)
parser.add_argument(
"--repo-id",
default=DEFAULT_REPO_ID,
help=f"HF Hub repo id to write into build.toml. Default: {DEFAULT_REPO_ID}",
)
parser.add_argument(
"--version",
type=int,
default=1,
help="Kernel major version written to build.toml and metadata.json.",
)
parser.add_argument(
"--force",
action="store_true",
help="Delete the output directory first if it already exists.",
)
parser.add_argument(
"--no-source-layout",
action="store_true",
help="Only write the shippable build/ tree, not torch-ext/ sources.",
)
parser.add_argument(
"--upload",
action="store_true",
help=(
"Upload the generated universal kernel package with huggingface_hub. "
"This bypasses kernel-builder and is intended for pure Python/Triton "
"universal kernels."
),
)
parser.add_argument(
"--private",
action="store_true",
help="Create the HF Hub repo as private when used with --upload.",
)
parser.add_argument(
"--skip-version-branch",
action="store_true",
help="With --upload, only upload main and skip the v<version> branch.",
)
return parser.parse_args()
def should_skip_file(path: Path) -> bool:
return any(
fnmatch.fnmatch(path.name, pattern) for pattern in EXCLUDED_FILE_PATTERNS
)
def iter_source_files(source_dir: Path) -> list[Path]:
files: list[Path] = []
for root, dirs, filenames in os.walk(source_dir):
dirs[:] = sorted(d for d in dirs if d not in EXCLUDED_DIRS)
for filename in sorted(filenames):
path = Path(root) / filename
if not should_skip_file(path):
files.append(path)
return files
def content_hash(source_dir: Path) -> str:
digest = hashlib.sha1()
for path in iter_source_files(source_dir):
rel = path.relative_to(source_dir).as_posix()
digest.update(rel.encode("utf-8"))
digest.update(b"\0")
digest.update(path.read_bytes())
digest.update(b"\0")
return digest.hexdigest()[:10]
def git_revision() -> str:
try:
result = subprocess.run(
["git", "rev-parse", "--short", "HEAD"],
cwd=REPO_ROOT,
check=True,
capture_output=True,
text=True,
)
except (OSError, subprocess.CalledProcessError):
return "unknown"
return result.stdout.strip() or "unknown"
def transform_python_source(text: str, rel_path: Path, op_namespace: str) -> str:
for old, new in TEXT_REPLACEMENTS.items():
text = text.replace(old, new)
if rel_path.as_posix() == "gemma4_experts.py":
text = text.replace(
" from axolotl.integrations.kernels.constants import resolve_experts_class",
(
" raise RuntimeError(\n"
' "patch_gemma4_scattermoe is only available from the in-tree Axolotl "\n'
' "integration. Use register_scattermoe_experts() with the standalone "\n'
' "HF kernel package."\n'
" )"
),
)
return text.replace("scattermoe::", f"{op_namespace}::")
def copy_package(source_dir: Path, package_dir: Path, op_namespace: str) -> None:
for source in iter_source_files(source_dir):
rel_path = source.relative_to(source_dir)
destination = package_dir / rel_path
destination.parent.mkdir(parents=True, exist_ok=True)
if source.suffix == ".py":
text = source.read_text(encoding="utf-8")
text = transform_python_source(text, rel_path, op_namespace)
destination.write_text(text, encoding="utf-8")
else:
shutil.copy2(source, destination)
write_ops_module(package_dir / "_ops.py", op_namespace)
def write_ops_module(path: Path, op_namespace: str) -> None:
path.write_text(
"\n".join(
[
"import torch",
"",
f"ops = torch.ops.{op_namespace}",
"",
"",
"def add_op_namespace_prefix(op_name: str) -> str:",
f' return f"{op_namespace}::{{op_name}}"',
"",
]
),
encoding="utf-8",
)
def write_build_toml(path: Path, repo_id: str, version: int) -> None:
lines = [
"[general]",
f'name = "{PACKAGE_NAME}"',
"universal = true",
f"version = {version}",
"",
]
if repo_id:
lines.extend(
[
"[general.hub]",
f'repo-id = "{repo_id}"',
"",
]
)
path.write_text("\n".join(lines), encoding="utf-8")
def write_flake(path: Path) -> None:
path.write_text(
"""{
description = "Flake for scattermoe_lora kernel";
inputs = {
builder.url = "github:huggingface/kernels";
};
outputs =
{
self,
builder,
}:
builder.lib.genKernelFlakeOutputs {
inherit self;
path = ./.;
};
}
""",
encoding="utf-8",
)
def write_readme(path: Path, repo_id: str, source_hash: str, op_namespace: str) -> None:
repo_display = repo_id or "<your-org>/scattermoe-lora"
path.write_text(
f"""---
library_name: kernels
license: apache-2.0
tags:
- kernel
- kernels
---
# ScatterMoE LoRA
Standalone Hugging Face Kernel Hub package for Axolotl's ScatterMoE LoRA Triton kernels.
This package is generated from Axolotl's in-tree `scattermoe_lora` sources and is exported as a universal kernel because the implementation is Python/Triton rather than a precompiled C++/CUDA extension.
```python
from kernels import get_kernel
scattermoe_lora = get_kernel("{repo_display}")
```
Export metadata:
- source package: `src/axolotl/integrations/kernels/libs/scattermoe_lora`
- source revision: `{git_revision()}`
- source content hash: `{source_hash}`
- torch custom op namespace: `{op_namespace}`
The generated `build/torch-universal/{PACKAGE_NAME}` directory is the shippable Hub artifact. `torch-ext/{PACKAGE_NAME}` is included so `kernel-builder build-and-copy` can regenerate the universal build tree if desired.
""",
encoding="utf-8",
)
def write_metadata(path: Path, version: int) -> None:
path.write_text(
json.dumps({"version": version}, indent=2, sort_keys=True) + "\n",
encoding="utf-8",
)
def prepare_output_dir(output_dir: Path, force: bool) -> None:
if output_dir.exists():
if not force:
raise FileExistsError(
f"{output_dir} already exists. Re-run with --force to replace it."
)
shutil.rmtree(output_dir)
output_dir.mkdir(parents=True)
def build_package(args: argparse.Namespace) -> Path:
source_dir = args.source_dir.resolve()
output_dir = args.output_dir.resolve()
if not source_dir.is_dir():
raise FileNotFoundError(f"source package does not exist: {source_dir}")
if not (source_dir / "__init__.py").is_file():
raise FileNotFoundError(f"source package is missing __init__.py: {source_dir}")
source_hash = content_hash(source_dir)
op_namespace = f"_{PACKAGE_NAME}_{source_hash}"
prepare_output_dir(output_dir, args.force)
write_build_toml(output_dir / "build.toml", args.repo_id, args.version)
write_flake(output_dir / "flake.nix")
write_readme(output_dir / "README.md", args.repo_id, source_hash, op_namespace)
if not args.no_source_layout:
copy_package(source_dir, output_dir / "torch-ext" / PACKAGE_NAME, op_namespace)
build_package_dir = output_dir / "build" / BUILD_VARIANT / PACKAGE_NAME
copy_package(source_dir, build_package_dir, op_namespace)
write_metadata(build_package_dir.parent / "metadata.json", args.version)
return output_dir
def upload_package(args: argparse.Namespace, output_dir: Path) -> None:
if not args.repo_id:
raise ValueError("--repo-id is required when using --upload")
try:
from huggingface_hub import HfApi, constants as hf_constants
except ImportError as exc:
raise RuntimeError(
"--upload requires huggingface_hub. Install it or run the upload "
"manually with the Hugging Face CLI."
) from exc
try:
hub_version = metadata.version("huggingface_hub")
except metadata.PackageNotFoundError:
hub_version = "unknown"
accepted_repo_types = getattr(
hf_constants,
"REPO_TYPES_WITH_KERNEL",
getattr(hf_constants, "REPO_TYPES", ()),
)
if HF_REPO_TYPE not in accepted_repo_types:
raise RuntimeError(
"Your huggingface_hub installation does not support "
f"repo_type={HF_REPO_TYPE!r} (found huggingface_hub {hub_version}). "
f"Upgrade this interpreter with: {sys.executable} -m pip install --upgrade "
"'huggingface_hub>=1.10.0'"
)
# huggingface_hub 1.11.0 has partial kernel support: create_repo accepts
# "kernel", but upload_folder/create_commit still validate against the
# older REPO_TYPES list. Extend it in-process so those helpers use the
# /api/kernels/... endpoints until upstream broadens that check.
if HF_REPO_TYPE not in hf_constants.REPO_TYPES:
hf_constants.REPO_TYPES.append(HF_REPO_TYPE)
api = HfApi()
try:
repo_id = api.create_repo(
repo_id=args.repo_id,
repo_type=HF_REPO_TYPE,
private=args.private,
exist_ok=True,
).repo_id
except ValueError as exc:
if "Invalid repo type" in str(exc):
raise RuntimeError(
"huggingface_hub rejected repo_type='kernel'. "
f"This usually means the command is running with an older Hub "
f"client than expected (found huggingface_hub {hub_version} at "
f"{sys.executable}). Upgrade with: {sys.executable} -m pip "
"install --upgrade 'huggingface_hub>=1.10.0'"
) from exc
raise
delete_patterns = [
"build/**",
"torch-ext/**",
"build.toml",
"flake.nix",
"README.md",
]
api.upload_folder(
repo_id=repo_id,
repo_type=HF_REPO_TYPE,
folder_path=output_dir,
revision="main",
delete_patterns=delete_patterns,
commit_message="Upload ScatterMoE LoRA universal kernel",
)
print(f"Uploaded main branch: {HF_KERNEL_URL_PREFIX}/{repo_id}")
if args.skip_version_branch:
return
version_branch = f"v{args.version}"
api.create_branch(
repo_id=repo_id,
repo_type=HF_REPO_TYPE,
branch=version_branch,
revision="main",
exist_ok=True,
)
api.upload_folder(
repo_id=repo_id,
repo_type=HF_REPO_TYPE,
folder_path=output_dir,
revision=version_branch,
delete_patterns=delete_patterns,
commit_message=f"Upload ScatterMoE LoRA universal kernel {version_branch}",
)
print(
f"Uploaded version branch: "
f"{HF_KERNEL_URL_PREFIX}/{repo_id}/tree/{version_branch}"
)
def main() -> int:
args = parse_args()
try:
output_dir = build_package(args)
if args.upload:
upload_package(args, output_dir)
except Exception as exc:
print(f"error: {exc}", file=sys.stderr)
return 1
print(f"Wrote ScatterMoE LoRA HF kernel package to: {output_dir}")
print(f"Shippable artifact: {output_dir / 'build' / BUILD_VARIANT / PACKAGE_NAME}")
if args.upload:
print(f'Load it with: get_kernel("{args.repo_id}", version={args.version})')
print(f"Uploaded as Hugging Face repo_type={HF_REPO_TYPE!r}.")
return 0
print("Next step:")
print(" upload this universal Python/Triton kernel directly:")
print(
f" python3 {Path(__file__).as_posix()} "
f"--repo-id {args.repo_id} --force --upload"
)
if shutil.which("kernel-builder") is None:
print(" optional: install kernel-builder for full Nix-based builds:")
print(
" curl -fsSL "
"https://raw.githubusercontent.com/huggingface/kernels/main/install.sh "
"| bash"
)
else:
print(" optional: upload with kernel-builder:")
print(f" cd {output_dir}")
print(" kernel-builder build-and-upload")
return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -0,0 +1,40 @@
# noqa
import sys
try:
import torch
except ImportError as error:
raise ImportError("Install torch via `pip install torch`") from error
from packaging.version import Version as V
use_uv = "--uv" in sys.argv[1:]
v = V(torch.__version__)
cuda = str(torch.version.cuda)
try:
is_ampere = torch.cuda.get_device_capability()[0] >= 8
except RuntimeError:
is_ampere = False
if cuda != "12.1" and cuda != "11.8" and cuda != "12.4":
raise RuntimeError(f"CUDA = {cuda} not supported!")
if v <= V("2.1.0"):
raise RuntimeError(f"Torch = {v} too old!")
elif v <= V("2.1.1"):
x = "cu{}{}-torch211"
elif v <= V("2.1.2"):
x = "cu{}{}-torch212"
elif v < V("2.3.0"):
x = "cu{}{}-torch220"
elif v < V("2.4.0"):
x = "cu{}{}-torch230"
elif v < V("2.5.0"):
x = "cu{}{}-torch240"
elif v < V("2.6.0"):
x = "cu{}{}-torch250"
else:
raise RuntimeError(f"Torch = {v} too new!")
x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
uv_prefix = "uv " if use_uv else ""
print(
f'{uv_prefix}pip install unsloth-zoo==2024.12.1 && {uv_prefix}pip install --no-deps "unsloth[{x}]==2024.12.4"'
)
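# Example output (hypothetical environment: torch 2.2.1 + CUDA 12.1 on an Ampere GPU, run without --uv):
#   pip install unsloth-zoo==2024.12.1 && pip install --no-deps "unsloth[cu121-ampere-torch220]==2024.12.4"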

230
setup.py Normal file
View File

@@ -0,0 +1,230 @@
"""setup.py for axolotl"""
import os
import platform
import re
from importlib.metadata import PackageNotFoundError, version
from pathlib import Path
from setuptools import find_packages, setup
def parse_requirements(extras_require_map):
_install_requires = []
_dependency_links = []
with open("./requirements.txt", encoding="utf-8") as requirements_file:
lines = [r.strip() for r in requirements_file.readlines()]
for line in lines:
is_extras = "deepspeed" in line or "mamba-ssm" in line
if line.startswith("--extra-index-url"):
# Handle custom index URLs
_, url = line.split()
_dependency_links.append(url)
elif not is_extras and line and line[0] != "#":
# Handle standard packages
_install_requires.append(line)
try:
xformers_version = [req for req in _install_requires if "xformers" in req][0]
install_xformers = platform.machine() != "aarch64"
if platform.machine() == "aarch64":
# skip on ARM64
skip_packages = [
"torchao",
"fla-core",
"flash-linear-attention",
]
_install_requires = [
req
for req in _install_requires
if re.split(r"[>=<]", req)[0].strip() not in skip_packages
]
if "Darwin" in platform.system():
# skip packages not compatible with OSX
skip_packages = [
"bitsandbytes",
"triton",
"mamba-ssm",
"xformers",
"liger-kernel",
]
_install_requires = [
req
for req in _install_requires
if re.split(r"[>=<]", req)[0].strip() not in skip_packages
]
print(
_install_requires, [req in skip_packages for req in _install_requires]
)
else:
# detect the version of torch already installed
# and set it so dependencies don't clobber the torch version
try:
torch_version = version("torch")
except PackageNotFoundError:
torch_version = "2.8.0" # default to torch 2.8.0
_install_requires.append(f"torch=={torch_version}")
version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version)
if version_match:
major, minor, patch = version_match.groups()
major, minor = int(major), int(minor)
patch = (
int(patch) if patch is not None else 0
) # Default patch to 0 if not present
else:
raise ValueError("Invalid version format")
torch_parts = torch_version.split("+")
if len(torch_parts) == 2:
torch_cuda_version = torch_parts[1]
_dependency_links.append(
f"https://download.pytorch.org/whl/{torch_cuda_version}"
)
if (major, minor) >= (2, 10):
extras_require_map.pop("fbgemm-gpu")
extras_require_map["fbgemm-gpu"] = [
"fbgemm-gpu==1.5.0",
"fbgemm-gpu-genai==1.5.0",
]
if not install_xformers:
_install_requires.pop(_install_requires.index(xformers_version))
extras_require_map["vllm"] = ["vllm>=0.19.0"]
elif (major, minor) >= (2, 9):
extras_require_map.pop("fbgemm-gpu")
extras_require_map["fbgemm-gpu"] = [
"fbgemm-gpu==1.4.0",
"fbgemm-gpu-genai==1.4.2",
]
if not install_xformers:
_install_requires.pop(_install_requires.index(xformers_version))
if patch == 0:
extras_require_map["vllm"] = ["vllm==0.13.0"]
else:
extras_require_map["vllm"] = ["vllm==0.14.0"]
elif (major, minor) >= (2, 8):
extras_require_map.pop("fbgemm-gpu")
extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.3.0"]
extras_require_map["vllm"] = ["vllm==0.11.0"]
if not install_xformers:
_install_requires.pop(_install_requires.index(xformers_version))
elif (major, minor) >= (2, 7):
_install_requires.pop(_install_requires.index(xformers_version))
if patch == 0:
if install_xformers:
_install_requires.append("xformers==0.0.30")
# vllm 0.9.x is incompatible with latest transformers
extras_require_map.pop("vllm")
else:
if install_xformers:
_install_requires.append("xformers==0.0.31")
extras_require_map["vllm"] = ["vllm==0.10.1"]
elif (major, minor) >= (2, 6):
_install_requires.pop(_install_requires.index(xformers_version))
if install_xformers:
_install_requires.append("xformers==0.0.29.post3")
# since we only support 2.6.0+cu126
_dependency_links.append("https://download.pytorch.org/whl/cu126")
extras_require_map.pop("vllm")
elif (major, minor) >= (2, 5):
_install_requires.pop(_install_requires.index(xformers_version))
if install_xformers:
if patch == 0:
_install_requires.append("xformers==0.0.28.post2")
else:
_install_requires.append("xformers>=0.0.28.post3")
extras_require_map.pop("vllm")
elif (major, minor) >= (2, 4):
extras_require_map.pop("vllm")
if install_xformers:
if patch == 0:
_install_requires.pop(_install_requires.index(xformers_version))
_install_requires.append("xformers>=0.0.27")
else:
_install_requires.pop(_install_requires.index(xformers_version))
_install_requires.append("xformers==0.0.28.post1")
else:
raise ValueError("axolotl requires torch>=2.4")
except PackageNotFoundError:
pass
return _install_requires, _dependency_links, extras_require_map
def get_package_version():
with open(
Path(os.path.dirname(os.path.abspath(__file__))) / "VERSION",
"r",
encoding="utf-8",
) as fin:
version_ = fin.read().strip()
return version_
extras_require = {
"flash-attn": ["flash-attn==2.8.3"],
"ring-flash-attn": [
"flash-attn==2.8.3",
"ring-flash-attn>=0.1.7",
],
"deepspeed": [
"deepspeed==0.18.2",
"deepspeed-kernels",
],
"mamba-ssm": [
"mamba-ssm==1.2.0.post1",
"causal_conv1d",
],
"auto-gptq": [
"auto-gptq==0.5.1",
],
"mlflow": [
"mlflow",
],
"galore": [
"galore_torch",
],
"apollo": [
"apollo-torch",
],
"optimizers": [
"galore_torch",
"apollo-torch",
"lomo-optim==0.1.1",
"torch-optimi==0.2.1",
"came_pytorch==0.1.3",
],
"ray": [
"ray[train]>=2.52.1",
],
"vllm": [
"vllm==0.10.0",
],
"llmcompressor": [
"llmcompressor==0.5.1",
],
"fbgemm-gpu": ["fbgemm-gpu-genai==1.3.0"],
"opentelemetry": [
"opentelemetry-api",
"opentelemetry-sdk",
"opentelemetry-exporter-prometheus",
"prometheus-client",
],
}
install_requires, dependency_links, extras_require_build = parse_requirements(
extras_require
)
setup(
version=get_package_version(),
package_dir={"": "src"},
packages=find_packages("src"),
install_requires=install_requires,
dependency_links=dependency_links,
entry_points={
"console_scripts": [
"axolotl=axolotl.cli.main:main",
],
},
extras_require=extras_require_build,
)

View File

@@ -339,11 +339,7 @@ def _build_peft_layer_and_get_delta(
)
layer.lora_A[adapter_name].weight.data = lora_a
layer.lora_B[adapter_name].weight.data = lora_b
delta = layer.get_delta_weight(adapter_name)
# peft >=0.19.1 may return delta with transposed dims for 3D params
if delta.shape != base_tensor.shape and delta.ndim == 3:
delta = delta.transpose(1, 2).contiguous()
return delta
return layer.get_delta_weight(adapter_name)
elif (
layer_type and "Conv" in layer_type or (layer_type is None and lora_a.ndim > 2)
):

View File

@@ -242,6 +242,85 @@ class ProducerConfig:
)
class _GroupShardedSampler:
"""Rank-aware shard of a ``RepeatSampler`` that preserves GRPO groups.
``RepeatSampler`` yields ``num_generations`` consecutive copies of
each prompt, forming a GRPO group. For distributed training each
rank must see a disjoint slice of prompts (otherwise every rank
dogpiles on the first 1/world_size of the batch) while keeping each
group intact on a single rank so advantage normalization sees all
peer generations.
``accelerator.prepare(DataLoader)`` does not handle this correctly
for custom samplers with ``split_batches=False`` (the default): it
leaves the sampler alone and every rank replays identical indices.
This wrapper fixes that by consuming the inner sampler's full
output, chunking it into ``num_generations``-sized groups, and
round-robining whole groups across ranks.
Intended to be used ONLY when distributed training is active
(``num_replicas > 1``); for single-rank it is a no-op but still
correct.
"""
def __init__(
self,
inner: Any,
num_generations: int,
rank: int,
num_replicas: int,
):
if num_generations < 1:
raise ValueError(f"num_generations must be >= 1, got {num_generations}")
if num_replicas < 1:
raise ValueError(f"num_replicas must be >= 1, got {num_replicas}")
if not (0 <= rank < num_replicas):
raise ValueError(f"rank must be in [0, {num_replicas}), got {rank}")
self.inner = inner
self.num_generations = num_generations
self.rank = rank
self.num_replicas = num_replicas
def __iter__(self):
all_indices = list(self.inner)
if len(all_indices) % self.num_generations != 0:
raise ValueError(
f"inner sampler yielded {len(all_indices)} indices, "
f"not a multiple of num_generations={self.num_generations}"
)
# Chunk the flat index sequence into groups of num_generations
# consecutive indices. ``RepeatSampler`` guarantees that each
# group contains num_generations copies of the same prompt id.
groups = [
all_indices[i : i + self.num_generations]
for i in range(0, len(all_indices), self.num_generations)
]
# Round-robin whole groups across ranks. Round-robin (vs.
# contiguous chunking) preserves approximate shuffled order on
# each rank even when the group count is small relative to the
# world size.
for group in groups[self.rank :: self.num_replicas]:
yield from group
def __len__(self):
try:
inner_len = len(self.inner)
except TypeError:
# Non-sized inner sampler — we can't know the per-rank
# length without materializing. Return 0 as a hint that the
# DataLoader should fall back to iteration.
return 0
total_groups = inner_len // self.num_generations
# Ceiling division for the trailing groups that don't divide
# evenly — extra groups go to the first ``total_groups %
# num_replicas`` ranks, matching the round-robin above.
my_groups = (
total_groups + self.num_replicas - self.rank - 1
) // self.num_replicas
return my_groups * self.num_generations
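# Worked example (hypothetical indices): num_generations=2, num_replicas=2, and the
# inner sampler yields [7, 7, 3, 3, 9, 9, 1, 1].
#   groups -> [7, 7], [3, 3], [9, 9], [1, 1]
#   rank 0 -> groups[0::2] -> 7, 7, 9, 9
#   rank 1 -> groups[1::2] -> 3, 3, 1, 1
# Every prompt keeps its full GRPO group on one rank, and __len__ reports 4 on both ranks.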
class DataProducer(ABC):
"""Abstract base class for online data producers.
@@ -556,6 +635,34 @@ class GRPODataProducer(BaseDataProducer):
seed=self._seed,
)
# Shard the sampler across distributed ranks so each rank sees
# a disjoint slice of prompts. ``RepeatSampler`` groups each
# prompt with ``num_generations`` consecutive copies — our
# wrapper round-robins WHOLE groups across ranks so all
# generations of a given prompt stay on the same rank (needed
# for GRPO advantage normalization within a group).
#
# Without this, ``accelerator.prepare(dl)`` with the default
# ``split_batches=False`` leaves the custom sampler alone, so
# every rank iterates the identical index sequence and the
# cluster dogpiles on the first 1/world_size of the prompts.
num_replicas = max(1, trainer.accelerator.num_processes)
if num_replicas > 1:
sampler = _GroupShardedSampler(
inner=sampler,
num_generations=self._num_generations,
rank=trainer.accelerator.process_index,
num_replicas=num_replicas,
)
logger.info(
"[RANK:%d] _GroupShardedSampler active "
"(num_replicas=%d, num_generations=%d, gen_batch=%d)",
trainer.accelerator.process_index,
num_replicas,
self._num_generations,
self._generation_batch_size,
)
# Use identity collator (same as stock GRPOTrainer)
def _identity(x):
return x
@@ -574,12 +681,11 @@ class GRPODataProducer(BaseDataProducer):
rank=trainer.args.process_index,
),
)
self._prompt_dl = trainer.accelerator.prepare(dl)
# Don't let accelerator track this dataloader
acc_dls = trainer.accelerator._dataloaders
if self._prompt_dl in acc_dls:
acc_dls.remove(self._prompt_dl)
# Skip accelerator.prepare — we're handling per-rank sharding
# ourselves via ``_GroupShardedSampler``. ``prepare()`` would
# otherwise try to wrap the DataLoader with its own sharding
# logic which does not understand our group structure.
self._prompt_dl = dl
self._prompt_iter = iter(self._prompt_dl)
@@ -1103,11 +1209,22 @@ class AsyncGRPOTrainer(GRPOTrainer):
- vllm_lora_sync: saves adapter to filesystem, vLLM loads natively
- PEFT no-merge: computes merged weights as new tensors, NCCL broadcast
- Non-PEFT: stock sync_weights via merge_adapter + NCCL
This is the canonical sync trigger and runs in BOTH async and
synchronous modes from ``_prepare_inputs_with_data_producer`` /
``_prepare_inputs_legacy_async``. The ``_generate_single_turn``
patch is a parallel backup for non-data-producer paths (vanilla
GRPO without NeMo Gym), where the data producer is bypassed
entirely and TRL's stock generate-then-sync flow is used instead.
"""
if not (self.use_vllm and self.args.async_prefetch):
if not self.use_vllm:
return
step = self.state.global_step
interval = self.args.vllm_sync_interval
# Default to syncing every step when no interval is configured —
# otherwise ``step % None`` would TypeError, and the previous
# behavior of crashing on the first sync was strictly worse than
# the standard "sync every optimizer step".
interval = self.args.vllm_sync_interval or 1
if step != self._last_synced_step and step % interval == 0:
if step == 0:
logger.info("Skipping vLLM weight sync at step 0 (no training yet)")
@@ -1202,13 +1319,42 @@ class AsyncGRPOTrainer(GRPOTrainer):
# Permanently replace vllm_generation.sync_weights with our custom
# sync to avoid merge_adapter (fails on FP8 / races with training).
# For LoRA sync mode, make it a no-op here since _maybe_sync_vllm_weights
# handles the sync with proper interval tracking.
#
# The design has two modes that have to be threaded carefully:
#
# - Async prefetch ON: BG generation thread can't safely call
# sync_weights mid-rollout (it races with the trainer's optimizer
# step and can corrupt weights). We no-op the stock sync hook and
# drive sync ourselves from ``_maybe_sync_vllm_weights`` after the
# optimizer step on the main thread.
#
# - Async prefetch OFF (synchronous mode): TRL's stock
# ``_generate_single_turn`` calls ``sync_weights`` once per step
# boundary. There's no BG thread to race with, and
# ``_maybe_sync_vllm_weights`` short-circuits with
# ``if not async_prefetch: return``, so we MUST wire the stock
# hook directly to our LoRA sync helper — otherwise nothing ever
# pushes weights to vLLM and the trainer becomes a no-op (vLLM
# keeps serving the base model, every rollout in every group
# produces identical outputs, advantages are zero, optimizer
# step gets skipped, repeat).
if not getattr(self, "_patched_sync_weights", False):
if self.use_vllm and hasattr(self, "vllm_generation"):
if getattr(self.args, "vllm_lora_sync", False):
# No-op: LoRA sync is driven by _maybe_sync_vllm_weights
self.vllm_generation.sync_weights = lambda: None
if getattr(self.args, "async_prefetch", False):
# Async: drive sync from main thread via
# _maybe_sync_vllm_weights instead.
self.vllm_generation.sync_weights = lambda: None
else:
# Sync mode: TRL's _generate_single_turn already
# calls sync_weights once per step boundary. Wire
# it directly to our LoRA filesystem sync helper.
sync_helper = self._sync_lora_adapter
def _lora_filesystem_sync():
sync_helper()
self.vllm_generation.sync_weights = _lora_filesystem_sync
self._patched_sync_weights = True
else:
from accelerate.utils import is_peft_model

View File

@@ -2,17 +2,35 @@
# Copyright (c) Axolotl AI
# Licensed under the Apache License, Version 2.0
from . import layers
from .lora_ops import ParallelExperts
from .parallel_experts import flatten_sort_count, parallel_linear
from .parallel_linear_lora import ScatterMoELoRA, parallel_linear_lora
from .lora_layout import (
peft_down_proj_lora_to_scattermoe,
peft_lora_B_to_scattermoe,
peft_lora_to_scattermoe,
validate_scattermoe_lora_shapes,
)
__all__ = [
"layers",
"ParallelExperts",
"flatten_sort_count",
"parallel_linear",
"ScatterMoELoRA",
"parallel_linear_lora",
"lora_ops",
"peft_down_proj_lora_to_scattermoe",
"peft_lora_B_to_scattermoe",
"peft_lora_to_scattermoe",
"validate_scattermoe_lora_shapes",
]
try:
from . import layers
from .lora_ops import ParallelExperts
from .parallel_experts import flatten_sort_count, parallel_linear
from .parallel_linear_lora import ScatterMoELoRA, parallel_linear_lora
except ModuleNotFoundError as exc:
if exc.name != "triton":
raise
else:
__all__ += [
"layers",
"ParallelExperts",
"flatten_sort_count",
"parallel_linear",
"ScatterMoELoRA",
"parallel_linear_lora",
"lora_ops",
]

View File

@@ -35,46 +35,19 @@ import torch
from torch import nn
from torch.nn import functional as F
from .lora_layout import (
peft_down_proj_lora_to_scattermoe,
peft_lora_B_to_scattermoe,
peft_lora_to_scattermoe,
)
from .parallel_experts import flatten_sort_count, parallel_linear
from .parallel_linear_lora import get_lora_params_from_wrapper, parallel_linear_lora
# =============================================================================
# LoRA layout conversion utilities (peft <-> scattermoe)
# =============================================================================
def peft_lora_B_to_scattermoe(peft_B, num_experts, rank):
"""Convert peft rank-major lora_B ``[out, E*r]`` to scattermoe
expert-major ``[N, r*E]``.
peft reshapes B to ``[out, r, E]`` (rank-major).
scattermoe slices B as ``[:, e*r:(e+1)*r]`` (expert-major).
"""
N = peft_B.shape[0]
return (
peft_B.reshape(N, rank, num_experts)
.permute(0, 2, 1)
.contiguous()
.reshape(N, num_experts * rank)
)
def peft_lora_to_scattermoe(peft_A, peft_B, num_experts, rank):
"""Convert peft LoRA weights to scattermoe layout.
peft >=0.19.1 assigns in/out features for 3D params such that
A and B already align with scattermoe's convention (no A<->B swap).
Only B needs rank-major → expert-major layout conversion.
"""
smoe_A = peft_A
smoe_B = peft_lora_B_to_scattermoe(peft_B, num_experts, rank)
return smoe_A, smoe_B
def peft_down_proj_lora_to_scattermoe(peft_A, peft_B, num_experts, rank):
"""Deprecated alias for :func:`peft_lora_to_scattermoe`."""
return peft_lora_to_scattermoe(peft_A, peft_B, num_experts, rank)
__all__ = [
"peft_down_proj_lora_to_scattermoe",
"peft_lora_B_to_scattermoe",
"peft_lora_to_scattermoe",
]
# =============================================================================
# ParamWrapper unwrapping
@@ -164,7 +137,7 @@ def _unwrap_experts_lora(experts_module):
if gup is not None:
num_experts = gup.shape[0]
# Extract gate_up_proj LoRA (needs A<->B swap due to transposition)
# Extract gate_up_proj LoRA
gup_lora = None
gup_wrapper = wrappers.get("gate_up_proj")
if gup_wrapper is not None:
@@ -173,7 +146,7 @@ def _unwrap_experts_lora(experts_module):
rank = lora_A.shape[0] // num_experts
gup_lora = _convert_smoe_lora(lora_A, lora_B, num_experts, rank, scaling)
# Extract down_proj LoRA (needs A<->B swap due to transposition)
# Extract down_proj LoRA
down_lora = None
down_wrapper = wrappers.get("down_proj")
if down_wrapper is not None:

View File

@@ -0,0 +1,78 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Axolotl AI
# Licensed under the Apache License, Version 2.0
"""Pure tensor layout helpers for ScatterMoE LoRA weights."""
def peft_lora_B_to_scattermoe(peft_B, num_experts, rank):
"""Convert peft rank-major lora_B ``[out, E*r]`` to scattermoe
expert-major ``[N, r*E]``.
peft reshapes B to ``[out, r, E]`` (rank-major).
scattermoe slices B as ``[:, e*r:(e+1)*r]`` (expert-major).
"""
N = peft_B.shape[0]
return (
peft_B.reshape(N, rank, num_experts)
.permute(0, 2, 1)
.contiguous()
.reshape(N, num_experts * rank)
)
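# Tiny layout check (hypothetical N=1, E=2, r=2; assumes `import torch` in the session).
# peft's flat columns are rank-major (r0,e0) (r0,e1) (r1,e0) (r1,e1); scattermoe wants
# expert-major (e0,r0) (e0,r1) (e1,r0) (e1,r1):
#   >>> peft_lora_B_to_scattermoe(torch.tensor([[0., 1., 2., 3.]]), num_experts=2, rank=2)
#   tensor([[0., 2., 1., 3.]])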
def peft_lora_to_scattermoe(peft_A, peft_B, num_experts, rank):
"""Convert peft LoRA weights to scattermoe layout.
peft operates on the parameter in its native storage layout ``[E, dim1, dim2]``
where ``out_features=dim1, in_features=dim2``. ScatterMoE transposes the
parameter (``W = param.transpose(2, 1)``), giving ``[E, dim2, dim1]`` with
``K=dim2, N=dim1``.
peft gives:
lora_A ``[r*E, dim2]``, lora_B ``[dim1, r*E]``
scattermoe needs:
lora_A ``[r*E, K=dim2]``, lora_B ``[N=dim1, r*E]``
peft's A already matches ScatterMoE's A shape. Only B needs conversion from
peft's rank-major layout to ScatterMoE's expert-major layout.
"""
smoe_A = peft_A
smoe_B = peft_lora_B_to_scattermoe(peft_B, num_experts, rank)
return smoe_A, smoe_B
def peft_down_proj_lora_to_scattermoe(peft_A, peft_B, num_experts, rank):
"""Deprecated alias for :func:`peft_lora_to_scattermoe`."""
return peft_lora_to_scattermoe(peft_A, peft_B, num_experts, rank)
def validate_scattermoe_lora_shapes(expert_weights, lora_A, lora_B):
"""Validate LoRA tensor layout before dispatching ScatterMoE kernels."""
E, K, N = expert_weights.shape
if lora_A.dim() != 2 or lora_B.dim() != 2:
raise ValueError(
"ScatterMoE LoRA expects 2D lora_A and lora_B tensors, got "
f"lora_A={tuple(lora_A.shape)} and lora_B={tuple(lora_B.shape)}."
)
if lora_A.size(0) % E != 0:
raise ValueError(
"ScatterMoE LoRA expects lora_A rows to be divisible by the number "
f"of experts ({E}), got lora_A={tuple(lora_A.shape)}."
)
rank = lora_A.size(0) // E
expected_A = (E * rank, K)
expected_B = (N, E * rank)
if tuple(lora_A.shape) != expected_A or tuple(lora_B.shape) != expected_B:
raise ValueError(
"Invalid ScatterMoE LoRA layout for expert_weights "
f"{tuple(expert_weights.shape)}. Expected lora_A={expected_A} and "
f"lora_B={expected_B}, got lora_A={tuple(lora_A.shape)} and "
f"lora_B={tuple(lora_B.shape)}. For PEFT target_parameters, keep "
"lora_A as [E*r, K] and only convert lora_B from rank-major to "
"expert-major layout."
)
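# Shape sanity example (hypothetical dims): expert_weights [E=8, K=2048, N=1024] with r=16
# expects lora_A of shape (8*16, 2048) = (128, 2048) and lora_B of shape (1024, 128);
# passing a rank-major lora_B of shape (128, 1024) raises the ValueError above with both
# expected shapes spelled out.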

View File

@@ -34,6 +34,7 @@ from .kernels.lora_ops import (
scatter2scatter_lora,
scatter2scatter_lora_dX,
)
from .lora_layout import validate_scattermoe_lora_shapes
class ScatterMoELoRA(torch.autograd.Function):
@@ -422,11 +423,6 @@ def get_lora_params_from_wrapper(module) -> tuple:
return lora_A, lora_B, scaling
# =============================================================================
# Drop-in replacement for parallel_linear
# =============================================================================
def parallel_linear_lora(
inputs: torch.Tensor,
expert_weights: torch.Tensor,
@@ -451,6 +447,7 @@ def parallel_linear_lora(
Otherwise falls back to standard scatter2scatter.
"""
if lora_A is not None and lora_B is not None:
validate_scattermoe_lora_shapes(expert_weights, lora_A, lora_B)
return ScatterMoELoRA.apply(
inputs,
expert_weights,

View File

@@ -110,11 +110,36 @@ class NemoGymDataProducer(GRPODataProducer):
item["agent_ref"] = full_item["agent_ref"]
dataset_items.append(item)
# Expand by num_generations (agent produces one rollout per call)
expanded_items = []
for item in dataset_items:
for _ in range(self._num_generations):
expanded_items.append(item)
# NOTE: do NOT re-expand by num_generations here.
# ``RepeatSampler(mini_repeat_count=num_generations)`` already
# yields ``num_generations`` consecutive copies of each unique
# prompt, so ``inputs`` is a list of ``(unique_prompts_per_rank *
# num_generations)`` items — one entry per rollout. Expanding
# again here would fire ``num_generations^2`` rollouts per
# prompt per rank and make every step dogpile on a handful of
# tasks.
expanded_items = dataset_items
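# e.g. (hypothetical counts): with num_generations=4 and 2 unique prompts on this rank,
# RepeatSampler already delivered 8 items (p0 x4, p1 x4), so we fire 8 agent /run calls;
# the old re-expansion would have fired 32 (num_generations^2 per prompt).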
# Diagnostic: log what this rank is about to fire.
try:
import collections
iid_counts: collections.Counter[str | None] = collections.Counter()
for it in dataset_items:
iid_counts[
(it.get("responses_create_params", {}).get("metadata") or {}).get(
"instance_id"
)
] += 1
LOG.info(
"[RANK:%d] produce(): firing %d agent /run calls covering %d unique prompts: %s",
trainer.accelerator.process_index,
len(dataset_items),
len(iid_counts),
list(iid_counts.most_common(5)),
)
except Exception:
pass
# Call NeMo Gym agents
loop = asyncio.new_event_loop()
@@ -140,6 +165,7 @@ class NemoGymDataProducer(GRPODataProducer):
logprobs_list = []
rewards_list = []
num_turns_list: list[int] = []
for resp in responses:
parsed = _parse_agent_response(resp, eos_token_id)
prompt_ids_list.append(parsed["prompt_ids"])
@@ -147,6 +173,7 @@ class NemoGymDataProducer(GRPODataProducer):
env_mask_list.append(parsed["env_mask"])
logprobs_list.append(parsed["logprobs"])
rewards_list.append(parsed["reward"])
num_turns_list.append(parsed.get("num_turns", 0))
# Pad to tensors
prompt_ids = [torch.tensor(ids, device=device) for ids in prompt_ids_list]
@@ -179,22 +206,48 @@ class NemoGymDataProducer(GRPODataProducer):
tool_mask = [torch.tensor(m, device=device) for m in env_mask_list]
tool_mask = pad(tool_mask, padding_value=1, padding_side="right")
# Inject rewards into inputs so _compute_deferred_scores can use them
# The deferred scoring path calls _calculate_rewards which reads reward_funcs.
# Our passthrough reward_fn reads "env_reward" from kwargs.
# Inject per-rollout reward + num_turns into each input. Since
# ``RepeatSampler`` already yields ``num_generations`` copies of
# each prompt, ``inputs`` has ONE entry per rollout (matching
# ``rewards_list`` 1:1). No per-prompt grouping happens here —
# GRPO advantage normalization is the trainer's job downstream.
assert len(inputs) == len(rewards_list), (
f"rewards/inputs length mismatch: "
f"{len(rewards_list)} rewards vs {len(inputs)} inputs"
)
for i, inp in enumerate(inputs):
# Each input gets rewards for its num_generations rollouts
start = i * self._num_generations
end = start + self._num_generations
inp["env_reward"] = rewards_list[start:end]
inp["env_reward"] = rewards_list[i]
inp["num_turns"] = num_turns_list[i]
# Expand inputs to match expanded rollouts (num_generations copies)
expanded_inputs = []
for inp in inputs:
for g in range(self._num_generations):
expanded_inp = dict(inp)
expanded_inp["env_reward"] = inp["env_reward"][g]
expanded_inputs.append(expanded_inp)
# One expanded_input per rollout (already correct count because
# inputs has num_generations copies baked in by the sampler).
expanded_inputs = [dict(inp) for inp in inputs]
# Log rollout-level stats to wandb from rank 0. These are the
# true agent-side metrics (not the tokenized TRL view) — so
# num_turns reflects how many /run iterations each rollout
# actually took before finishing or hitting max_turns.
if is_main and num_turns_list:
try:
import wandb
if wandb.run is not None:
import statistics as _stats
nonzero = sum(1 for r in rewards_list if r > 0)
log_payload = {
"rollout/num_turns/mean": float(_stats.mean(num_turns_list)),
"rollout/num_turns/min": float(min(num_turns_list)),
"rollout/num_turns/max": float(max(num_turns_list)),
"rollout/reward/mean": float(_stats.mean(rewards_list)),
"rollout/reward/nonzero_frac": (
nonzero / len(rewards_list) if rewards_list else 0.0
),
"rollout/n_samples": float(len(rewards_list)),
}
wandb.log(log_payload, commit=False)
except Exception as exc: # never let metric logging break training
LOG.warning("rollout wandb log failed: %s", exc)
# Decode completions for reward functions
completions = trainer.processing_class.batch_decode(

View File

@@ -19,6 +19,7 @@ Supports two modes:
from __future__ import annotations
import os
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Union
from axolotl.integrations.base import BasePlugin
@@ -30,6 +31,107 @@ if TYPE_CHECKING:
LOG = get_logger(__name__)
# ---- vLLM weight-sync transport probe ------------------------------------
@dataclass
class VLLMWeightSyncCapabilities:
"""What weight-sync routes a vLLM server actually exposes.
Discovered once at ``pre_model_load`` time by fetching the server's
``/openapi.json``. Drives the transport-selection table below.
"""
nccl: bool = False # /init_communicator/ + /update_named_param/
lora_filesystem: bool = False # /v1/load_lora_adapter (vLLM native)
lora_axolotl: bool = False # /set_lora_adapter/ (axolotl serve_lora extension)
http_full: bool = False # /http_update_weights/ (axolotl serve_lora extension)
probed: bool = False
probe_error: str | None = None
routes: list[str] = field(default_factory=list)
@property
def any_full_param_sync(self) -> bool:
"""True if at least one transport can push full-model weights."""
return self.nccl or self.http_full
@property
def any_lora_sync(self) -> bool:
"""True if at least one transport can push LoRA adapters."""
return self.lora_filesystem or self.lora_axolotl or self.nccl
def probe_vllm_weight_sync(
base_url: str, timeout: float = 5.0
) -> VLLMWeightSyncCapabilities:
"""Detect which weight-sync routes the configured vLLM server exposes.
Uses the server's FastAPI ``/openapi.json`` — every weight-sync transport
we care about is mounted as a POST route there. Falls back to all-False
on any error so the caller can still decide what to do (typically: raise
a clear error rather than silently no-op).
"""
import requests
caps = VLLMWeightSyncCapabilities()
try:
r = requests.get(f"{base_url.rstrip('/')}/openapi.json", timeout=timeout)
r.raise_for_status()
spec = r.json()
routes = sorted((spec.get("paths") or {}).keys())
caps.routes = routes
caps.nccl = "/init_communicator/" in routes and "/update_named_param/" in routes
caps.lora_filesystem = "/v1/load_lora_adapter" in routes
caps.lora_axolotl = "/set_lora_adapter/" in routes
caps.http_full = "/http_update_weights/" in routes
caps.probed = True
except Exception as exc:
caps.probe_error = f"{type(exc).__name__}: {exc}"
LOG.warning(
"NeMo Gym: failed to probe vLLM /openapi.json at %s%s. "
"Will fall back to LoRA-only behavior.",
base_url,
caps.probe_error,
)
return caps
def select_weight_sync_transport(
caps: VLLMWeightSyncCapabilities,
*,
has_lora: bool,
vllm_lora_sync_pref: bool,
) -> str:
"""Pick the right transport for a (server caps, model type) combo.
Returns one of: ``"lora_filesystem"``, ``"nccl"``, ``"http_full"``, or
``"none"``. The caller decides what to do with ``"none"`` (typically:
raise an error explaining the misconfiguration).
Selection table:
LoRA model + lora endpoint + lora-sync pref → lora_filesystem
LoRA model + lora endpoint → lora_filesystem
LoRA model + nccl endpoint → nccl (broadcast merged adapter)
Full model + nccl endpoint → nccl
Full model + http endpoint → http_full
anything else → none
"""
if has_lora:
if (caps.lora_filesystem or caps.lora_axolotl) and vllm_lora_sync_pref:
return "lora_filesystem"
if caps.lora_filesystem or caps.lora_axolotl:
return "lora_filesystem"
if caps.nccl:
return "nccl"
return "none"
# Full-parameter model
if caps.nccl:
return "nccl"
if caps.http_full:
return "http_full"
return "none"
class NemoGymPlugin(BasePlugin):
"""Plugin for NVIDIA NeMo Gym integration with Axolotl.
@@ -50,37 +152,69 @@ class NemoGymPlugin(BasePlugin):
self._reward_fn = None
self._dataset_lookup = None
self._agent_servers = {}
self._vllm_caps: VLLMWeightSyncCapabilities | None = None
def get_input_args(self):
return "axolotl.integrations.nemo_gym.NemoGymArgs"
def pre_model_load(self, cfg):
"""Apply monkeypatches before trainer creation."""
"""Probe vLLM weight-sync routes and conditionally bypass NCCL init.
Replaces the previous unconditional ``init_communicator`` monkey-patch
with a probe of the configured vLLM server's ``/openapi.json``. We only
bypass NCCL init when the server we're talking to actually lacks the
``/init_communicator/`` route (i.e. stock ``vllm serve``); against
TRL/axolotl serve modules that DO expose NCCL routes, we leave the
standard TRL flow alone so full-finetune training can sync weights.
"""
if not cfg.nemo_gym_enabled:
return
# Always skip NCCL communicator init in NeMo Gym mode.
# NeMo Gym uses its own vLLM server (standard OpenAI API), not the TRL
# colocate/NCCL path. The NCCL init fails with vLLM V1 and standard servers.
trl_cfg = getattr(cfg, "trl", None)
if trl_cfg and getattr(trl_cfg, "vllm_mode", "server") == "server":
if not (trl_cfg and getattr(trl_cfg, "vllm_mode", "server") == "server"):
return
host = getattr(trl_cfg, "vllm_server_host", None) or "127.0.0.1"
port = getattr(trl_cfg, "vllm_server_port", None) or 8000
base_url = f"http://{host}:{port}"
self._vllm_caps = probe_vllm_weight_sync(base_url)
if self._vllm_caps.probed:
LOG.info(
"NeMo Gym: vLLM weight-sync probe @ %s — nccl=%s lora_native=%s "
"lora_axolotl=%s http_full=%s",
base_url,
self._vllm_caps.nccl,
self._vllm_caps.lora_filesystem,
self._vllm_caps.lora_axolotl,
self._vllm_caps.http_full,
)
# Only bypass NCCL init when the server doesn't speak it. If NCCL is
# available we leave VLLMClient.init_communicator alone so the
# standard TRL sync flow can run for full-parameter training.
if not self._vllm_caps.nccl:
self._patch_skip_nccl_init()
def _patch_skip_nccl_init(self):
"""Monkeypatch VLLMClient.init_communicator to no-op.
NeMo Gym uses its own vLLM server (standard OpenAI API or custom LoRA
serve script). The NCCL communicator is not needed and fails with both
vLLM V1 engine and standard OpenAI server mode.
Only called when the configured vLLM server doesn't expose
``/init_communicator/`` (e.g. stock ``vllm serve``). In that case
TRL's standard ``init_communicator`` would 404 inside trainer
construction; we no-op it so the LoRA filesystem path can install
its own sync in ``post_trainer_create``.
"""
try:
from trl.generation.vllm_client import VLLMClient
VLLMClient._original_init_communicator = VLLMClient.init_communicator
VLLMClient.init_communicator = lambda self, **kwargs: LOG.info(
"Skipping NCCL init_communicator (LoRA sync mode)"
"Skipping NCCL init_communicator (server has no /init_communicator/)"
)
LOG.info(
"Patched VLLMClient.init_communicator to no-op (server has no NCCL routes)"
)
LOG.info("Patched VLLMClient.init_communicator to no-op for LoRA sync")
except Exception as exc:
LOG.warning(f"Failed to patch VLLMClient: {exc}")
@@ -234,30 +368,80 @@ class NemoGymPlugin(BasePlugin):
verify_timeout = cfg.nemo_gym_verify_timeout or 30
multi_turn = cfg.nemo_gym_multi_turn or False
# Handle weight sync. NeMo Gym skips NCCL init, so we need to either:
# - Install LoRA sync (when vllm_lora_sync=True)
# - Or no-op sync_weights (when using standard vLLM server)
# Pick a weight-sync transport based on what the configured vLLM
# server actually exposes (see ``pre_model_load`` probe) and what
# kind of model we're training. The selection table is documented
# in ``select_weight_sync_transport``.
trl_cfg = getattr(cfg, "trl", None)
if hasattr(trainer, "vllm_generation") and trainer.vllm_generation:
vllm_gen = trainer.vllm_generation
if trl_cfg and getattr(trl_cfg, "vllm_lora_sync", False):
adapter = getattr(cfg, "adapter", None)
has_lora = adapter in ("lora", "qlora")
vllm_lora_sync_pref = bool(
trl_cfg and getattr(trl_cfg, "vllm_lora_sync", False)
)
caps = self._vllm_caps or VLLMWeightSyncCapabilities()
transport = select_weight_sync_transport(
caps,
has_lora=has_lora,
vllm_lora_sync_pref=vllm_lora_sync_pref,
)
if transport == "lora_filesystem":
self._setup_lora_sync(trainer)
# Verify the vLLM server supports runtime LoRA loading
self._check_lora_endpoint(vllm_gen)
else:
# No NCCL, no LoRA sync — skip all weight sync paths
vllm_gen.sync_weights = lambda: LOG.debug(
"Weight sync skipped (NeMo Gym mode)"
LOG.info("NeMo Gym weight sync: LoRA filesystem")
elif transport == "nccl":
# Standard TRL NCCL path. We leave ``VLLMClient.init_communicator``
# alone (pre_model_load only patched it when the probe found no
# NCCL route) so the trainer's normal weight-sync flow runs.
LOG.info(
"NeMo Gym weight sync: NCCL (server exposes /init_communicator/)"
)
type(vllm_gen).sync_weights = lambda self: LOG.debug(
"Weight sync skipped (NeMo Gym mode)"
elif transport == "http_full":
# Full-parameter HTTP sync — implementation lands in step 3.
# For now, fail loudly so users know the path is detected but
# not yet wired up, instead of silently no-oping like before.
raise NotImplementedError(
"NeMo Gym + full fine-tune + HTTP weight sync is detected "
"but the client-side sync helper is not yet implemented "
"(planned). Use `adapter: lora|qlora` for now, or use a "
"vLLM serve module that exposes /init_communicator/ for "
"NCCL sync."
)
# Also patch the async trainer's internal sync method
if hasattr(trainer, "_maybe_sync_vllm_weights"):
trainer._maybe_sync_vllm_weights = lambda: LOG.debug(
"Async weight sync skipped (NeMo Gym mode)"
else: # transport == "none"
# No viable sync path. Build a precise error so the user knows
# exactly what's missing and how to fix it.
if not caps.probed:
msg = (
"could not probe the vLLM server's "
f"/openapi.json: {caps.probe_error}. "
"Verify that vLLM is reachable at "
f"{getattr(trl_cfg, 'vllm_server_host', '?')}:"
f"{getattr(trl_cfg, 'vllm_server_port', '?')}."
)
LOG.info("Disabled weight sync (NeMo Gym mode, no LoRA sync)")
elif has_lora:
msg = (
"the vLLM server has neither NCCL routes "
"(/init_communicator/) nor a LoRA-loading route "
"(/v1/load_lora_adapter or /set_lora_adapter/). "
"Restart vLLM with `--enable-lora --max-lora-rank N "
"VLLM_ALLOW_RUNTIME_LORA_UPDATING=1` for the stock "
"server, or use `axolotl vllm-serve` for the "
"NCCL-capable serve module."
)
else:
msg = (
"the vLLM server exposes no full-parameter sync route "
"(/init_communicator/ for NCCL or /http_update_weights/ "
"for HTTP). Use `axolotl vllm-serve` (which has both) "
"or set `adapter: lora|qlora`."
)
raise ValueError(
f"NeMo Gym: no usable weight-sync transport — {msg} Without "
"weight sync the trainer's gradient updates never reach the "
"rollout policy (functionally a no-op trainer)."
)
if multi_turn:
self._wire_multi_turn(cfg, trainer, model_name, verify_timeout)

View File

@@ -130,21 +130,41 @@ def start_servers(
)
def get_server_configs(head_port: int = 11000) -> dict:
def get_server_configs(head_port: int = 11000, timeout: float = 30.0) -> dict:
"""Fetch the global config from the NeMo Gym head server.
Retries up to 3 times, backing off 2s and then 4s between attempts. The
default per-attempt timeout is 30s (raised from the original 5s) because head servers can
be slow to respond when they're concurrently serving rollouts from a
prior training run. A 5s timeout was empirically too tight to survive
a kill-and-relaunch cycle.
Returns:
Dict mapping server_name -> server config.
"""
response = requests.get(
f"http://127.0.0.1:{head_port}/global_config_dict_yaml", timeout=5
url = f"http://127.0.0.1:{head_port}/global_config_dict_yaml"
last_exc: Exception | None = None
for attempt in (1, 2, 3):
try:
response = requests.get(url, timeout=timeout)
response.raise_for_status()
result = yaml.safe_load(response.text)
# NeMo Gym head server double-encodes: YAML string inside a YAML string
if isinstance(result, str):
result = yaml.safe_load(result)
return result
except (requests.exceptions.RequestException, OSError) as exc:
last_exc = exc
LOG.warning(
"NeMo Gym head probe attempt %d/3 failed: %s. Retrying...",
attempt,
type(exc).__name__,
)
if attempt < 3:
time.sleep(2.0 * attempt)
raise RuntimeError(
f"NeMo Gym head server at {url} did not respond after 3 attempts: {last_exc}"
)
response.raise_for_status()
result = yaml.safe_load(response.text)
# NeMo Gym head server double-encodes: YAML string inside a YAML string
if isinstance(result, str):
result = yaml.safe_load(result)
return result
def get_agent_servers(

View File

@@ -53,6 +53,7 @@ def _rms_norm_rope_forward_kernel(
RSTD_ptr,
RSTD_row_stride,
n_cols,
n_rot,
n_heads,
eps,
HAS_WEIGHT: tl.constexpr,
@@ -60,28 +61,35 @@ def _rms_norm_rope_forward_kernel(
):
"""
Fused forward:
x_norm = x / rms(x) [* weight] (RMSNorm)
y = x_norm * cos + rotate_half(x_norm) * sin (RoPE)
x_norm = x / rms(x) [* weight] (RMSNorm, full n_cols)
y[..., :n_rot] = rope(x_norm[..., :n_rot])
y[..., n_rot:] = x_norm[..., n_rot:] (pass-through for partial rotary)
rotate_half swaps first/second halves and negates the first:
rotate_half([a, b]) = [-b, a]
rotate_half swaps first/second halves and negates the first, restricted
to the rotary span [0, n_rot):
rotate_half([a, b]) = [-b, a] where len(a) = len(b) = n_rot/2
For the partial-rotary pass-through region we load cos with default 1.0
and sin with default 0.0 outside [0, n_rot), so the same formula
`Y = X_norm * cos + X_rot_norm * sin` collapses to `Y = X_norm`.
cos/sin are indexed by row_idx // n_heads to handle per-head broadcast
(cos/sin have shape (B*S, D) while X has shape (B*S*H, D)).
(cos/sin have shape (B*S, n_rot) while X has shape (B*S*H, n_cols)).
"""
row_idx = tl.program_id(0).to(tl.int64)
# cos/sin row: divide by n_heads since cos/sin are (B*S, D)
# cos/sin row: divide by n_heads since cos/sin are (B*S, n_rot)
cs_row_idx = row_idx // n_heads
col_offsets = tl.arange(0, BLOCK_SIZE)
mask = col_offsets < n_cols
half_dim = n_cols // 2
rot_mask_col = col_offsets < n_rot
half_rot = n_rot // 2
# Load input row
X_row = tl.load(X_ptr + row_idx * X_row_stride + col_offsets, mask=mask, other=0)
X_dtype = X_row.dtype
X_fp32 = X_row.to(tl.float32)
# RMSNorm: compute 1/rms
# RMSNorm: compute 1/rms over the full row (rotary + pass-through)
mean_sq = tl.sum(X_fp32 * X_fp32, axis=0) / n_cols
rstd = rsqrt(mean_sq + eps)
tl.store(RSTD_ptr + row_idx * RSTD_row_stride, rstd)
@@ -94,33 +102,38 @@ def _rms_norm_rope_forward_kernel(
W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0).to(tl.float32)
X_norm = X_norm * W_row
# RoPE: load cos/sin (broadcast across heads)
# RoPE: load cos/sin (broadcast across heads). For col >= n_rot we get
# cos=1, sin=0 so the formula leaves X_norm untouched.
cos_row = tl.load(
COS_ptr + cs_row_idx * COS_row_stride + col_offsets, mask=mask, other=0
COS_ptr + cs_row_idx * COS_row_stride + col_offsets,
mask=rot_mask_col,
other=1.0,
).to(tl.float32)
sin_row = tl.load(
SIN_ptr + cs_row_idx * SIN_row_stride + col_offsets, mask=mask, other=0
SIN_ptr + cs_row_idx * SIN_row_stride + col_offsets,
mask=rot_mask_col,
other=0.0,
).to(tl.float32)
# rotate_half: for col < half_dim, take -X_norm[col + half_dim]
# for col >= half_dim, take X_norm[col - half_dim]
# rotate_half within [0, n_rot):
# for col < half_rot: take -X_norm[col + half_rot]
# for col in [half_rot, n_rot): take X_norm[col - half_rot]
# For col >= n_rot the rotation is irrelevant (sin = 0 zeros it out).
rot_offsets = tl.where(
col_offsets < half_dim, col_offsets + half_dim, col_offsets - half_dim
col_offsets < half_rot, col_offsets + half_rot, col_offsets - half_rot
)
rot_mask = rot_offsets < n_cols
rot_load_mask = (rot_offsets < n_cols) & rot_mask_col
X_rot = tl.load(
X_ptr + row_idx * X_row_stride + rot_offsets, mask=rot_mask & mask, other=0
X_ptr + row_idx * X_row_stride + rot_offsets, mask=rot_load_mask, other=0
).to(tl.float32)
# Re-normalize the rotated values
X_rot_norm = X_rot * rstd
if HAS_WEIGHT:
W_rot = tl.load(W_ptr + rot_offsets, mask=rot_mask & mask, other=0).to(
tl.float32
)
W_rot = tl.load(W_ptr + rot_offsets, mask=rot_load_mask, other=0).to(tl.float32)
X_rot_norm = X_rot_norm * W_rot
# Negate the first half (rotate_half negates x2, which becomes the first half)
sign = tl.where(col_offsets < half_dim, -1.0, 1.0)
sign = tl.where(col_offsets < half_rot, -1.0, 1.0)
X_rot_norm = X_rot_norm * sign
# Final RoPE: y = x_norm * cos + rotate_half(x_norm) * sin
@@ -153,13 +166,21 @@ def _rms_norm_rope_backward_kernel(
dW_row_stride,
n_rows,
n_cols,
n_rot,
n_heads,
rows_per_program,
HAS_WEIGHT: tl.constexpr,
BLOCK_SIZE: tl.constexpr,
):
"""
Backward for Y = RoPE(RMSNorm(X, W))
Backward for Y = RoPE(RMSNorm(X, W)) with optional partial rotary
(`n_rot <= n_cols`).
For col < n_rot the standard RoPE adjoint applies. For col >= n_rot the
output is just the normalized row, so dN[col] = dY[col] (achieved by
loading cos with default 1.0 and forcing the rotate-half contribution
to zero outside the rotary span).
cos/sin indexed by row_idx // n_heads for per-head broadcast.
"""
row_block_id = tl.program_id(0).to(tl.int64)
@@ -167,7 +188,8 @@ def _rms_norm_rope_backward_kernel(
row_end = min((row_block_id + 1) * rows_per_program, n_rows)
col_offsets = tl.arange(0, BLOCK_SIZE)
mask = col_offsets < n_cols
half_dim = n_cols // 2
rot_mask_col = col_offsets < n_rot
half_rot = n_rot // 2
dW_acc = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
@@ -186,33 +208,37 @@ def _rms_norm_rope_backward_kernel(
rstd = tl.load(RSTD_ptr + row_idx * RSTD_row_stride)
cos_row = tl.load(
COS_ptr + cs_row_idx * COS_row_stride + col_offsets, mask=mask, other=0
COS_ptr + cs_row_idx * COS_row_stride + col_offsets,
mask=rot_mask_col,
other=1.0,
).to(tl.float32)
# dN = dY * cos + rotate_half^T(dY * sin)
# dN = dY * cos + rotate_half^T(dY * sin) (within the rotary span)
# rotate_half^T([a, b]) = [b, -a] (adjoint of rotate_half)
#
# Compute rotate_half_transpose(dY * sin) by loading dY and sin at
# rotated offsets directly: dY[rot] * sin[rot] * adj_sign
# This is equivalent to rotating (dY * sin) because the rotation
# just permutes which elements are multiplied.
# For col >= n_rot the formula must collapse to dN = dY (since the
# forward is just a pass-through). cos defaults to 1.0 above; the
# rotate-half contribution is masked to zero below.
rot_offsets = tl.where(
col_offsets < half_dim, col_offsets + half_dim, col_offsets - half_dim
col_offsets < half_rot, col_offsets + half_rot, col_offsets - half_rot
)
rot_mask = rot_offsets < n_cols
rot_load_mask = (rot_offsets < n_cols) & rot_mask_col
dY_rot = tl.load(
dY_ptr + row_idx * dY_row_stride + rot_offsets,
mask=rot_mask & mask,
mask=rot_load_mask,
other=0,
).to(tl.float32)
sin_rot = tl.load(
SIN_ptr + cs_row_idx * SIN_row_stride + rot_offsets,
mask=rot_mask & mask,
mask=rot_load_mask,
other=0,
).to(tl.float32)
adj_sign = tl.where(col_offsets < half_dim, 1.0, -1.0)
dN = dY_row * cos_row + dY_rot * sin_rot * adj_sign
adj_sign = tl.where(col_offsets < half_rot, 1.0, -1.0)
rotate_term = dY_rot * sin_rot * adj_sign
# Zero out rotate-half contribution outside the rotary span.
rotate_term = tl.where(rot_mask_col, rotate_term, 0.0)
dN = dY_row * cos_row + rotate_term
# Pre-weight normalized: n = rstd * x
n = X_row * rstd
@@ -241,15 +267,17 @@ def _rms_norm_rope_backward_kernel(
)
def rms_norm_rope_forward(X, W, cos, sin, eps, n_heads):
def rms_norm_rope_forward(X, W, cos, sin, eps, n_heads, n_rot):
"""
Args:
X: (B*S*H, head_dim) — contiguous, flattened from (B, S, H, D)
W: (head_dim,) or None — RMSNorm weight
cos: (B*S, head_dim) — position embeddings (broadcast across heads)
sin: (B*S, head_dim) — position embeddings (broadcast across heads)
cos: (B*S, n_rot) — position embeddings (broadcast across heads)
sin: (B*S, n_rot) — position embeddings (broadcast across heads)
eps: float
n_heads: int — number of attention heads (for cos/sin indexing)
n_rot: int — rotary dim (== head_dim for full rotary, < head_dim for
partial rotary). Must be even and ``<= head_dim``.
Returns:
Y, X_saved, RSTD, BLOCK_SIZE, num_warps
"""
@@ -273,6 +301,7 @@ def rms_norm_rope_forward(X, W, cos, sin, eps, n_heads):
RSTD,
RSTD.stride(0),
n_cols,
n_rot,
n_heads,
eps,
HAS_WEIGHT=has_weight,
@@ -282,7 +311,9 @@ def rms_norm_rope_forward(X, W, cos, sin, eps, n_heads):
return Y, X, RSTD, BLOCK_SIZE, num_warps
def rms_norm_rope_backward(dY, X, W, cos, sin, RSTD, n_heads, BLOCK_SIZE, num_warps):
def rms_norm_rope_backward(
dY, X, W, cos, sin, RSTD, n_heads, n_rot, BLOCK_SIZE, num_warps
):
n_rows, n_cols = dY.shape
has_weight = W is not None
@@ -315,6 +346,7 @@ def rms_norm_rope_backward(dY, X, W, cos, sin, RSTD, n_heads, BLOCK_SIZE, num_wa
_dW.stride(0),
n_rows,
n_cols,
n_rot,
n_heads,
rows_per_program,
HAS_WEIGHT=has_weight,
@@ -329,13 +361,14 @@ def rms_norm_rope_backward(dY, X, W, cos, sin, RSTD, n_heads, BLOCK_SIZE, num_wa
class FusedRMSNormRoPEFunction(torch.autograd.Function):
@staticmethod
@ensure_contiguous
def forward(ctx, X, W, cos, sin, eps, n_heads):
def forward(ctx, X, W, cos, sin, eps, n_heads, n_rot):
"""
X: (B*S*H, head_dim)
W: (head_dim,) or None
cos: (B*S, head_dim) — broadcast across heads
sin: (B*S, head_dim) — broadcast across heads
X: (B*S*H, head_dim)
W: (head_dim,) or None
cos: (B*S, n_rot) — broadcast across heads
sin: (B*S, n_rot) — broadcast across heads
n_heads: int
n_rot: int — rotary dim (<= head_dim)
"""
Y, X_saved, RSTD, BLOCK_SIZE, num_warps = rms_norm_rope_forward(
X,
@@ -344,11 +377,13 @@ class FusedRMSNormRoPEFunction(torch.autograd.Function):
sin,
eps,
n_heads,
n_rot,
)
ctx.eps = eps
ctx.BLOCK_SIZE = BLOCK_SIZE
ctx.num_warps = num_warps
ctx.n_heads = n_heads
ctx.n_rot = n_rot
ctx.has_weight = W is not None
ctx.save_for_backward(X_saved, W, cos, sin, RSTD)
return Y
@@ -365,21 +400,26 @@ class FusedRMSNormRoPEFunction(torch.autograd.Function):
sin,
RSTD,
ctx.n_heads,
ctx.n_rot,
ctx.BLOCK_SIZE,
ctx.num_warps,
)
return dX, dW, None, None, None, None
return dX, dW, None, None, None, None, None
def fused_rms_norm_rope(x, weight, cos, sin, eps=1e-6):
"""
Apply fused RMSNorm + RoPE.
Apply fused RMSNorm + (partial) RoPE.
Args:
x: (batch, seq_len, num_heads, head_dim) — after projection + view
weight: (head_dim,) — RMSNorm weight, or None for no-scale norm
cos: (batch, seq_len, head_dim) — from RotaryEmbedding
sin: (batch, seq_len, head_dim) — from RotaryEmbedding
cos: (batch, seq_len, n_rot) — from RotaryEmbedding. ``n_rot``
must be even and ``<= head_dim``. When ``n_rot < head_dim``
the trailing ``head_dim - n_rot`` columns are RMSNorm-only
(partial-rotary pass-through), matching stock Gemma 4 with
``partial_rotary_factor < 1.0``.
sin: (batch, seq_len, n_rot) — same shape as ``cos``
eps: float — RMSNorm epsilon
Returns:
@@ -387,14 +427,38 @@ def fused_rms_norm_rope(x, weight, cos, sin, eps=1e-6):
"""
shape = x.shape # (B, S, H, D)
B, S, H, D = shape
n_rot = cos.shape[-1]
if sin.shape[-1] != n_rot:
raise ValueError(
f"cos and sin must have the same last dim, got cos={cos.shape[-1]} "
f"sin={sin.shape[-1]}"
)
if n_rot > D:
raise ValueError(f"rotary dim ({n_rot}) cannot exceed head_dim ({D})")
if n_rot % 2 != 0:
raise ValueError(f"rotary dim must be even, got {n_rot}")
# Flatten to 2D: (B*S*H, D)
x_flat = x.reshape(-1, D).contiguous()
# Flatten cos/sin to (B*S, D) — the kernel will handle per-head broadcast
# by dividing the row_idx by H to get the cos/sin row
cos_flat = cos.reshape(B * S, D).contiguous()
sin_flat = sin.reshape(B * S, D).contiguous()
# cos/sin may broadcast over the batch dim (e.g. (1, S, n_rot) when
# all sequences share the same rotary positions). The kernel needs a
# dense (B*S, n_rot) buffer so that row_idx // n_heads maps cleanly
# onto a single (b, s) pair, so expand-then-contiguous to materialize
# the per-batch broadcast. The expand is skipped entirely when cos.shape[0] == B.
if cos.shape[0] != B:
if cos.shape[0] != 1:
raise ValueError(
f"cos/sin batch dim ({cos.shape[0]}) must be 1 or equal "
f"to x batch dim ({B})"
)
cos = cos.expand(B, S, n_rot)
sin = sin.expand(B, S, n_rot)
cos_flat = cos.reshape(B * S, n_rot).contiguous()
sin_flat = sin.reshape(B * S, n_rot).contiguous()
y_flat = FusedRMSNormRoPEFunction.apply(x_flat, weight, cos_flat, sin_flat, eps, H)
y_flat = FusedRMSNormRoPEFunction.apply(
x_flat, weight, cos_flat, sin_flat, eps, H, n_rot
)
return y_flat.view(shape)
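For readers following the kernel change, the intended semantics can be written unfused in a few lines of PyTorch. This is an illustrative sketch only (not part of the diff; the function name is made up), assuming x is (B, S, H, D) and cos/sin are (B, S, n_rot) with n_rot even and <= D:

import torch

def reference_rms_norm_rope(x, weight, cos, sin, eps=1e-6):
    # RMSNorm over the full head_dim, RoPE on the first n_rot columns,
    # pass-through for the trailing head_dim - n_rot columns.
    n_rot = cos.shape[-1]
    x_f = x.float()
    rstd = torch.rsqrt(x_f.pow(2).mean(dim=-1, keepdim=True) + eps)
    x_norm = x_f * rstd
    if weight is not None:
        x_norm = x_norm * weight.float()
    x_rot, x_pass = x_norm[..., :n_rot], x_norm[..., n_rot:]
    half = n_rot // 2
    # rotate_half([a, b]) = [-b, a], restricted to the rotary span
    rotated = torch.cat((-x_rot[..., half:], x_rot[..., :half]), dim=-1)
    cos_b = cos.unsqueeze(2).float()  # broadcast over the head dimension
    sin_b = sin.unsqueeze(2).float()
    y_rot = x_rot * cos_b + rotated * sin_b
    return torch.cat((y_rot, x_pass), dim=-1).to(x.dtype)

Comparing this sketch numerically against fused_rms_norm_rope on random inputs is one way to check the partial-rotary pass-through behavior.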

View File

@@ -156,12 +156,21 @@ class PatchManager:
# which would clobber any earlier fix.
self._fix_nemotron_h_conversion_mapping()
# Gemma 4 hybrid attention runs here in post-build (NOT post-load):
# the per-layer ``self_attn.config._attn_implementation="sdpa"``
# override needs to walk the raw model tree, which the post-load
# PEFT wrapping would otherwise obscure. The accompanying
# ``patch_gemma4_hybrid_mask`` monkey-patch is module-level and
# insensitive to when it is installed, so both halves of the fix
# live cleanly in the same call even though one is instance-scoped
# and the other is module-scoped.
self._apply_gemma_hybrid_attention(model)
self._finalize_moe_expert_quantization(model)
def apply_post_model_load_patches(self, model: PreTrainedModel):
"""Apply patches that require the model instance."""
self._apply_llama_flash_attn_patches(model)
self._apply_unsloth_patches(model)
self._apply_lora_kernel_patch(model)
self._apply_scaling_softmax_patch(model)
@@ -172,12 +181,23 @@ class PatchManager:
which exceeds flash attention's supported size. This patch loads the model
with flash_attention_2 for the sliding window layers (head_dim=256), then
gives each global layer a shallow-copied config with _attn_implementation="sdpa".
We also install :func:`axolotl.monkeypatch.gemma4_hybrid_mask.patch_gemma4_hybrid_mask`
which fixes the corresponding mask construction inside
``Gemma4TextModel.forward``. Without it, the per-layer SDPA config
override is not enough — the forward still builds a 2D FA2-format mask
at the model level and the SDPA layers crash at long context lengths
with ``RuntimeError: The expanded size of the tensor ... must match``.
"""
if not self.cfg.gemma4_hybrid_attn_impl:
return
import copy
from axolotl.monkeypatch.gemma4_hybrid_mask import patch_gemma4_hybrid_mask
patch_gemma4_hybrid_mask()
# Navigate to the module that has 'layers' - varies by model structure:
# Gemma4ForConditionalGeneration -> .model (Gemma4Model) -> .language_model (Gemma4TextModel) -> .layers
# Gemma4ForCausalLM -> .model (Gemma4TextModel) -> .layers
@@ -391,20 +411,19 @@ class PatchManager:
patch_qwen3_5_vlm_flash_attention()
if self.cfg.model_config_type in ("gemma4", "gemma4_text"):
# The fused attn path is now compatible with
# ``gemma4_hybrid_attn_impl``: the kernel handles partial
# rotary (cos.shape[-1] < head_dim) and the fused forward
# mirrors the current ``Gemma4TextAttention.forward`` API
# for shared kv (read from / write to
# ``past_key_values.shared_layers``). See
# ``src/axolotl/kernels/GEMMA4_FUSED_ROPE_HYBRID_ATTN_BUG.md``
# for the history.
from axolotl.monkeypatch.models.gemma4.fused_attn import (
patch_gemma4_fused_attn,
)
# Shared-KV side channel when activation checkpointing (PR #3611).
fsdp_cfg = self.cfg.fsdp_config
needs_shared_kv_workaround = (not self.inference) and bool(
self.cfg.gradient_checkpointing
or self.cfg.activation_offloading
or (fsdp_cfg is not None and fsdp_cfg.activation_checkpointing)
)
patch_gemma4_fused_attn(
install_shared_kv_workaround=needs_shared_kv_workaround
)
patch_gemma4_fused_attn()
@staticmethod
def _fix_nemotron_h_conversion_mapping():
@@ -682,10 +701,24 @@ class PatchManager:
)
patch_fa_llama_cross_entropy()
elif self.cfg.unsloth_cross_entropy_loss:
from axolotl.monkeypatch.unsloth_ import integrate_cross_entropy_loss_patch
integrate_cross_entropy_loss_patch(model_type="llama")
if self.cfg.flash_attn_rms_norm and self.has_flash_attn:
from axolotl.monkeypatch.llama_attn_hijack_flash import patch_llama_rms_norm
patch_llama_rms_norm()
elif self.cfg.unsloth_rms_norm:
from axolotl.monkeypatch.unsloth_ import patch_unsloth_layernorm
patch_unsloth_layernorm()
if self.cfg.unsloth_lora_qkv or self.cfg.unsloth_lora_o:
from axolotl.monkeypatch.unsloth_ import patch_self_attn_lora
patch_self_attn_lora()
def _patch_llama_flash_attention(self):
"""Apply Flash Attention patches for LLaMA models."""
@@ -752,6 +785,23 @@ class PatchManager:
LOG.info("Patching with SwiGLU...")
replace_llama_mlp_with_swiglu(model)
def _apply_unsloth_patches(self, model):
"""Apply unsloth optimization patches."""
if self.cfg.unsloth_lora_mlp:
from axolotl.monkeypatch.unsloth_ import integrate_lora_mlp_patch
integrate_lora_mlp_patch(peft_model=model)
if self.cfg.unsloth_lora_qkv or self.cfg.unsloth_lora_o:
from axolotl.monkeypatch.unsloth_ import integrate_lora_patch
integrate_lora_patch(peft_model=model, cfg=self.cfg)
if self.cfg.unsloth_rope:
from axolotl.monkeypatch.unsloth_ import integrate_rope_embeddings
integrate_rope_embeddings()
def _apply_lora_kernel_patch(self, model):
"""Apply LoRA kernel patches."""
if (

View File

@@ -0,0 +1,115 @@
"""Hybrid attention mask fix for Gemma 4.
Gemma 4 has full-attention (global) layers with ``head_dim=512`` which
exceeds flash-attention-2's supported size. Axolotl's hybrid-attention
patch in ``patch_manager._apply_gemma_hybrid_attention`` works around
this by forcing ``_attn_implementation="sdpa"`` on each global layer's
``self_attn.config``, leaving sliding-window layers on FA2.
The per-layer config override alone is insufficient, however:
``Gemma4TextModel.forward`` builds a single ``causal_mask_mapping`` dict
using the **model-level** config and passes the mapped mask to each
decoder layer. With FA2 still set at the model level, the ``full_attention``
entry in that mapping is a 2D mask (FA2 format), but SDPA needs a 4D mask.
The global layers then fail with::
RuntimeError: The expanded size of the tensor (S) must match the existing
size (B) at non-singleton dimension 2. Target sizes: [B, H, S, S]. Tensor
sizes: [B, S]
...when the sequence length grows past roughly 7k tokens.
This module fixes the symptom by monkey-patching ``create_causal_mask`` in
``transformers.models.gemma4.modeling_gemma4``'s module namespace — NOT
the original in ``masking_utils``. The wrapper forces
``_attn_implementation="sdpa"`` on a shallow-copied config before calling
through, so the ``full_attention`` mask built inside ``Gemma4TextModel.forward``
is always 4D/SDPA-compatible. ``create_sliding_window_causal_mask`` is left
alone, so sliding-window layers continue to receive FA2-format masks.
The patch is idempotent. Install once per process, before any Gemma 4
forward pass runs.
"""
from __future__ import annotations
import copy
from typing import Any
from axolotl.utils.logging import get_logger
LOG = get_logger(__name__)
_PATCH_APPLIED = False
def patch_gemma4_hybrid_mask() -> bool:
"""Install the Gemma 4 hybrid-attention mask fix.
Returns ``True`` if the patch was installed (or was already installed),
``False`` if the target module could not be imported (e.g. transformers
version predates Gemma 4) — in which case nothing is done and the
caller can continue unaffected.
"""
global _PATCH_APPLIED
if _PATCH_APPLIED:
return True
try:
from transformers.models.gemma4 import modeling_gemma4
except ImportError:
LOG.debug(
"gemma4_hybrid_mask: transformers.models.gemma4 not importable, "
"skipping. This is fine for non-Gemma4 training."
)
return False
if not hasattr(modeling_gemma4, "create_causal_mask"):
LOG.warning(
"gemma4_hybrid_mask: modeling_gemma4 has no 'create_causal_mask' "
"binding, skipping. Transformers API may have changed."
)
return False
original = modeling_gemma4.create_causal_mask
def hybrid_create_causal_mask(config: Any, *args: Any, **kwargs: Any):
"""Wrapper that forces SDPA format for the full-attention mask.
The global layers were patched to SDPA by
``_apply_gemma_hybrid_attention``, so their mask must be 4D. The
original ``create_causal_mask`` dispatches on
``config._attn_implementation``; we shadow that with a local
override.
"""
sdpa_config = copy.copy(config)
sdpa_config._attn_implementation = "sdpa"
return original(sdpa_config, *args, **kwargs)
# Preserve the original reference on the wrapper for tests / teardown.
hybrid_create_causal_mask._axolotl_original = original # type: ignore[attr-defined]
modeling_gemma4.create_causal_mask = hybrid_create_causal_mask
_PATCH_APPLIED = True
LOG.info(
"gemma4_hybrid_mask: patched modeling_gemma4.create_causal_mask to "
"force SDPA-format masks for full-attention layers"
)
return True
def unpatch_gemma4_hybrid_mask() -> None:
"""Restore the original ``create_causal_mask``. Useful for tests."""
global _PATCH_APPLIED
if not _PATCH_APPLIED:
return
try:
from transformers.models.gemma4 import modeling_gemma4
except ImportError:
_PATCH_APPLIED = False
return
current = modeling_gemma4.create_causal_mask
original = getattr(current, "_axolotl_original", None)
if original is not None:
modeling_gemma4.create_causal_mask = original
_PATCH_APPLIED = False
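Typical usage, sketched (not part of the diff): install the mask fix once per process before the first Gemma 4 forward pass, and restore it in test teardown if needed.

from axolotl.monkeypatch.gemma4_hybrid_mask import (
    patch_gemma4_hybrid_mask,
    unpatch_gemma4_hybrid_mask,
)

installed = patch_gemma4_hybrid_mask()  # False if transformers predates Gemma 4
try:
    ...  # load the model and run long-context Gemma 4 forward passes
finally:
    unpatch_gemma4_hybrid_mask()  # mainly useful in tests; the patch is idempotent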

View File

@@ -6,29 +6,15 @@ kernels, eliminating intermediate tensor allocations from rotate_half / apply_ro
Usage:
from axolotl.monkeypatch.models.gemma4.fused_attn import patch_gemma4_fused_attn
# Pass install_shared_kv_workaround=True when activation checkpointing is enabled.
patch_gemma4_fused_attn(install_shared_kv_workaround=True)
patch_gemma4_fused_attn()
"""
import logging
from typing import Callable
import torch
from axolotl.utils.logging import get_logger
logger = get_logger(__name__)
# Module-level dict used as a side channel for shared KV states avoiding kwarg and TLS
# to prevent memory leak on gradient checkpoint enabled training (PR #3611)
_GEMMA4_SHARED_KV_STORE: dict = {"store": None}
def _set_shared_kv_states(store):
_GEMMA4_SHARED_KV_STORE["store"] = store
def _get_shared_kv_states():
return _GEMMA4_SHARED_KV_STORE["store"]
logger = logging.getLogger(__name__)
def _make_fused_forward(original_forward):
@@ -44,7 +30,7 @@ def _make_fused_forward(original_forward):
hidden_states: torch.Tensor,
position_embeddings: torch.Tensor,
attention_mask: torch.Tensor | None,
shared_kv_states: dict[int, tuple[torch.Tensor, torch.Tensor]] | None = None,
shared_kv_states: dict[int, tuple[torch.Tensor, torch.Tensor]],
past_key_values=None,
**kwargs,
) -> tuple[torch.Tensor, torch.Tensor | None]:
@@ -53,10 +39,6 @@ def _make_fused_forward(original_forward):
eager_attention_forward,
)
store = _get_shared_kv_states()
if store is not None:
shared_kv_states = store
input_shape = hidden_states.shape[:-1]
hidden_shape = (*input_shape, -1, self.head_dim)
eps = self.config.rms_norm_eps
@@ -151,44 +133,15 @@ def _make_fused_forward(original_forward):
return fused_forward
def _patch_decoder_layer_call():
"""Strip `shared_kv_states` from decoder-layer kwargs and route via the
module-level side channel so the checkpoint partial cannot pin it (PR #3611).
def patch_gemma4_fused_attn():
"""
from transformers.models.gemma4.modeling_gemma4 import Gemma4TextDecoderLayer
if getattr(Gemma4TextDecoderLayer, "_axolotl_shared_kv_patched", False):
return
original_call = Gemma4TextDecoderLayer.__call__
def patched_call(self, *args, **kwargs):
shared_kv = kwargs.pop("shared_kv_states", None)
# Overwrite unconditionally (including with None) so a previous step's
# dict cannot leak into a later call without shared_kv_states (PR #3611).
_set_shared_kv_states(shared_kv)
return original_call(self, *args, **kwargs)
Gemma4TextDecoderLayer.__call__ = patched_call
Gemma4TextDecoderLayer._axolotl_shared_kv_patched = True
def patch_gemma4_fused_attn(install_shared_kv_workaround: bool = False):
"""
Monkeypatch Gemma4TextAttention.forward to use fused RMSNorm+RoPE kernels,
and optionally route `shared_kv_states` via a module-level side channel to
avoid a VRAM leak under activation checkpointing (PR #3611).
Monkeypatch Gemma4TextAttention.forward to use fused RMSNorm+RoPE kernels.
"""
from transformers.models.gemma4.modeling_gemma4 import Gemma4TextAttention
original_forward = Gemma4TextAttention.forward
Gemma4TextAttention.forward = _make_fused_forward(original_forward)
if install_shared_kv_workaround:
_patch_decoder_layer_call()
logger.info(
"Patched Gemma4TextAttention.forward with fused RMSNorm+RoPE Triton kernels"
)
if install_shared_kv_workaround:
logger.info("Installed Gemma4 shared_kv_states side channel (PR #3611)")

View File

@@ -24,7 +24,15 @@ def patch_tiled_mlp(model_type, use_original_mlp=True, cfg_num_shards=None):
module_path = f"transformers.models.{model_type}.modeling_{model_type}"
model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type)
module = __import__(module_path, fromlist=[f"{model_cls_prefix}MLP"])
mlp_cls = getattr(module, f"{model_cls_prefix}MLP")
# Some multimodal wrappers (e.g. Gemma 4) name the MLP class
# ``{prefix}TextMLP`` rather than ``{prefix}MLP`` because the
# language-side module is separated from the vision tower. Try
# both names before giving up.
mlp_cls = getattr(
module,
f"{model_cls_prefix}MLP",
None,
) or getattr(module, f"{model_cls_prefix}TextMLP")
if use_original_mlp:
mlp_forward = mlp_cls.forward

View File

@@ -0,0 +1,252 @@
"""module for patching with unsloth optimizations"""
import inspect
import types
import torch
from peft import PeftModelForCausalLM
from torch import nn
from transformers.models.llama.modeling_llama import LlamaFlashAttention2
from axolotl.monkeypatch.utils import detab_code
from axolotl.utils.logging import get_logger
LOG = get_logger(__name__)
ORIGINAL_QKV_CODE = """
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
""".lstrip("\n")
PATCHED_QKV_CODE = """
query_states, key_states, value_states = self.apply_qkv(self, hidden_states)
""".lstrip("\n")
ORIGINAL_O_CODE = """
attn_output = self.o_proj(attn_output)
""".lstrip("\n")
PATCHED_O_CODE = """
attn_output = self.apply_o(self, attn_output)
""".lstrip("\n")
def original_apply_qkv(self, hidden_states):
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
return query_states, key_states, value_states
def original_apply_o(self, hidden_states):
attn_output = self.o_proj(hidden_states)
return attn_output
def get_self_attn_code() -> str:
forward = inspect.getsource(LlamaFlashAttention2.forward)
return forward
def check_self_attn_is_patchable() -> bool:
qkv = get_self_attn_code()
qkv, _ = detab_code(qkv)
return ORIGINAL_QKV_CODE in qkv and ORIGINAL_O_CODE in qkv
def integrate_cross_entropy_loss_patch(model_type: str = "llama") -> None:
from unsloth.kernels.cross_entropy_loss import fast_cross_entropy_loss
def UnslothForCausalLMLoss(
logits,
labels,
vocab_size: int,
num_items_in_batch: int = None,
ignore_index: int = -100,
**kwargs,
):
# Upcast to float if we need to compute the loss to avoid potential precision issues
logits = logits.float()
# Shift so that tokens < n predict n
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss = fast_cross_entropy_loss(
logits=shift_logits, labels=shift_labels, n_items=num_items_in_batch
)
return loss
if model_type == "llama":
from transformers.loss import loss_utils
loss_utils.ForCausalLMLoss = UnslothForCausalLMLoss # type: ignore[assignment]
else:
raise ValueError("Unsupported model type")
self_attn_lora_patched = False
def patch_self_attn_lora():
global self_attn_lora_patched
if self_attn_lora_patched:
# prevent patching multiple times
return
self_attn_forward = get_self_attn_code()
LlamaFlashAttention2._original_forward = self_attn_forward
self_attn_forward, _ = detab_code(self_attn_forward)
assert ORIGINAL_QKV_CODE in self_attn_forward, "Original qkv code not found"
assert ORIGINAL_O_CODE in self_attn_forward, "Original o code not found"
self_attn_forward = self_attn_forward.replace(ORIGINAL_QKV_CODE, PATCHED_QKV_CODE)
self_attn_forward = self_attn_forward.replace(ORIGINAL_O_CODE, PATCHED_O_CODE)
self_attn_forward = self_attn_forward.replace(
"def forward(",
"def unsloth_attn_forward(",
1,
)
# load imports necessary
import transformers.models.llama.modeling_llama
items_to_import = []
for item in dir(transformers.models.llama.modeling_llama):
if item in self_attn_forward:
items_to_import.append(item)
exec(
"from transformers.models.llama.modeling_llama import ("
+ ", ".join(x for x in items_to_import)
+ ")",
globals(),
)
exec(self_attn_forward, globals())
self_attn_lora_patched = True
LOG.info("patching unsloth attn lora")
LlamaFlashAttention2.forward = unsloth_attn_forward
def integrate_rope_embeddings():
import transformers.models.llama.modeling_llama
from unsloth.kernels.rope_embedding import fast_rope_embedding
def apply_rotary_pos_emb(
q,
k,
cos,
sin,
position_ids=None,
unsqueeze_dim=1,
):
return fast_rope_embedding(q, k, cos, sin)
LOG.info("patching unsloth RoPE embeddings")
transformers.models.llama.modeling_llama.apply_rotary_pos_emb = apply_rotary_pos_emb
def integrate_lora_mlp_patch(peft_model: PeftModelForCausalLM):
if peft_model.base_model.config.model_type in ["llama", "mistral"]:
from unsloth.kernels import apply_lora_mlp_swiglu
apply_lora_mlp = apply_lora_mlp_swiglu
elif peft_model.base_model.config.model_type == "gemma":
from unsloth.kernels import apply_lora_mlp_geglu_approx
apply_lora_mlp = apply_lora_mlp_geglu_approx
else:
raise NotImplementedError(
f"Model type {peft_model.base_model.config.model_type} not supported"
)
for idx, layer in enumerate(peft_model.model.model.layers):
layer_modules = [
getattr(layer.mlp, linear_proj)
for linear_proj in ["gate_proj", "up_proj", "down_proj"]
]
is_mlp_lora = all(hasattr(module, "lora_A") for module in layer_modules)
mlp_no_bias = all(
getattr(module, "base_layer", module).bias is None
for module in layer_modules
)
mlp_not_dora = all(
len(getattr(module, "lora_magnitude_vector", []) or []) == 0
for module in layer_modules
)
if is_mlp_lora and mlp_no_bias and mlp_not_dora:
layer.mlp.forward = types.MethodType(apply_lora_mlp, layer.mlp)
else:
LOG.warning(f"unable to apply unsloth lora mlp patch to layer {idx}")
def integrate_lora_patch(peft_model: PeftModelForCausalLM, cfg):
from unsloth.kernels import apply_lora_o, apply_lora_qkv
for idx, layer in enumerate(peft_model.model.model.layers):
if cfg.unsloth_lora_qkv:
layer_modules = [
getattr(layer.self_attn, linear_proj)
for linear_proj in ["q_proj", "k_proj", "v_proj"]
]
is_qkv_lora = all(hasattr(module, "lora_A") for module in layer_modules)
qkv_no_bias = all(
getattr(module, "base_layer", module).bias is None
for module in layer_modules
)
qkv_not_dora = all(
len(getattr(module, "lora_magnitude_vector", []) or []) == 0
for module in layer_modules
)
if is_qkv_lora and qkv_no_bias and qkv_not_dora:
layer.self_attn.apply_qkv = apply_lora_qkv
else:
layer.self_attn.apply_qkv = original_apply_qkv
LOG.warning(f"unable to apply unsloth lora qkv patch to layer {idx}")
if cfg.unsloth_lora_o:
layer_modules = [
getattr(layer.self_attn, linear_proj) for linear_proj in ["o_proj"]
]
is_o_lora = all(hasattr(module, "lora_A") for module in layer_modules)
o_no_bias = all(
getattr(module, "base_layer", module).bias is None
for module in layer_modules
)
o_not_dora = all(
len(getattr(module, "lora_magnitude_vector", []) or []) == 0
for module in layer_modules
)
if is_o_lora and o_no_bias and o_not_dora:
layer.self_attn.apply_o = apply_lora_o
else:
layer.self_attn.apply_o = original_apply_o
LOG.warning(f"unable to apply unsloth lora o_proj patch to layer {idx}")
def patch_unsloth_layernorm():
try:
import transformers.models.llama.modeling_llama
from unsloth.kernels.rms_layernorm import Fast_RMS_Layernorm
class LlamaRMSNorm(nn.Module):
"""LlamaRMSNorm"""
def __init__(self, hidden_size, eps=1e-6):
"""
LlamaRMSNorm is equivalent to T5LayerNorm
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
return Fast_RMS_Layernorm.apply(
hidden_states, self.weight, self.variance_epsilon, False
)
LOG.info("patching with unsloth.kernels.rms_layernorm")
transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm
except ImportError:
LOG.warning("missing unsloth library")

View File

@@ -320,6 +320,15 @@ def main(script_args: ScriptArguments):
# --- Active LoRA state (shared across endpoints via closure) ---
active_lora: dict = {"request": None}
# Serializes access to the worker pipe. The underlying
# multiprocessing.Connection is a single full-duplex stream shared
# across all HTTP handlers; concurrent requests interleave bytes on
# the wire and corrupt the pickle framing (seen as
# ``UnpicklingError: pickle data was truncated``). Any endpoint that
# does ``conn.send(...); conn.recv()`` MUST hold this lock across
# the round-trip so that at most one call is in flight per pipe.
worker_pipe_lock = asyncio.Lock()
# ------------------------------------------------------------------
# LoRA-specific endpoints
# ------------------------------------------------------------------
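The lock usage the comment prescribes boils down to the pattern below (a minimal sketch; the real endpoints in this diff hold the lock while gathering replies from several worker connections):

async def call_worker(conn, payload):
    # Hold the lock for the entire send+recv round-trip on this pipe so
    # concurrent handlers cannot interleave pickle frames on the wire.
    async with worker_pipe_lock:
        conn.send(payload)
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, conn.recv)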
@@ -631,6 +640,150 @@ def main(script_args: ScriptArguments):
},
}
@app.post("/v1/completions")
async def openai_completions(request_body: dict):
"""OpenAI-compatible text-completions endpoint.
Accepts either a string ``prompt`` or a list-of-int
``prompt_token_ids`` (as the text-completions spec allows). Routes
to the internal vLLM generate method with the active LoRA adapter
and returns an OpenAI /v1/completions-shaped response including
per-choice ``prompt_token_ids``, ``generation_token_ids``, and
``generation_log_probs`` for NeMo Gym agents that need raw
tokens + logprobs.
"""
import uuid
prompt_raw = request_body.get("prompt")
temperature = request_body.get("temperature", 1.0)
max_tokens = request_body.get("max_tokens", 512)
top_p = request_body.get("top_p", 1.0)
n = request_body.get("n", 1)
logprobs = request_body.get("logprobs") or 0
stop_token_ids = request_body.get("stop_token_ids") or None
# Accept either a string ``prompt`` or a list[int] of token ids.
# Anything that is neither a string nor a list is rejected with an
# invalid_request error so callers get a clear failure. Also accept
# [[int, int, ...]] nesting for the rare case where callers pass a
# single-prompt batch.
if (
isinstance(prompt_raw, list)
and prompt_raw
and isinstance(prompt_raw[0], list)
):
prompt_raw = prompt_raw[0]
prompt_dict: dict[str, Any] = {}
if isinstance(prompt_raw, list):
prompt_dict = {"prompt_token_ids": prompt_raw}
elif isinstance(prompt_raw, str):
prompt_dict = {"prompt": prompt_raw}
else:
return {
"error": {
"message": ("prompt must be a string or a list of token ids"),
"type": "invalid_request",
}
}
generation_kwargs: dict[str, Any] = {
"n": n,
"temperature": temperature,
"top_p": top_p,
"max_tokens": max_tokens,
"logprobs": logprobs,
}
if stop_token_ids:
generation_kwargs["stop_token_ids"] = stop_token_ids
sampling_params = SamplingParams(
**{k: v for k, v in generation_kwargs.items() if v is not None}
)
chunked = chunk_list([prompt_dict], script_args.data_parallel_size)
# Hold the pipe lock across send+recv — concurrent requests would
# otherwise interleave pickle frames on the worker connection.
async with worker_pipe_lock:
for conn, chunk in zip(connections, chunked, strict=True):
if not chunk:
chunk = [{"prompt": "<placeholder>"}]
kwargs = {
"prompts": chunk,
"sampling_params": sampling_params,
"lora_request": active_lora["request"],
}
conn.send({"type": "call", "method": "generate", "kwargs": kwargs})
loop = asyncio.get_running_loop()
all_outputs = await asyncio.gather(
*(loop.run_in_executor(None, safe_recv, conn) for conn in connections)
)
all_outputs = [o for o, c in zip(all_outputs, chunked, strict=True) if c]
for o in all_outputs:
if isinstance(o, dict) and "error" in o:
raise RuntimeError(f"vLLM worker error: {o['error']}")
all_outputs = list(chain.from_iterable(all_outputs))
if not all_outputs:
return {"choices": [], "model": script_args.model}
choices = []
for i, output in enumerate(all_outputs):
for j, out in enumerate(output.outputs):
text = out.text
# OpenAI-style `logprobs` block for text-completions:
# { "tokens": [...], "token_logprobs": [...] }
lp_block = None
if out.logprobs:
tokens_str: list[str] = []
token_lps: list[float] = []
for step in out.logprobs:
chosen = next(iter(step.values()))
tokens_str.append(getattr(chosen, "decoded_token", "") or "")
token_lps.append(float(chosen.logprob))
lp_block = {
"tokens": tokens_str,
"token_logprobs": token_lps,
}
choice = {
"index": i * n + j,
"text": text,
"finish_reason": "stop"
if out.finish_reason == "stop"
else "length",
"logprobs": lp_block,
# NeMo-Gym / retrace agent extras — preserved on the
# choice so callers with raw-token pipelines don't
# have to re-tokenize.
"prompt_token_ids": output.prompt_token_ids,
"generation_token_ids": list(out.token_ids),
"generation_log_probs": (
[float(next(iter(lp.values())).logprob) for lp in out.logprobs]
if out.logprobs
else []
),
}
choices.append(choice)
prompt_tokens = len(all_outputs[0].prompt_token_ids) if all_outputs else 0
completion_tokens = sum(
len(out.token_ids) for o in all_outputs for out in o.outputs
)
return {
"id": f"cmpl-{uuid.uuid4().hex[:8]}",
"object": "text_completion",
"model": script_args.model,
"choices": choices,
"usage": {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens,
},
}
# --- Weight sync endpoints (legacy fallback, same as TRL) ---
@app.post("/init_communicator/")
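A client-side sketch of the new route (host, port, and token ids are placeholders; the response fields match the shape built above):

import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",  # placeholder address
    json={
        "prompt": [101, 2023, 2003, 1037, 3231],  # list[int] -> prompt_token_ids path
        "max_tokens": 64,
        "temperature": 0.7,
        "logprobs": 1,
    },
    timeout=120,
)
choice = resp.json()["choices"][0]
print(choice["text"])
print(choice["generation_token_ids"][:8], choice["generation_log_probs"][:8])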

View File

@@ -823,6 +823,13 @@ class AxolotlInputConfig(
},
)
unsloth_cross_entropy_loss: bool | None = None
unsloth_lora_mlp: bool | None = None
unsloth_lora_qkv: bool | None = None
unsloth_lora_o: bool | None = None
unsloth_rms_norm: bool | None = None
unsloth_rope: bool | None = None
lora_mlp_kernel: bool | None = Field(
default=None,
json_schema_extra={
@@ -1462,6 +1469,21 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
)
return data
@model_validator(mode="before")
@classmethod
def check_multigpu_unsloth(cls, data):
if (
data.get("unsloth_lora_mlp")
or data.get("unsloth_lora_qkv")
or data.get("unsloth_lora_o")
):
capabilities = data.get("capabilities")
if capabilities and capabilities.get("n_gpu", 0) > 1:
raise ValueError(
"unsloth_lora_mlp, unsloth_lora_qkv, and unsloth_lora_o are not compatible with multi-GPU training."
)
return data
@model_validator(mode="before")
@classmethod
def check_multigpu_lora_kernels(cls, data):
@@ -1515,7 +1537,8 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
# RL trainers not tested so don't enable kernels by default
return data
if data.get("adapter") in ["lora", "qlora"]:
# Skip if already set or using 8-bit
# Skip if already set, using unsloth optimizations, or using 8-bit
unsloth_fields = ["unsloth_lora_mlp", "unsloth_lora_qkv", "unsloth_lora_o"]
kernel_fields = [
"lora_mlp_kernel",
"lora_qkv_kernel",
@@ -1524,6 +1547,7 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
]
if (
any(data.get(k) is not None for k in kernel_fields)
or any(data.get(k) for k in unsloth_fields)
or data.get("adapter") == "lora"
and data.get("load_in_8bit")
):

View File

@@ -52,26 +52,6 @@ class DatasetValidationMixin:
return datasets
@model_validator(mode="before")
@classmethod
def check_deprecated_unsloth_fields(cls, data):
deprecated_fields = [
"unsloth_cross_entropy_loss",
"unsloth_lora_mlp",
"unsloth_lora_qkv",
"unsloth_lora_o",
"unsloth_rms_norm",
"unsloth_rope",
]
found = [f for f in deprecated_fields if data.get(f)]
if found:
raise ValueError(
f"`{'`, `'.join(found)}` {'has' if len(found) == 1 else 'have'} been removed. "
"Please use `lora_mlp_kernel`, `lora_qkv_kernel`, `lora_o_kernel` instead. "
"See: https://docs.axolotl.ai/docs/lora_optims.html"
)
return data
@model_validator(mode="before")
@classmethod
def check_dataset_or_pretraining_dataset(cls, data):
@@ -627,6 +607,36 @@ class LoRAValidationMixin:
)
return data
@model_validator(mode="before")
@classmethod
def check_qlora_unsloth(cls, data):
if (
data.get("unsloth_lora_mlp")
or data.get("unsloth_lora_qkv")
or data.get("unsloth_lora_o")
):
if data.get("adapter") == "lora" and data.get("load_in_8bit"):
raise ValueError(
"unsloth_lora_mlp, unsloth_lora_qkv, and unsloth_lora_o are not compatible with 8-bit LoRA"
)
return data
@model_validator(mode="before")
@classmethod
def check_lora_axolotl_unsloth(cls, data):
is_lora_kernel = any(
data.get(k) for k in ["lora_mlp_kernel", "lora_qkv_kernel", "lora_o_kernel"]
)
is_unsloth_lora = any(
data.get(k)
for k in ["unsloth_lora_mlp", "unsloth_lora_qkv", "unsloth_lora_o"]
)
if is_lora_kernel and is_unsloth_lora:
raise ValueError(
"lora_mlp_kernel and unsloth_lora_mlp cannot both be true (similarly for lora_qkv_kernel, lora_o_kernel)"
)
return data
@model_validator(mode="after")
def check_fused_lora(self):
if self.adapter in ["lora", "qlora"] and self.flash_attn_fuse_mlp:
@@ -760,6 +770,88 @@ class RLValidationMixin:
)
return data
@model_validator(mode="before")
@classmethod
def check_grpo_batch_size_divisibility(cls, data):
"""Surface GRPO batch-shape mismatches at config-parse time.
TRL's GRPOTrainer requires that the per-step generation batch size be
evenly divisible by ``num_generations`` so that every prompt can be
replicated exactly ``num_generations`` times. The runtime check inside
``GRPOTrainer.__init__`` only fires after the model has been loaded —
too late and too cryptic for the user. We replicate the check here so
the failure is immediate and actionable.
Also enforces:
- ``num_generations >= 2`` (group-relative advantage needs variance)
- ``effective_gbs >= num_generations * world_size`` when capabilities
indicate multiple ranks (each rank needs at least one full group)
"""
if data.get("rl") != "grpo":
return data
trl_cfg = data.get("trl") or {}
num_gen = trl_cfg.get("num_generations")
if num_gen is None:
# TRL's own default is 8 — but if the user didn't set it, we
# don't have enough info to validate anything. Let TRL's own
# init handle the default-vs-batch interaction.
return data
if num_gen < 2:
raise ValueError(
f"GRPO requires `trl.num_generations >= 2` (got {num_gen}). "
"With num_generations=1, every group has zero advantage and "
"the policy never updates."
)
explicit_gbs = trl_cfg.get("generation_batch_size")
if explicit_gbs is not None:
effective_gbs = int(explicit_gbs)
gbs_source = "trl.generation_batch_size"
else:
mb = data.get("micro_batch_size") or 1
ga = data.get("gradient_accumulation_steps") or 1
effective_gbs = int(mb) * int(ga)
gbs_source = f"micro_batch_size ({mb}) * gradient_accumulation_steps ({ga})"
if effective_gbs % num_gen != 0:
# Suggest the smallest GA bump that fixes it for the common case
# where the user hasn't set generation_batch_size explicitly.
hint = ""
if explicit_gbs is None:
from math import gcd
mb_val = int(data.get("micro_batch_size") or 1)
# smallest GA such that mb*GA is a multiple of num_gen
lcm = num_gen * mb_val // gcd(num_gen, mb_val)
suggested_ga = lcm // mb_val
hint = (
f" Smallest fix: set `gradient_accumulation_steps: "
f"{suggested_ga}` (so micro_batch_size * GA = "
f"{mb_val * suggested_ga} is a multiple of {num_gen})."
)
raise ValueError(
f"GRPO: generation batch size must be divisible by "
f"`trl.num_generations`. Got effective_gbs={effective_gbs} "
f"(from {gbs_source}) and num_generations={num_gen}.{hint}"
)
# Multi-rank check: each rank must receive at least one full group
# per step. Without `capabilities` populated yet (mode='before'), we
# fall back to user-set distributed fields.
world_size = (
(data.get("capabilities") or {}).get("n_gpu") or data.get("world_size") or 1
)
if world_size and world_size > 1 and effective_gbs < num_gen * world_size:
raise ValueError(
f"GRPO with world_size={world_size} requires effective_gbs "
f">= num_generations * world_size = {num_gen * world_size}, "
f"got {effective_gbs}. Increase gradient_accumulation_steps "
f"or micro_batch_size."
)
return data
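Worked example of the divisibility rule and the suggested-GA hint (an arithmetic sketch, not code from this diff): micro_batch_size=4, gradient_accumulation_steps=1, num_generations=8 gives effective_gbs=4, which 8 does not divide; the smallest fix is GA=2.

from math import gcd

mb, ga, num_gen = 4, 1, 8
effective_gbs = mb * ga                    # 4 -> 4 % 8 != 0 -> config rejected
lcm = num_gen * mb // gcd(num_gen, mb)     # 8
suggested_ga = lcm // mb                   # 2
assert (mb * suggested_ga) % num_gen == 0  # 4 * 2 = 8 is a multiple of 8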
class OptimizationValidationMixin:
"""Validation methods related to optimization and performance."""
@@ -850,6 +942,17 @@ class OptimizationValidationMixin:
return data
@model_validator(mode="before")
@classmethod
def check_xentropy_patch_conflicts(cls, data):
if data.get("flash_attn_cross_entropy") and data.get(
"unsloth_cross_entropy_loss"
):
raise ValueError(
"flash_attn_cross_entropy and unsloth_cross_entropy_loss cannot be both enabled"
)
return data
@model_validator(mode="before")
@classmethod
def check_cross_entropy_conflicts(cls, data):

View File

@@ -0,0 +1,102 @@
"""
dynamic requirements for axolotl
"""
import platform
import re
from importlib.metadata import PackageNotFoundError, version
from setuptools.command.build_py import build_py as _build_py
def parse_requirements():
_install_requires = []
_dependency_links = []
with open("./requirements.txt", encoding="utf-8") as requirements_file:
lines = [r.strip() for r in requirements_file.readlines()]
for line in lines:
is_extras = (
"flash-attn" in line
or "flash-attention" in line
or "deepspeed" in line
or "mamba-ssm" in line
or "lion-pytorch" in line
)
if line.startswith("--extra-index-url"):
# Handle custom index URLs
_, url = line.split()
_dependency_links.append(url)
elif not is_extras and line and line[0] != "#":
# Handle standard packages
_install_requires.append(line)
try:
xformers_version = [req for req in _install_requires if "xformers" in req][0]
torchao_version = [req for req in _install_requires if "torchao" in req][0]
if "Darwin" in platform.system():
# don't install xformers on MacOS
_install_requires.pop(_install_requires.index(xformers_version))
else:
# detect the version of torch already installed
# and set it so dependencies don't clobber the torch version
try:
torch_version = version("torch")
except PackageNotFoundError:
torch_version = "2.5.1"
_install_requires.append(f"torch=={torch_version}")
version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version)
if version_match:
major, minor, patch = version_match.groups()
major, minor = int(major), int(minor)
patch = (
int(patch) if patch is not None else 0
) # Default patch to 0 if not present
else:
raise ValueError("Invalid version format")
if (major, minor) >= (2, 5):
_install_requires.pop(_install_requires.index(xformers_version))
if patch == 0:
_install_requires.append("xformers==0.0.28.post2")
else:
_install_requires.append("xformers==0.0.28.post3")
elif (major, minor) >= (2, 4):
if patch == 0:
_install_requires.pop(_install_requires.index(xformers_version))
_install_requires.append("xformers>=0.0.27")
else:
_install_requires.pop(_install_requires.index(xformers_version))
_install_requires.append("xformers==0.0.28.post1")
elif (major, minor) >= (2, 3):
_install_requires.pop(_install_requires.index(torchao_version))
if patch == 0:
_install_requires.pop(_install_requires.index(xformers_version))
_install_requires.append("xformers>=0.0.26.post1")
else:
_install_requires.pop(_install_requires.index(xformers_version))
_install_requires.append("xformers>=0.0.27")
elif (major, minor) >= (2, 2):
_install_requires.pop(_install_requires.index(torchao_version))
_install_requires.pop(_install_requires.index(xformers_version))
_install_requires.append("xformers>=0.0.25.post1")
else:
_install_requires.pop(_install_requires.index(torchao_version))
_install_requires.pop(_install_requires.index(xformers_version))
_install_requires.append("xformers>=0.0.23.post1")
except PackageNotFoundError:
pass
return _install_requires, _dependency_links
class BuildPyCommand(_build_py):
"""
custom build_py command to parse dynamic requirements
"""
def finalize_options(self):
super().finalize_options()
install_requires, _ = parse_requirements()
self.distribution.install_requires = install_requires
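Hypothetical wiring in setup.py (a sketch; the actual setup() call is not part of this diff): route the dynamic requirements through the custom build command so install_requires is resolved against the installed torch at build time.

from setuptools import setup

# BuildPyCommand is imported from the module above (its path is not shown in
# this diff); finalize_options() swaps in the torch-aware requirement list.
setup(cmdclass={"build_py": BuildPyCommand})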

View File

@@ -325,10 +325,10 @@ def download_phi_4_reasoning_model_fixture():
@pytest.fixture(scope="session", autouse=True)
def download_phi_3_mini_model_fixture():
def download_phi_3_medium_model_fixture():
# download the tokenizer only
snapshot_download_w_retry(
"microsoft/Phi-3-mini-4k-instruct",
"microsoft/Phi-3-medium-128k-instruct",
repo_type="model",
allow_patterns=["*token*", "config.json"],
)

View File

@@ -216,5 +216,197 @@ class TestValidateQuantPatchRestore(unittest.TestCase):
self.assertIs(_trainer_module.validate_quantization_for_training, original)
class TestVllmLoraSyncPatch(unittest.TestCase):
"""The ``_generate_single_turn`` patch wires sync_weights to the right place.
These tests exercise the patch-installation branch in isolation. They build
a stub trainer with just enough attributes to look like
``AsyncGRPOTrainer`` for the duration of the relevant code path.
Background — there are two correct behaviors and we historically had a bug
where both modes used the same one:
- Async prefetch ON: the BG generation thread can't safely call
sync_weights mid-rollout. We no-op the stock hook and drive sync from
the main thread via ``_maybe_sync_vllm_weights``.
- Async prefetch OFF: TRL's stock ``_generate_single_turn`` already
calls ``sync_weights`` once per step boundary on the main thread. We
wire that hook directly to ``_sync_lora_adapter`` because
``_maybe_sync_vllm_weights`` short-circuits when async is off.
Before the fix, both modes installed ``lambda: None``, so sync mode never
pushed any LoRA adapter to vLLM and the trainer was a no-op.
"""
@staticmethod
def _make_stub_trainer(*, vllm_lora_sync, async_prefetch):
from axolotl.core.trainers.grpo.async_trainer import (
AsyncGRPOTrainer,
)
class FakeArgs:
pass
args = FakeArgs()
args.vllm_lora_sync = vllm_lora_sync
args.async_prefetch = async_prefetch
class FakeVllmGen:
sync_weights = staticmethod(lambda: None)
model = MagicMock()
# Use object.__new__ so we don't run __init__ (which needs a real
# model, dataset, etc.). We only need the `_generate_single_turn`
# method's patch branch to run, so we set up the minimum state.
trainer = object.__new__(AsyncGRPOTrainer)
trainer.args = args
trainer.use_vllm = True
trainer.vllm_generation = FakeVllmGen()
trainer._patched_sync_weights = False
# Spy on _sync_lora_adapter so we can assert it's the function the
# hook delegates to in sync mode.
trainer._sync_lora_adapter = MagicMock(name="_sync_lora_adapter_spy")
trainer._sync_peft_weights_no_merge = MagicMock(
name="_sync_peft_weights_no_merge_spy"
)
return trainer
@staticmethod
def _run_patch_branch(trainer):
"""Execute just the sync_weights-patching branch in isolation.
We can't easily call the real ``_generate_single_turn`` because it
does a full vLLM generate. Instead we copy the exact branch out of
the source so the test verifies the same logic the trainer runs.
"""
if not getattr(trainer, "_patched_sync_weights", False):
if trainer.use_vllm and hasattr(trainer, "vllm_generation"):
if getattr(trainer.args, "vllm_lora_sync", False):
if getattr(trainer.args, "async_prefetch", False):
trainer.vllm_generation.sync_weights = lambda: None
else:
sync_helper = trainer._sync_lora_adapter
def _lora_filesystem_sync():
sync_helper()
trainer.vllm_generation.sync_weights = _lora_filesystem_sync
trainer._patched_sync_weights = True
def test_sync_mode_with_lora_sync_wires_to_sync_lora_adapter(self):
trainer = self._make_stub_trainer(vllm_lora_sync=True, async_prefetch=False)
self._run_patch_branch(trainer)
assert trainer._patched_sync_weights is True
# Trigger the patched hook — it must call _sync_lora_adapter.
trainer.vllm_generation.sync_weights()
trainer._sync_lora_adapter.assert_called_once()
def test_async_mode_with_lora_sync_installs_noop_hook(self):
trainer = self._make_stub_trainer(vllm_lora_sync=True, async_prefetch=True)
self._run_patch_branch(trainer)
assert trainer._patched_sync_weights is True
# Hook must be a no-op so BG-thread generation doesn't fight the
# main-thread optimizer step over the model weights.
trainer.vllm_generation.sync_weights()
trainer._sync_lora_adapter.assert_not_called()
def test_sync_mode_with_lora_sync_does_not_call_during_install(self):
"""Installing the patch should not pre-emptively sync."""
trainer = self._make_stub_trainer(vllm_lora_sync=True, async_prefetch=False)
self._run_patch_branch(trainer)
# _sync_lora_adapter should only be called when the patched hook
# itself is invoked (e.g., from TRL's _generate_single_turn).
trainer._sync_lora_adapter.assert_not_called()
def test_patch_is_idempotent(self):
trainer = self._make_stub_trainer(vllm_lora_sync=True, async_prefetch=False)
self._run_patch_branch(trainer)
first_hook = trainer.vllm_generation.sync_weights
# Second call must not re-patch (otherwise we'd lose the original).
self._run_patch_branch(trainer)
assert trainer.vllm_generation.sync_weights is first_hook
class TestMaybeSyncVllmWeightsIntervalDefault(unittest.TestCase):
"""``_maybe_sync_vllm_weights`` must not crash when interval is unset.
Before the fix, ``step % self.args.vllm_sync_interval`` would TypeError
on the very first call when ``vllm_sync_interval`` was ``None`` (which
is the default for any config that doesn't explicitly set it). We now
fall back to interval=1 so unset means "sync every step", matching the
behavior of TRL's own ``_generate_single_turn``.
"""
@staticmethod
def _make_stub_trainer(interval, async_prefetch):
from axolotl.core.trainers.grpo.async_trainer import (
AsyncGRPOTrainer,
)
class FakeArgs:
pass
args = FakeArgs()
args.async_prefetch = async_prefetch
args.vllm_sync_interval = interval
args.vllm_lora_sync = True
class FakeState:
global_step = 1
trainer = object.__new__(AsyncGRPOTrainer)
trainer.args = args
trainer.use_vllm = True
trainer.state = FakeState()
trainer._last_synced_step = 0
trainer._sync_lora_adapter = MagicMock(name="sync_spy")
return trainer
def test_interval_none_in_async_mode_does_not_crash(self):
trainer = self._make_stub_trainer(interval=None, async_prefetch=True)
from axolotl.core.trainers.grpo.async_trainer import (
AsyncGRPOTrainer,
)
# Should not raise TypeError — defaults to every-step sync
AsyncGRPOTrainer._maybe_sync_vllm_weights(trainer)
trainer._sync_lora_adapter.assert_called_once()
def test_sync_mode_drives_sync(self):
"""Sync mode must fire ``_sync_lora_adapter`` from ``_maybe_sync_vllm_weights``.
The previous behavior (early return when ``not async_prefetch``)
assumed TRL's stock ``_generate_single_turn`` would handle sync.
That's true for vanilla GRPO but FALSE for NeMo Gym multi-turn
where the data producer bypasses ``_generate_single_turn``
entirely. Without this trigger no sync ever happens and the
trainer becomes a no-op.
"""
trainer = self._make_stub_trainer(interval=1, async_prefetch=False)
from axolotl.core.trainers.grpo.async_trainer import (
AsyncGRPOTrainer,
)
AsyncGRPOTrainer._maybe_sync_vllm_weights(trainer)
trainer._sync_lora_adapter.assert_called_once()
def test_async_mode_with_explicit_interval_respects_modulo(self):
trainer = self._make_stub_trainer(interval=4, async_prefetch=True)
from axolotl.core.trainers.grpo.async_trainer import (
AsyncGRPOTrainer,
)
# global_step=1, interval=4 → 1 % 4 != 0 → no sync
AsyncGRPOTrainer._maybe_sync_vllm_weights(trainer)
trainer._sync_lora_adapter.assert_not_called()
# global_step=4 → 4 % 4 == 0 → sync
trainer.state.global_step = 4
AsyncGRPOTrainer._maybe_sync_vllm_weights(trainer)
trainer._sync_lora_adapter.assert_called_once()
if __name__ == "__main__":
unittest.main()

View File

@@ -54,9 +54,7 @@ except (ImportError, ModuleNotFoundError):
)
def peft_lora_to_scattermoe(peft_A, peft_B, num_experts, rank):
smoe_A = peft_A
smoe_B = peft_lora_B_to_scattermoe(peft_B, num_experts, rank)
return smoe_A, smoe_B
return peft_A, peft_lora_B_to_scattermoe(peft_B, num_experts, rank)
def _unwrap_experts_lora(experts_module):
return experts_module, None, None
@@ -129,11 +127,7 @@ def scattermoe_lora_B_to_peft(smoe_B, num_experts, rank):
def peft_gate_up_lora_to_scattermoe(peft_A, peft_B, num_experts, rank):
"""Convert peft LoRA for gate_up_proj to scattermoe layout.
Both gate_up_proj and down_proj need the A<->B swap because
scattermoe transposes the parameter (W = param.T).
"""
"""Convert peft LoRA for gate_up_proj to scattermoe layout."""
return peft_lora_to_scattermoe(peft_A, peft_B, num_experts, rank)
@@ -306,8 +300,6 @@ class TestLoRABLayoutConversion:
hidden, inter = 32, 16
scaling = 2.0
# peft >=0.19.1 for down_proj [E, hidden, inter]:
# swaps in/out, lora_A [r*E, inter], lora_B [hidden, r*E]
peft_A = torch.randn(E * r, inter)
peft_B = torch.randn(hidden, E * r)
@@ -316,6 +308,8 @@ class TestLoRABLayoutConversion:
delta_peft = torch.einsum("o r e, e r i -> e o i", B_r, A_r) * scaling
smoe_A, smoe_B = peft_lora_to_scattermoe(peft_A, peft_B, E, r)
assert smoe_A.shape == (E * r, inter)
assert smoe_B.shape == (hidden, E * r)
for e in range(E):
A_e = smoe_A[e * r : (e + 1) * r, :]
B_e = smoe_B[:, e * r : (e + 1) * r]
@@ -325,22 +319,30 @@ class TestLoRABLayoutConversion:
)
def test_gate_up_proj_conversion(self):
"""Verify gate_up_proj LoRA conversion with non-square dims.
"""Verify gate_up_proj LoRA conversion with non-square dims (Qwen3-like).
gate_up_proj param: [E, 2*inter, hidden].
peft swaps in/out for 3D: lora_A [r*E, hidden], lora_B [2*inter, r*E].
peft: in_features=hidden, out_features=2*inter.
peft lora_A: [r*E, hidden], lora_B: [2*inter, r*E].
scattermoe W = param.T = [E, hidden, 2*inter], K=hidden, N=2*inter.
scattermoe needs: lora_A [r*E, K=hidden], lora_B [N=2*inter, r*E].
Uses non-square dims (hidden=32 != 2*inter=24) to catch layout bugs.
"""
E, r = 4, 2
hidden, inter = 32, 12 # 2*inter=24 != hidden=32
scaling = 2.0
peft_A = torch.randn(E * r, hidden) # [r*E, in=hidden]
peft_B = torch.randn(2 * inter, E * r) # [out=2*inter, r*E]
# peft assigns: in_features=hidden, out_features=2*inter
peft_A = torch.randn(E * r, hidden) # [r*E, in_features=hidden]
peft_B = torch.randn(2 * inter, E * r) # [out_features=2*inter, r*E]
A_r = peft_A.reshape(E, r, hidden)
B_r = peft_B.reshape(2 * inter, r, E)
delta_peft = torch.einsum("o r e, e r i -> e o i", B_r, A_r) * scaling
# delta_peft[e] has shape [out_features, in_features] = [2*inter, hidden]
# = param[e] shape [2*inter, hidden]
smoe_A, smoe_B = peft_gate_up_lora_to_scattermoe(peft_A, peft_B, E, r)
# smoe_A should be [r*E, K=hidden], smoe_B should be [N=2*inter, r*E]
@@ -398,7 +400,8 @@ class TestPeftLoRAWeightExtraction:
r,
)
# gate_up_proj [E, 2*inter, hidden] — peft swaps in/out for 3D
# gate_up_proj [E, 2*inter, hidden]
# peft: in_features=hidden (last dim), out_features=2*inter (middle dim)
assert trainable[
"base_model.model.moe.experts.base_layer.lora_A.default.weight"
].shape == (E * r, config.hidden_size)
@@ -406,7 +409,8 @@ class TestPeftLoRAWeightExtraction:
"base_model.model.moe.experts.base_layer.lora_B.default.weight"
].shape == (2 * config.intermediate_size, E * r)
# down_proj [E, hidden, inter] — peft swaps in/out for 3D
# down_proj [E, hidden, inter]
# peft: in_features=inter (last dim), out_features=hidden (middle dim)
assert trainable[
"base_model.model.moe.experts.lora_A.default.weight"
].shape == (E * r, config.intermediate_size)
@@ -463,26 +467,29 @@ class TestPeftLoRAWeightExtraction:
assert gup_lora is not None, "gate_up_proj LoRA not detected"
assert down_lora is not None, "down_proj LoRA not detected"
# gate_up_proj: K=hidden, N=2*inter
# Check shapes after peft->scattermoe conversion.
# gate_up_proj: peft A [E*r, hidden] / B [2*inter, E*r]
# scattermoe: smoe_A [E*r, hidden], smoe_B [2*inter, E*r]
E, r = config.num_experts, 4
gup_A, gup_B, gup_s = gup_lora
assert gup_A.shape == (E * r, config.hidden_size), (
f"gate_up_proj smoe_A: expected [r*E, K=hidden]={(E * r, config.hidden_size)}, "
f"gate_up_proj smoe_A: expected [r*E, hidden]={(E * r, config.hidden_size)}, "
f"got {gup_A.shape}"
)
assert gup_B.shape == (2 * config.intermediate_size, E * r), (
f"gate_up_proj smoe_B: expected [N=2*inter, r*E]="
f"gate_up_proj smoe_B: expected [2*inter, r*E]="
f"{(2 * config.intermediate_size, E * r)}, got {gup_B.shape}"
)
# down_proj: K=inter, N=hidden
# down_proj: peft A [E*r, inter] / B [hidden, E*r]
# scattermoe: smoe_A [E*r, inter], smoe_B [hidden, E*r]
down_A, down_B, down_s = down_lora
assert down_A.shape == (E * r, config.intermediate_size), (
f"down_proj smoe_A: expected [r*E, K=inter]={(E * r, config.intermediate_size)}, "
f"down_proj smoe_A: expected [r*E, inter]={(E * r, config.intermediate_size)}, "
f"got {down_A.shape}"
)
assert down_B.shape == (config.hidden_size, E * r), (
f"down_proj smoe_B: expected [N=hidden, r*E]={(config.hidden_size, E * r)}, "
f"down_proj smoe_B: expected [hidden, r*E]={(config.hidden_size, E * r)}, "
f"got {down_B.shape}"
)

View File

@@ -0,0 +1,21 @@
"""Test module for checking whether the integration of Unsloth with Hugging Face Transformers is working as expected."""
import unittest
import pytest
@pytest.mark.skip(
reason="Unsloth integration will be broken going into latest transformers"
)
class TestUnslothIntegration(unittest.TestCase):
"""Unsloth monkeypatch integration tests."""
def test_is_self_attn_patchable(self):
from axolotl.monkeypatch.unsloth_ import check_self_attn_is_patchable
# ensures the current version of transformers has self-attention code that matches our patching code
self.assertTrue(
check_self_attn_is_patchable(),
"HF transformers self attention code has changed and isn't patchable",
)

View File

@@ -0,0 +1,184 @@
"""
e2e tests for unsloth qlora
"""
import pytest
from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault
from ..utils import check_model_output_exists, check_tensorboard
@pytest.mark.skip(
reason="Unsloth integration will be broken going into latest transformers"
)
class TestUnslothQLoRA:
"""
Test class for Unsloth QLoRA Llama models
"""
@pytest.mark.parametrize(
"sample_packing",
[True, False],
)
def test_unsloth_llama_qlora_fa2(self, temp_dir, sample_packing):
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
"sequence_len": 1024,
"sample_packing": sample_packing,
"flash_attention": True,
"unsloth_lora_mlp": True,
"unsloth_lora_qkv": True,
"unsloth_lora_o": True,
"load_in_4bit": True,
"adapter": "qlora",
"lora_r": 16,
"lora_alpha": 16,
"lora_dropout": 0.05,
"lora_target_linear": True,
"val_set_size": 0.05,
"special_tokens": {
"pad_token": "<|endoftext|>",
},
"datasets": [
{
"path": "mhenrichsen/alpaca_2k_test",
"type": "alpaca",
},
],
"num_epochs": 1,
"max_steps": 5,
"save_steps": 10,
"micro_batch_size": 4,
"gradient_accumulation_steps": 2,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_8bit",
"lr_scheduler": "cosine",
"use_tensorboard": True,
"bf16": "auto",
"save_first_step": False,
}
)
cfg = validate_config(cfg)
normalize_config(cfg)
dataset_meta = load_datasets(cfg=cfg)
train(cfg=cfg, dataset_meta=dataset_meta)
check_model_output_exists(temp_dir, cfg)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
)
def test_unsloth_llama_qlora_unpacked(self, temp_dir):
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
"sequence_len": 1024,
"unsloth_lora_mlp": True,
"unsloth_lora_qkv": True,
"unsloth_lora_o": True,
"sample_packing": False,
"load_in_4bit": True,
"adapter": "qlora",
"lora_r": 16,
"lora_alpha": 16,
"lora_dropout": 0.05,
"lora_target_linear": True,
"val_set_size": 0.05,
"special_tokens": {
"pad_token": "<|endoftext|>",
},
"datasets": [
{
"path": "mhenrichsen/alpaca_2k_test",
"type": "alpaca",
},
],
"num_epochs": 1,
"max_steps": 5,
"save_steps": 10,
"micro_batch_size": 4,
"gradient_accumulation_steps": 2,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_8bit",
"lr_scheduler": "cosine",
"use_tensorboard": True,
"bf16": "auto",
"save_first_step": False,
}
)
cfg = validate_config(cfg)
normalize_config(cfg)
dataset_meta = load_datasets(cfg=cfg)
train(cfg=cfg, dataset_meta=dataset_meta)
check_model_output_exists(temp_dir, cfg)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
)
@pytest.mark.parametrize(
"sdp_attention",
[True, False],
)
def test_unsloth_llama_qlora_unpacked_no_fa2_fp16(self, temp_dir, sdp_attention):
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
"sequence_len": 1024,
"unsloth_lora_mlp": True,
"unsloth_lora_qkv": True,
"unsloth_lora_o": True,
"sample_packing": False,
"load_in_4bit": True,
"adapter": "qlora",
"lora_r": 16,
"lora_alpha": 16,
"lora_dropout": 0.05,
"lora_target_linear": True,
"val_set_size": 0.05,
"special_tokens": {
"pad_token": "<|endoftext|>",
},
"datasets": [
{
"path": "mhenrichsen/alpaca_2k_test",
"type": "alpaca",
},
],
"num_epochs": 1,
"max_steps": 5,
"save_steps": 10,
"micro_batch_size": 4,
"gradient_accumulation_steps": 2,
"sdp_attention": sdp_attention,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_8bit",
"lr_scheduler": "cosine",
"use_tensorboard": True,
"fp16": True,
"save_first_step": False,
}
)
cfg = validate_config(cfg)
normalize_config(cfg)
dataset_meta = load_datasets(cfg=cfg)
train(cfg=cfg, dataset_meta=dataset_meta)
check_model_output_exists(temp_dir, cfg)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
)

View File

@@ -361,6 +361,329 @@ class TestPluginDefaults(unittest.TestCase):
assert cfg.dataloader_num_workers == 0
class TestSelectWeightSyncTransport(unittest.TestCase):
"""Pure-logic table tests for ``select_weight_sync_transport``."""
def _caps(self, **kwargs):
from axolotl.integrations.nemo_gym.plugin import VLLMWeightSyncCapabilities
c = VLLMWeightSyncCapabilities(probed=True)
for k, v in kwargs.items():
setattr(c, k, v)
return c
def test_lora_with_native_endpoint(self):
from axolotl.integrations.nemo_gym.plugin import select_weight_sync_transport
caps = self._caps(lora_filesystem=True)
assert (
select_weight_sync_transport(caps, has_lora=True, vllm_lora_sync_pref=True)
== "lora_filesystem"
)
def test_lora_with_axolotl_endpoint(self):
from axolotl.integrations.nemo_gym.plugin import select_weight_sync_transport
caps = self._caps(lora_axolotl=True)
assert (
select_weight_sync_transport(caps, has_lora=True, vllm_lora_sync_pref=False)
== "lora_filesystem"
)
def test_lora_falls_back_to_nccl_when_no_lora_endpoint(self):
from axolotl.integrations.nemo_gym.plugin import select_weight_sync_transport
caps = self._caps(nccl=True)
assert (
select_weight_sync_transport(caps, has_lora=True, vllm_lora_sync_pref=False)
== "nccl"
)
def test_full_param_prefers_nccl(self):
from axolotl.integrations.nemo_gym.plugin import select_weight_sync_transport
caps = self._caps(nccl=True, http_full=True)
assert (
select_weight_sync_transport(
caps, has_lora=False, vllm_lora_sync_pref=False
)
== "nccl"
)
def test_full_param_falls_back_to_http(self):
from axolotl.integrations.nemo_gym.plugin import select_weight_sync_transport
caps = self._caps(http_full=True)
assert (
select_weight_sync_transport(
caps, has_lora=False, vllm_lora_sync_pref=False
)
== "http_full"
)
def test_full_param_no_routes_returns_none(self):
from axolotl.integrations.nemo_gym.plugin import select_weight_sync_transport
caps = self._caps() # all False
assert (
select_weight_sync_transport(
caps, has_lora=False, vllm_lora_sync_pref=False
)
== "none"
)
def test_lora_no_routes_returns_none(self):
from axolotl.integrations.nemo_gym.plugin import select_weight_sync_transport
caps = self._caps()
assert (
select_weight_sync_transport(caps, has_lora=True, vllm_lora_sync_pref=True)
== "none"
)
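For reference, the selection behavior these table tests pin down reduces to roughly the sketch below. The field and argument names come from the tests themselves; the body is inferred from the assertions, not copied from the plugin source.
def _select_weight_sync_transport_sketch(caps, has_lora, vllm_lora_sync_pref):
    # LoRA runs: either LoRA-capable endpoint maps to the filesystem
    # transport (the preference flag does not change the outcome in any
    # case exercised above); NCCL is the fallback.
    if has_lora:
        if caps.lora_filesystem or caps.lora_axolotl:
            return "lora_filesystem"
        if caps.nccl:
            return "nccl"
        return "none"
    # Full-parameter runs: prefer NCCL, fall back to HTTP, else no transport.
    if caps.nccl:
        return "nccl"
    if caps.http_full:
        return "http_full"
    return "none"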
class TestProbeVllmWeightSync(unittest.TestCase):
"""``probe_vllm_weight_sync`` reads a vLLM ``/openapi.json`` and reports caps."""
def test_stock_vllm_with_lora_enabled(self):
"""Stock ``vllm serve --enable-lora`` exposes only LoRA endpoints."""
from unittest.mock import patch
from axolotl.integrations.nemo_gym.plugin import probe_vllm_weight_sync
spec = {
"paths": {
"/v1/models": {"get": {}},
"/v1/load_lora_adapter": {"post": {}},
"/v1/unload_lora_adapter": {"post": {}},
"/v1/completions": {"post": {}},
}
}
with patch("requests.get") as mock_get:
mock_get.return_value.raise_for_status = lambda: None
mock_get.return_value.json = lambda: spec
caps = probe_vllm_weight_sync("http://localhost:8000")
assert caps.probed is True
assert caps.lora_filesystem is True
assert caps.lora_axolotl is False
assert caps.nccl is False
assert caps.http_full is False
def test_axolotl_serve_lora_full_capabilities(self):
"""``axolotl vllm-serve`` exposes NCCL + LoRA + HTTP full sync."""
from unittest.mock import patch
from axolotl.integrations.nemo_gym.plugin import probe_vllm_weight_sync
spec = {
"paths": {
"/init_communicator/": {"post": {}},
"/update_named_param/": {"post": {}},
"/batch_update_named_params/": {"post": {}},
"/set_lora_adapter/": {"post": {}},
"/clear_lora_adapter/": {"post": {}},
"/http_update_weights/": {"post": {}},
"/v1/load_lora_adapter": {"post": {}},
}
}
with patch("requests.get") as mock_get:
mock_get.return_value.raise_for_status = lambda: None
mock_get.return_value.json = lambda: spec
caps = probe_vllm_weight_sync("http://localhost:8000")
assert caps.probed is True
assert caps.nccl is True
assert caps.lora_axolotl is True
assert caps.lora_filesystem is True
assert caps.http_full is True
def test_trl_vllm_serve_nccl_only(self):
"""``trl vllm-serve`` exposes NCCL routes but not LoRA filesystem."""
from unittest.mock import patch
from axolotl.integrations.nemo_gym.plugin import probe_vllm_weight_sync
spec = {
"paths": {
"/init_communicator/": {"post": {}},
"/update_named_param/": {"post": {}},
"/batch_update_named_params/": {"post": {}},
"/close_communicator/": {"post": {}},
"/generate/": {"post": {}},
}
}
with patch("requests.get") as mock_get:
mock_get.return_value.raise_for_status = lambda: None
mock_get.return_value.json = lambda: spec
caps = probe_vllm_weight_sync("http://localhost:8000")
assert caps.probed is True
assert caps.nccl is True
assert caps.lora_filesystem is False
assert caps.lora_axolotl is False
assert caps.http_full is False
def test_unreachable_server_records_error(self):
from unittest.mock import patch
from axolotl.integrations.nemo_gym.plugin import probe_vllm_weight_sync
with patch("requests.get") as mock_get:
mock_get.side_effect = ConnectionError("Connection refused")
caps = probe_vllm_weight_sync("http://localhost:9999")
assert caps.probed is False
assert caps.probe_error is not None
assert "ConnectionError" in caps.probe_error
assert caps.nccl is False
assert caps.lora_filesystem is False
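Based on the mocked OpenAPI specs above, the probe presumably amounts to a single GET of ``/openapi.json`` plus a route lookup. A sketch follows, with the route-to-flag mapping inferred from these tests (the timeout value is an assumption):
import requests
from axolotl.integrations.nemo_gym.plugin import VLLMWeightSyncCapabilities
def _probe_vllm_weight_sync_sketch(base_url):
    caps = VLLMWeightSyncCapabilities()
    try:
        resp = requests.get(f"{base_url}/openapi.json", timeout=10)
        resp.raise_for_status()
        paths = resp.json().get("paths", {})
    except Exception as exc:  # unreachable server, malformed JSON, ...
        caps.probed = False
        caps.probe_error = f"{type(exc).__name__}: {exc}"
        return caps
    caps.probed = True
    caps.lora_filesystem = "/v1/load_lora_adapter" in paths
    caps.lora_axolotl = "/set_lora_adapter/" in paths
    caps.nccl = "/init_communicator/" in paths and "/update_named_param/" in paths
    caps.http_full = "/http_update_weights/" in paths
    return caps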
class TestPluginWeightSyncEnforcement(unittest.TestCase):
"""End-to-end test of post_trainer_create's transport-selection branch.
The plugin used to silently no-op weight sync when ``vllm_lora_sync: false``,
leaving the trainer learning in isolation while vLLM kept serving the
unmodified base model. After the fix:
- LoRA + LoRA-loading endpoint → installs filesystem LoRA sync
- LoRA + only NCCL endpoint → uses NCCL broadcast
- Full FT + NCCL endpoint → uses NCCL broadcast (standard TRL flow)
- Full FT + HTTP endpoint → raises NotImplementedError (step 3)
- No usable transport → raises ValueError with a precise diagnosis
"""
@staticmethod
def _fake_cfg(adapter, vllm_lora_sync):
class FakeTRL:
pass
class FakeCfg:
pass
trl = FakeTRL()
trl.vllm_lora_sync = vllm_lora_sync
trl.vllm_server_host = "127.0.0.1"
trl.vllm_server_port = 8000
cfg = FakeCfg()
cfg.nemo_gym_enabled = True
cfg.nemo_gym_model_name = None
cfg.base_model = "test/model"
cfg.nemo_gym_verify_timeout = 30
cfg.nemo_gym_multi_turn = True
cfg.adapter = adapter
cfg.trl = trl
return cfg
@staticmethod
def _fake_trainer():
class FakeVLLMGen:
sync_weights = staticmethod(lambda: None)
class FakeTrainer:
vllm_generation = FakeVLLMGen()
return FakeTrainer()
@staticmethod
def _caps(**kwargs):
from axolotl.integrations.nemo_gym.plugin import VLLMWeightSyncCapabilities
c = VLLMWeightSyncCapabilities(probed=True)
for k, v in kwargs.items():
setattr(c, k, v)
return c
def test_lora_with_lora_endpoint_installs_filesystem_sync(self):
from unittest.mock import patch
from axolotl.integrations.nemo_gym.plugin import NemoGymPlugin
plugin = NemoGymPlugin()
plugin._vllm_caps = self._caps(lora_filesystem=True)
cfg = self._fake_cfg(adapter="lora", vllm_lora_sync=True)
trainer = self._fake_trainer()
with (
patch.object(plugin, "_setup_lora_sync") as setup,
patch.object(plugin, "_check_lora_endpoint") as check,
patch.object(plugin, "_wire_multi_turn") as wire,
):
plugin.post_trainer_create(cfg, trainer)
setup.assert_called_once()
check.assert_called_once()
wire.assert_called_once()
def test_lora_with_no_routes_raises_with_lora_specific_message(self):
from axolotl.integrations.nemo_gym.plugin import NemoGymPlugin
plugin = NemoGymPlugin()
plugin._vllm_caps = self._caps() # all False, but probed
cfg = self._fake_cfg(adapter="lora", vllm_lora_sync=False)
trainer = self._fake_trainer()
with self.assertRaises(ValueError) as ctx:
plugin.post_trainer_create(cfg, trainer)
msg = str(ctx.exception)
assert "no-op trainer" in msg
assert "load_lora_adapter" in msg
assert "VLLM_ALLOW_RUNTIME_LORA_UPDATING" in msg
def test_full_finetune_with_nccl_endpoint_uses_nccl(self):
from unittest.mock import patch
from axolotl.integrations.nemo_gym.plugin import NemoGymPlugin
plugin = NemoGymPlugin()
plugin._vllm_caps = self._caps(nccl=True)
cfg = self._fake_cfg(adapter=None, vllm_lora_sync=False)
trainer = self._fake_trainer()
with patch.object(plugin, "_wire_multi_turn") as wire:
plugin.post_trainer_create(cfg, trainer)
wire.assert_called_once()
def test_full_finetune_with_http_endpoint_not_implemented_yet(self):
from axolotl.integrations.nemo_gym.plugin import NemoGymPlugin
plugin = NemoGymPlugin()
plugin._vllm_caps = self._caps(http_full=True)
cfg = self._fake_cfg(adapter=None, vllm_lora_sync=False)
trainer = self._fake_trainer()
with self.assertRaises(NotImplementedError) as ctx:
plugin.post_trainer_create(cfg, trainer)
assert "HTTP weight sync" in str(ctx.exception)
def test_full_finetune_with_no_routes_raises_with_full_param_message(self):
from axolotl.integrations.nemo_gym.plugin import NemoGymPlugin
plugin = NemoGymPlugin()
plugin._vllm_caps = self._caps()
cfg = self._fake_cfg(adapter=None, vllm_lora_sync=False)
trainer = self._fake_trainer()
with self.assertRaises(ValueError) as ctx:
plugin.post_trainer_create(cfg, trainer)
msg = str(ctx.exception)
assert "no-op trainer" in msg
assert "init_communicator" in msg
assert "http_update_weights" in msg
def test_unprobed_caps_raises_with_probe_failure_message(self):
from axolotl.integrations.nemo_gym.plugin import NemoGymPlugin
plugin = NemoGymPlugin()
# Plugin._vllm_caps left as default-None: the post_trainer_create
# branch falls back to a fresh VLLMWeightSyncCapabilities() with
# probed=False, so the error path should mention probing.
cfg = self._fake_cfg(adapter="lora", vllm_lora_sync=True)
trainer = self._fake_trainer()
with self.assertRaises(ValueError) as ctx:
plugin.post_trainer_create(cfg, trainer)
assert "could not probe" in str(ctx.exception)
class TestNemoGymE2E(unittest.TestCase):
"""End-to-end test: data producer → agent (mocked) → parse → tensors → rewards.
@@ -452,19 +775,15 @@ class TestNemoGymE2E(unittest.TestCase):
trainer = self._make_mock_trainer()
producer._trainer = trainer
# Mock the prompt iterator (returns a batch of 1 input)
producer._prompt_iter = iter(
[
[
{
"prompt": [{"role": "user", "content": "Play Wordle!"}],
}
]
]
)
producer._prompt_dl = [
[{"prompt": [{"role": "user", "content": "Play Wordle!"}]}]
]
# Mock the prompt iterator. RepeatSampler(mini_repeat_count=num_generations)
# pre-expands prompts, so the iterator yields num_generations=2 consecutive
# copies of each unique prompt — one entry per rollout.
_prompt_batch = [
{"prompt": [{"role": "user", "content": "Play Wordle!"}]},
{"prompt": [{"role": "user", "content": "Play Wordle!"}]},
]
producer._prompt_iter = iter([_prompt_batch])
producer._prompt_dl = [_prompt_batch]
# Call produce
result = producer.produce(model=MagicMock(), global_step=1)
@@ -530,10 +849,13 @@ class TestNemoGymE2E(unittest.TestCase):
producer._request_timeout = 30
producer._num_generations = 2
producer._trainer = self._make_mock_trainer()
producer._prompt_iter = iter(
[[{"prompt": [{"role": "user", "content": "Play!"}]}]]
)
producer._prompt_dl = [[{"prompt": [{"role": "user", "content": "Play!"}]}]]
# RepeatSampler pre-expands by num_generations=2.
_prompt_batch = [
{"prompt": [{"role": "user", "content": "Play!"}]},
{"prompt": [{"role": "user", "content": "Play!"}]},
]
producer._prompt_iter = iter([_prompt_batch])
producer._prompt_dl = [_prompt_batch]
result = producer.produce(model=MagicMock(), global_step=1)

View File

@@ -21,6 +21,51 @@ from unittest.mock import patch
import pytest
import torch
class TestPeftScatterMoELoRALayout:
"""CPU-only guards for PEFT target_parameters layout conversion."""
def test_peft_layout_keeps_a_and_reorders_b(self):
from axolotl.integrations.kernels.libs.scattermoe_lora.lora_layout import (
peft_lora_to_scattermoe,
)
E, r, K, N = 3, 2, 5, 7
scaling = 2.0
peft_A = torch.randn(E * r, K)
peft_B = torch.randn(N, E * r)
smoe_A, smoe_B = peft_lora_to_scattermoe(peft_A, peft_B, E, r)
assert smoe_A is peft_A
assert smoe_A.shape == (E * r, K)
assert smoe_B.shape == (N, E * r)
A_r = peft_A.reshape(E, r, K)
B_r = peft_B.reshape(N, r, E)
delta_peft = torch.einsum("o r e, e r i -> e o i", B_r, A_r) * scaling
for e in range(E):
A_e = smoe_A[e * r : (e + 1) * r, :]
B_e = smoe_B[:, e * r : (e + 1) * r]
torch.testing.assert_close(scaling * (B_e @ A_e), delta_peft[e])
def test_swapped_layout_fails_before_kernel_dispatch(self):
from axolotl.integrations.kernels.libs.scattermoe_lora.lora_layout import (
validate_scattermoe_lora_shapes,
)
E, r, K, N = 3, 2, 5, 7
expert_weights = torch.empty(E, K, N)
with pytest.raises(ValueError, match="Invalid ScatterMoE LoRA layout"):
validate_scattermoe_lora_shapes(
expert_weights=expert_weights,
lora_A=torch.empty(E * r, N),
lora_B=torch.empty(K, E * r),
)
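The first test above fully determines the conversion: A can be reused as-is, while B's columns have to be regrouped from rank-major to expert-major. A sketch of that reordering (an inferred implementation, not the library source):
import torch
def _peft_lora_to_scattermoe_sketch(peft_A, peft_B, num_experts, r):
    # A is already expert-major over its rows ([E*r, K]); hand it back
    # untouched so the `smoe_A is peft_A` identity check holds.
    smoe_A = peft_A
    # peft_B's columns are laid out [N, r, E] (rank-major); ScatterMoE wants
    # [N, E, r] flattened to [N, E*r] so B[:, e*r:(e+1)*r] is expert e's block.
    n_out = peft_B.shape[0]
    smoe_B = (
        peft_B.reshape(n_out, r, num_experts)
        .permute(0, 2, 1)
        .reshape(n_out, num_experts * r)
    )
    return smoe_A, smoe_B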
# ============================================================================
# 1. KernelsArgs: disable_mlp_kernel validator
# ============================================================================

View File

@@ -38,6 +38,30 @@ def _reference_norm_noscale(x, eps):
return norm(x)
def _reference_partial_norm_rope(x, weight, cos, sin, eps):
"""Reference: Gemma4RMSNorm over the full head_dim, then stock
``apply_rotary_pos_emb`` over the first ``cos.shape[-1]`` columns, with
the trailing columns passed through unchanged. Mirrors how Llama-style
partial rotary is layered on top of the stock RMSNorm + RoPE primitives.
"""
from transformers.models.gemma4.modeling_gemma4 import (
Gemma4RMSNorm,
apply_rotary_pos_emb,
)
D = x.shape[-1]
n_rot = cos.shape[-1]
norm = Gemma4RMSNorm(D, eps=eps).to(x.device, x.dtype)
norm.weight.data.copy_(weight)
normed = norm(x)
if n_rot == D:
return apply_rotary_pos_emb(normed, cos, sin, unsqueeze_dim=2)
x_rot = normed[..., :n_rot]
x_pass = normed[..., n_rot:]
rotated = apply_rotary_pos_emb(x_rot, cos, sin, unsqueeze_dim=2)
return torch.cat([rotated, x_pass], dim=-1)
@pytest.fixture(
params=[
(2, 64, 32, 256), # sliding window layer shape
@@ -194,6 +218,172 @@ class TestFusedRMSNormRoPEBackward:
assert w.grad.abs().sum() > 0, "w.grad is all zeros"
class TestFusedRMSNormRoPEPartialRotary:
"""Partial-rotary: cos/sin last dim is smaller than head_dim.
Compares against the original primitives (`Gemma4RMSNorm` +
`apply_rotary_pos_emb`) applied to the rotated slice with the trailing
columns passed through. Without the kernel fix this used to crash with
`RuntimeError: shape '[..., D]' is invalid for input of size B*S*n_rot`.
"""
@pytest.mark.parametrize(
"B,S,H,D,n_rot",
[
(2, 16, 4, 64, 32), # half rotary (Llama-style 0.5)
(2, 16, 4, 64, 16), # quarter rotary
(2, 32, 8, 128, 64), # half rotary, larger heads
(1, 8, 2, 256, 64), # 26B sliding-shape, 0.25 partial
(1, 8, 2, 64, 64), # n_rot == D: must still match full-rotary path
],
ids=["half_64", "quarter_64", "half_128", "quarter_256", "full_64"],
)
def test_forward_matches_reference(self, B, S, H, D, n_rot):
from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
eps = 1e-6
x = torch.randn(B, S, H, D, device="cuda", dtype=torch.bfloat16)
weight = torch.randn(D, device="cuda", dtype=torch.bfloat16)
cos = torch.randn(B, S, n_rot, device="cuda", dtype=torch.bfloat16)
sin = torch.randn(B, S, n_rot, device="cuda", dtype=torch.bfloat16)
y_ref = _reference_partial_norm_rope(x.clone(), weight, cos, sin, eps)
y_fused = fused_rms_norm_rope(x.clone(), weight, cos, sin, eps=eps)
assert y_fused.shape == y_ref.shape == (B, S, H, D)
cos_sim = torch.nn.functional.cosine_similarity(
y_ref.flatten().float(), y_fused.flatten().float(), dim=0
)
assert cos_sim > 0.999, (
f"partial rotary forward cosine_sim={cos_sim:.6f} "
f"(B={B},S={S},H={H},D={D},n_rot={n_rot})"
)
# The pass-through tail must match the reference RMSNorm output within
# bf16 tolerance (any systematic deviation would mean the kernel is
# touching it with a spurious rotation, which is the original bug class).
torch.testing.assert_close(
y_fused[..., n_rot:], y_ref[..., n_rot:], rtol=1e-2, atol=1e-2
)
@pytest.mark.parametrize(
"B,S,H,D,n_rot",
[(2, 16, 4, 64, 32), (1, 8, 2, 256, 64)],
ids=["half_64", "quarter_256"],
)
def test_x_grad_matches_reference(self, B, S, H, D, n_rot):
from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
eps = 1e-6
cos = torch.randn(B, S, n_rot, device="cuda", dtype=torch.bfloat16)
sin = torch.randn(B, S, n_rot, device="cuda", dtype=torch.bfloat16)
weight_init = torch.randn(D, device="cuda", dtype=torch.bfloat16)
x_data = torch.randn(B, S, H, D, device="cuda", dtype=torch.bfloat16)
# Reference backward via the original primitives
x_ref = x_data.clone().requires_grad_(True)
w_ref = weight_init.clone()
y_ref = _reference_partial_norm_rope(x_ref, w_ref, cos, sin, eps)
y_ref.sum().backward()
# Fused backward
x_fused = x_data.clone().requires_grad_(True)
w_fused = weight_init.clone().requires_grad_(True)
y_fused = fused_rms_norm_rope(x_fused, w_fused, cos, sin, eps=eps)
y_fused.sum().backward()
cos_sim_x = torch.nn.functional.cosine_similarity(
x_fused.grad.flatten().float(), x_ref.grad.flatten().float(), dim=0
)
assert cos_sim_x > 0.999, f"partial rotary x grad cosine_sim={cos_sim_x:.6f}"
@pytest.mark.parametrize(
"B,S,H,D,n_rot",
[(2, 16, 4, 64, 32), (1, 8, 2, 256, 64)],
ids=["half_64", "quarter_256"],
)
def test_weight_grad_matches_reference(self, B, S, H, D, n_rot):
from transformers.models.gemma4.modeling_gemma4 import Gemma4RMSNorm
from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
eps = 1e-6
cos = torch.randn(B, S, n_rot, device="cuda", dtype=torch.bfloat16)
sin = torch.randn(B, S, n_rot, device="cuda", dtype=torch.bfloat16)
weight_init = torch.randn(D, device="cuda", dtype=torch.bfloat16)
x_data = torch.randn(B, S, H, D, device="cuda", dtype=torch.bfloat16)
# Reference: Gemma4RMSNorm whose .weight collects grads, then partial
# rotary applied to the rotated slice.
norm_ref = Gemma4RMSNorm(D, eps=eps).cuda().to(torch.bfloat16)
norm_ref.weight = torch.nn.Parameter(weight_init.clone())
normed = norm_ref(x_data)
from transformers.models.gemma4.modeling_gemma4 import apply_rotary_pos_emb
rotated = apply_rotary_pos_emb(normed[..., :n_rot], cos, sin, unsqueeze_dim=2)
y_ref = torch.cat([rotated, normed[..., n_rot:]], dim=-1)
y_ref.sum().backward()
w_fused = weight_init.clone().requires_grad_(True)
fused_rms_norm_rope(x_data.clone(), w_fused, cos, sin, eps=eps).sum().backward()
cos_sim_w = torch.nn.functional.cosine_similarity(
w_fused.grad.flatten().float(),
norm_ref.weight.grad.flatten().float(),
dim=0,
)
assert cos_sim_w > 0.995, (
f"partial rotary weight grad cosine_sim={cos_sim_w:.6f}"
)
def test_full_rotary_unchanged_when_n_rot_equals_d(self):
"""Regression: passing cos/sin with shape == head_dim must still
match the full-rotary reference (the partial-rotary code path must
not perturb the existing full-rotary output)."""
from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
B, S, H, D = 2, 16, 4, 64
eps = 1e-6
x = torch.randn(B, S, H, D, device="cuda", dtype=torch.bfloat16)
weight = torch.randn(D, device="cuda", dtype=torch.bfloat16)
cos = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
sin = torch.randn(B, S, D, device="cuda", dtype=torch.bfloat16)
y_ref = _reference_norm_rope(x.clone(), weight, cos, sin, eps)
y_fused = fused_rms_norm_rope(x.clone(), weight, cos, sin, eps=eps)
cos_sim = torch.nn.functional.cosine_similarity(
y_ref.flatten().float(), y_fused.flatten().float(), dim=0
)
assert cos_sim > 0.999, f"full-rotary regression cos_sim={cos_sim:.6f}"
def test_validation_errors(self):
"""Wrapper rejects misshaped inputs cleanly (instead of a cryptic
Triton crash deeper in the kernel)."""
from axolotl.kernels.gemma4_fused_rope import fused_rms_norm_rope
B, S, H, D = 1, 4, 2, 64
x = torch.randn(B, S, H, D, device="cuda", dtype=torch.bfloat16)
w = torch.randn(D, device="cuda", dtype=torch.bfloat16)
# n_rot > head_dim
cos_big = torch.randn(B, S, D + 16, device="cuda", dtype=torch.bfloat16)
sin_big = torch.randn(B, S, D + 16, device="cuda", dtype=torch.bfloat16)
with pytest.raises(ValueError, match="cannot exceed head_dim"):
fused_rms_norm_rope(x, w, cos_big, sin_big)
# cos/sin last-dim mismatch
cos = torch.randn(B, S, 32, device="cuda", dtype=torch.bfloat16)
sin = torch.randn(B, S, 16, device="cuda", dtype=torch.bfloat16)
with pytest.raises(ValueError, match="same last dim"):
fused_rms_norm_rope(x, w, cos, sin)
# odd rotary dim
cos_odd = torch.randn(B, S, 31, device="cuda", dtype=torch.bfloat16)
sin_odd = torch.randn(B, S, 31, device="cuda", dtype=torch.bfloat16)
with pytest.raises(ValueError, match="must be even"):
fused_rms_norm_rope(x, w, cos_odd, sin_odd)
class TestFusedRMSNormNoScale:
"""Tests for v_norm (RMSNorm without learnable scale)."""

View File

@@ -0,0 +1,219 @@
"""Tests for the Gemma 4 fused-attention monkey-patch.
These tests exercise the patched ``Gemma4TextAttention.forward`` against
the stock implementation it replaces. The hybrid Gemma 4 model intentionally
mixes a sliding (`head_dim=32`) layer with a full-attention proportional-rope
layer (`global_head_dim=64`, `partial_rotary_factor=0.25`) so that the
partial-rotary RMSNorm+RoPE path through the fused Triton kernel is
exercised end-to-end (this is the bug originally documented in
``GEMMA4_FUSED_ROPE_HYBRID_ATTN_BUG.md``).
The full-model forward also pins that the fused forward keeps accepting
whatever call shape ``Gemma4TextDecoderLayer.forward`` produces in the
installed transformers version — so any future signature drift on
upstream's side trips a clear failure here instead of a confusing
TypeError deep in a training run.
"""
import pytest
import torch
pytestmark = [
pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required"),
]
pytest.importorskip(
"transformers.models.gemma4",
reason="fused_attn patch only matters when Gemma 4 is available",
)
@pytest.fixture
def restore_gemma4_attention():
"""Snapshot ``Gemma4TextAttention.forward`` and restore after the test
so the monkey-patch does not leak across the suite."""
from transformers.models.gemma4.modeling_gemma4 import Gemma4TextAttention
saved = Gemma4TextAttention.forward
yield Gemma4TextAttention
Gemma4TextAttention.forward = saved
def _build_hybrid_config():
"""Tiny hybrid Gemma 4 config: one sliding layer + one full-attention
layer with proportional rope and partial_rotary_factor=0.25. This is
the same shape pattern as ``google/gemma-4-26B-A4B-it`` but small
enough to fit on any GPU."""
from transformers.models.gemma4.configuration_gemma4 import Gemma4TextConfig
cfg = Gemma4TextConfig(
vocab_size=128,
hidden_size=64,
intermediate_size=128,
num_hidden_layers=2,
num_attention_heads=2,
num_key_value_heads=2,
head_dim=32,
global_head_dim=64,
layer_types=["sliding_attention", "full_attention"],
sliding_window=64,
max_position_embeddings=2048,
hidden_size_per_layer_input=16,
vocab_size_per_layer_input=128,
rope_parameters={
"sliding_attention": {
"rope_type": "default",
"rope_theta": 10000.0,
},
"full_attention": {
"rope_type": "proportional",
"rope_theta": 1000000.0,
"partial_rotary_factor": 0.25,
},
},
)
cfg._attn_implementation = "sdpa"
return cfg
def _build_model(seed=0):
from transformers.models.gemma4.modeling_gemma4 import Gemma4TextModel
torch.manual_seed(seed)
cfg = _build_hybrid_config()
return Gemma4TextModel(cfg).cuda().to(torch.bfloat16).eval()
class TestFusedAttnSignature:
"""The fused forward must accept the same call shape as
``Gemma4TextDecoderLayer`` produces in the installed transformers
version. Any signature drift surfaces here as a TypeError."""
def test_decoder_layer_can_call_fused_forward(self, restore_gemma4_attention):
"""Run a model forward that exercises the real
``Gemma4TextDecoderLayer -> Gemma4TextAttention`` call path with
the fused patch installed."""
from axolotl.monkeypatch.models.gemma4.fused_attn import (
patch_gemma4_fused_attn,
)
m = _build_model()
ids = torch.randint(0, 128, (2, 16), device="cuda")
mask = torch.ones(2, 16, dtype=torch.long, device="cuda")
patch_gemma4_fused_attn()
with torch.no_grad():
out = m(input_ids=ids, attention_mask=mask).last_hidden_state
assert out.shape == (2, 16, 64)
assert torch.isfinite(out).all()
class TestFusedAttnPerLayerCorrectness:
"""Compare the patched attention layer to the stock implementation
on a single forward call. This isolates the fused kernel correctness
from cross-layer numerical drift."""
def _run_attention(self, model, layer_idx, hidden_states, position_ids):
"""Call ``Gemma4TextAttention.forward`` (whatever is currently
installed) for one layer and return the output."""
attn = model.layers[layer_idx].self_attn
layer_type = model.config.layer_types[layer_idx]
cos, sin = model.rotary_emb(hidden_states, position_ids, layer_type)
out, _ = attn(
hidden_states=hidden_states,
position_embeddings=(cos, sin),
attention_mask=None,
shared_kv_states={},
)
return out
@pytest.mark.parametrize(
"layer_idx",
[0, 1],
ids=["sliding_head32", "global_head64_proportional"],
)
def test_forward_matches_stock(self, restore_gemma4_attention, layer_idx):
from axolotl.monkeypatch.models.gemma4.fused_attn import (
patch_gemma4_fused_attn,
)
m = _build_model(seed=1)
hs = torch.randn(2, 16, 64, device="cuda", dtype=torch.bfloat16)
pos = torch.arange(16, device="cuda").unsqueeze(0).expand(2, -1)
with torch.no_grad():
ref = self._run_attention(m, layer_idx, hs, pos)
patch_gemma4_fused_attn()
with torch.no_grad():
got = self._run_attention(m, layer_idx, hs, pos)
assert got.shape == ref.shape
assert torch.isfinite(got).all()
cos_sim = torch.nn.functional.cosine_similarity(
ref.flatten().float(), got.flatten().float(), dim=0
)
assert cos_sim > 0.999, (
f"layer {layer_idx} fused vs stock cosine_sim={cos_sim:.6f}"
)
# bf16 precision: a few hundredths of absolute drift per element is
# acceptable for a Q/K/V projection pipeline. Anything larger is
# a real bug.
torch.testing.assert_close(got, ref, rtol=5e-2, atol=5e-2)
class TestFusedAttnFullModel:
"""End-to-end model forward + backward through both layer types."""
def test_full_forward_matches_stock(self, restore_gemma4_attention):
from axolotl.monkeypatch.models.gemma4.fused_attn import (
patch_gemma4_fused_attn,
)
m = _build_model(seed=2)
ids = torch.randint(0, 128, (2, 32), device="cuda")
mask = torch.ones(2, 32, dtype=torch.long, device="cuda")
with torch.no_grad():
ref = m(input_ids=ids, attention_mask=mask).last_hidden_state.clone()
patch_gemma4_fused_attn()
with torch.no_grad():
got = m(input_ids=ids, attention_mask=mask).last_hidden_state.clone()
assert got.shape == ref.shape
assert torch.isfinite(got).all()
cos_sim = torch.nn.functional.cosine_similarity(
ref.flatten().float(), got.flatten().float(), dim=0
)
# End-to-end through 2 layers (RMSNorm, attention, MLP/MoE) in bf16
# accumulates a small amount of numerical drift; we just want to
# pin that the two paths are computing the same function.
assert cos_sim > 0.999, f"end-to-end cosine_sim={cos_sim:.6f}"
def test_backward_grad_flows_through_fused_path(self, restore_gemma4_attention):
"""Gradients must propagate through the fused RMSNorm+RoPE kernels
for both the sliding and proportional-rope layers."""
from axolotl.monkeypatch.models.gemma4.fused_attn import (
patch_gemma4_fused_attn,
)
m = _build_model(seed=3).train()
patch_gemma4_fused_attn()
ids = torch.randint(0, 128, (2, 16), device="cuda")
mask = torch.ones(2, 16, dtype=torch.long, device="cuda")
out = m(input_ids=ids, attention_mask=mask).last_hidden_state
out.sum().backward()
# Both layers must accumulate gradients on q_norm.weight and
# k_norm.weight — that proves the fused kernel ran the backward.
for i, layer in enumerate(m.layers[:2]):
attn = layer.self_attn
assert attn.q_norm.weight.grad is not None, f"layer {i} q_norm no grad"
assert attn.k_norm.weight.grad is not None, f"layer {i} k_norm no grad"
assert attn.q_norm.weight.grad.isfinite().all()
assert attn.k_norm.weight.grad.isfinite().all()
assert attn.q_norm.weight.grad.abs().sum() > 0
assert attn.k_norm.weight.grad.abs().sum() > 0

View File

@@ -1,171 +0,0 @@
"""Unit tests for the Gemma4 fused-attention shared_kv_states routing patch."""
import pytest
gemma4_modeling = pytest.importorskip("transformers.models.gemma4.modeling_gemma4")
@pytest.fixture
def clean_decoder_layer_patch_slate():
"""Save and restore Gemma4TextDecoderLayer.__call__ and the sentinel."""
from axolotl.monkeypatch.models.gemma4 import fused_attn
cls = gemma4_modeling.Gemma4TextDecoderLayer
original_call = cls.__call__
had_sentinel = getattr(cls, "_axolotl_shared_kv_patched", False)
if had_sentinel:
del cls._axolotl_shared_kv_patched
try:
yield cls, fused_attn
finally:
cls.__call__ = original_call
if had_sentinel:
cls._axolotl_shared_kv_patched = True
elif hasattr(cls, "_axolotl_shared_kv_patched"):
del cls._axolotl_shared_kv_patched
fused_attn._set_shared_kv_states(None)
class TestPatchedDecoderLayerCall:
def test_pops_shared_kv_states_and_populates_store(
self, clean_decoder_layer_patch_slate
):
cls, fused_attn = clean_decoder_layer_patch_slate
captured = {}
def spy(self, *args, **kwargs):
captured["args"] = args
captured["kwargs"] = dict(kwargs)
return "spy_return"
cls.__call__ = spy
fused_attn._patch_decoder_layer_call()
assert getattr(cls, "_axolotl_shared_kv_patched", False) is True
assert cls.__call__ is not spy
shared_kv = {"layer_0": ("k", "v")}
result = cls.__call__(
object(),
"positional_arg",
shared_kv_states=shared_kv,
other_kwarg="keep_me",
)
assert result == "spy_return"
assert captured["args"] == ("positional_arg",)
assert "shared_kv_states" not in captured["kwargs"]
assert captured["kwargs"] == {"other_kwarg": "keep_me"}
assert fused_attn._get_shared_kv_states() is shared_kv
def test_clears_store_when_kwarg_absent(self, clean_decoder_layer_patch_slate):
"""Regression for commit 251021e1: a prior step's dict must not leak
into a later call that omits `shared_kv_states`."""
cls, fused_attn = clean_decoder_layer_patch_slate
def spy(self, *args, **kwargs):
return None
cls.__call__ = spy
fused_attn._patch_decoder_layer_call()
stale = {"stale_step": True}
fused_attn._set_shared_kv_states(stale)
assert fused_attn._get_shared_kv_states() is stale
cls.__call__(object())
assert fused_attn._get_shared_kv_states() is None
def test_store_visible_across_threads(self):
"""Regression for commit e3669b2c: the store must be readable from
threads other than the one that set it. `threading.local()` failed
this invariant, crashing with `'NoneType' object is not subscriptable`
on MoE Gemma4 variants when autograd worker threads ran backward
recompute under HF-Trainer gradient_checkpointing."""
import threading
from axolotl.monkeypatch.models.gemma4 import fused_attn
sentinel = {"layer_0": ("k", "v")}
try:
fused_attn._set_shared_kv_states(sentinel)
seen = {}
def worker():
seen["value"] = fused_attn._get_shared_kv_states()
t = threading.Thread(target=worker)
t.start()
t.join()
assert seen["value"] is sentinel
finally:
fused_attn._set_shared_kv_states(None)
@pytest.fixture
def clean_entry_point_patch_slate():
"""Save and restore Gemma4TextAttention.forward and Gemma4TextDecoderLayer.__call__."""
from axolotl.monkeypatch.models.gemma4 import fused_attn
decoder_cls = gemma4_modeling.Gemma4TextDecoderLayer
attn_cls = gemma4_modeling.Gemma4TextAttention
original_call = decoder_cls.__call__
original_forward = attn_cls.forward
had_sentinel = getattr(decoder_cls, "_axolotl_shared_kv_patched", False)
if had_sentinel:
del decoder_cls._axolotl_shared_kv_patched
try:
yield decoder_cls, attn_cls, original_call, original_forward, fused_attn
finally:
decoder_cls.__call__ = original_call
attn_cls.forward = original_forward
if had_sentinel:
decoder_cls._axolotl_shared_kv_patched = True
elif hasattr(decoder_cls, "_axolotl_shared_kv_patched"):
del decoder_cls._axolotl_shared_kv_patched
fused_attn._set_shared_kv_states(None)
class TestPatchGemma4FusedAttnEntryPoint:
def test_default_flag_swaps_only_attention_forward(
self, clean_entry_point_patch_slate
):
(
decoder_cls,
attn_cls,
original_call,
original_forward,
fused_attn,
) = clean_entry_point_patch_slate
fused_attn.patch_gemma4_fused_attn()
assert attn_cls.forward is not original_forward
assert decoder_cls.__call__ is original_call
assert not getattr(decoder_cls, "_axolotl_shared_kv_patched", False)
def test_workaround_flag_installs_decoder_layer_patch(
self, clean_entry_point_patch_slate
):
(
decoder_cls,
attn_cls,
original_call,
original_forward,
fused_attn,
) = clean_entry_point_patch_slate
fused_attn.patch_gemma4_fused_attn(install_shared_kv_workaround=True)
assert attn_cls.forward is not original_forward
assert decoder_cls.__call__ is not original_call
assert getattr(decoder_cls, "_axolotl_shared_kv_patched", False) is True

View File

@@ -0,0 +1,343 @@
"""Tests for the Gemma 4 hybrid-attention mask fix.
These tests pin the single critical behavior: after installing the patch,
``modeling_gemma4.create_causal_mask`` passes an SDPA-overridden config to
the underlying mask builder regardless of what the caller's config says.
This is what keeps full-attention (head_dim=512) global layers from
crashing at long sequence lengths — they need a 4D SDPA-format mask, not
the 2D FA2 mask that would be built from the model-level config.
The tests use a mocked ``create_causal_mask`` so they don't have to load
a real 26B Gemma 4 model or even have access to its weights. What matters
for the bug fix is which config is handed to the mask factory, not the
factory's actual output.
"""
from types import SimpleNamespace
from unittest.mock import MagicMock
import pytest
pytest.importorskip(
"transformers.models.gemma4",
reason="gemma4_hybrid_mask patch only matters when Gemma 4 is available",
)
@pytest.fixture
def restore_gemma4_module():
"""Snapshot ``modeling_gemma4.create_causal_mask`` and restore after
each test so patch state doesn't leak across the suite."""
from transformers.models.gemma4 import modeling_gemma4
saved = modeling_gemma4.create_causal_mask
yield modeling_gemma4
modeling_gemma4.create_causal_mask = saved
# Reset the module-level flag so the next test can re-install cleanly.
from axolotl.monkeypatch import gemma4_hybrid_mask
gemma4_hybrid_mask._PATCH_APPLIED = False
def test_patch_replaces_create_causal_mask(restore_gemma4_module):
modeling_gemma4 = restore_gemma4_module
from axolotl.monkeypatch.gemma4_hybrid_mask import patch_gemma4_hybrid_mask
original = modeling_gemma4.create_causal_mask
assert patch_gemma4_hybrid_mask() is True
assert modeling_gemma4.create_causal_mask is not original
assert modeling_gemma4.create_causal_mask._axolotl_original is original, (
"patched wrapper must expose the original reference for teardown"
)
def test_patch_is_idempotent(restore_gemma4_module):
modeling_gemma4 = restore_gemma4_module
from axolotl.monkeypatch.gemma4_hybrid_mask import patch_gemma4_hybrid_mask
patch_gemma4_hybrid_mask()
wrapper_first = modeling_gemma4.create_causal_mask
# Second call must not re-wrap the already-wrapped function (which
# would leak the original reference through a chain of wrappers).
patch_gemma4_hybrid_mask()
wrapper_second = modeling_gemma4.create_causal_mask
assert wrapper_first is wrapper_second
def test_patched_mask_forces_sdpa_config(restore_gemma4_module):
"""Core invariant: when the patched wrapper is called with a config
that says ``flash_attention_2``, the underlying mask factory receives
a shallow-copied config whose ``_attn_implementation`` is ``"sdpa"``.
Without this, the full-attention global layers get a 2D FA2 mask and
crash at long seq lens with the [B, H, S, S] / [B, S] expand error.
"""
modeling_gemma4 = restore_gemma4_module
from axolotl.monkeypatch.gemma4_hybrid_mask import patch_gemma4_hybrid_mask
# Swap in a mock BEFORE installing the patch so the wrapper captures
# it as the "original". The mock records every call so we can inspect
# what config got passed through.
mock_factory = MagicMock(name="create_causal_mask", return_value="mask_4d")
modeling_gemma4.create_causal_mask = mock_factory
patch_gemma4_hybrid_mask()
# Caller-supplied config says FA2 (that's the model-level setting).
caller_config = SimpleNamespace(
_attn_implementation="flash_attention_2",
head_dim=512,
some_other_attr="preserved",
)
result = modeling_gemma4.create_causal_mask(
caller_config,
inputs_embeds=None,
attention_mask=None,
past_key_values=None,
position_ids=None,
)
# Wrapper returned whatever the mock returned — no transformation of
# the result itself.
assert result == "mask_4d"
# The mock was called exactly once with a config whose
# ``_attn_implementation`` is sdpa, NOT the caller's fa2.
assert mock_factory.call_count == 1
(passed_config, *_), passed_kwargs = mock_factory.call_args
assert passed_config._attn_implementation == "sdpa"
# The wrapper must NOT mutate the caller's config in place — other
# mask builders (e.g. create_sliding_window_causal_mask) read from
# the same config and must still see fa2.
assert caller_config._attn_implementation == "flash_attention_2"
# Other attributes on the config must be preserved so the underlying
# factory has everything it needs (head_dim, rope_theta, vocab_size, ...).
assert passed_config.head_dim == 512
assert passed_config.some_other_attr == "preserved"
def test_patched_wrapper_passes_through_all_kwargs(restore_gemma4_module):
"""The wrapper must forward positional + keyword args to the original
unchanged, so transformers' own call-site in Gemma4TextModel.forward
keeps working across minor transformers-version signature drift."""
modeling_gemma4 = restore_gemma4_module
from axolotl.monkeypatch.gemma4_hybrid_mask import patch_gemma4_hybrid_mask
mock_factory = MagicMock(return_value="mask")
modeling_gemma4.create_causal_mask = mock_factory
patch_gemma4_hybrid_mask()
caller_config = SimpleNamespace(_attn_implementation="flash_attention_2")
modeling_gemma4.create_causal_mask(
caller_config,
"positional_arg",
inputs_embeds="embeds",
attention_mask="mask_2d",
past_key_values="cache",
position_ids="positions",
or_mask_function="or_fn",
)
args, kwargs = mock_factory.call_args
# First positional (after config override) is preserved.
assert args[1] == "positional_arg"
# All kwargs are forwarded untouched.
assert kwargs["inputs_embeds"] == "embeds"
assert kwargs["attention_mask"] == "mask_2d"
assert kwargs["past_key_values"] == "cache"
assert kwargs["position_ids"] == "positions"
assert kwargs["or_mask_function"] == "or_fn"
def test_unpatch_restores_original(restore_gemma4_module):
modeling_gemma4 = restore_gemma4_module
from axolotl.monkeypatch.gemma4_hybrid_mask import (
patch_gemma4_hybrid_mask,
unpatch_gemma4_hybrid_mask,
)
sentinel = MagicMock(name="original")
modeling_gemma4.create_causal_mask = sentinel
patch_gemma4_hybrid_mask()
assert modeling_gemma4.create_causal_mask is not sentinel
unpatch_gemma4_hybrid_mask()
assert modeling_gemma4.create_causal_mask is sentinel
def test_unpatch_is_safe_without_prior_patch(restore_gemma4_module):
from axolotl.monkeypatch.gemma4_hybrid_mask import unpatch_gemma4_hybrid_mask
# Should be a no-op, no exception.
unpatch_gemma4_hybrid_mask()
def test_sliding_window_mask_builder_is_not_patched(restore_gemma4_module):
"""Only ``create_causal_mask`` is overridden — the sliding-window
factory must remain bound to its original to preserve FA2 masks for
the sliding-attention layers. If we accidentally patch both, the
sliding layers get SDPA format and lose the FA2 speedup."""
modeling_gemma4 = restore_gemma4_module
from axolotl.monkeypatch.gemma4_hybrid_mask import patch_gemma4_hybrid_mask
if not hasattr(modeling_gemma4, "create_sliding_window_causal_mask"):
pytest.skip("transformers version has no create_sliding_window_causal_mask")
sliding_before = modeling_gemma4.create_sliding_window_causal_mask
patch_gemma4_hybrid_mask()
sliding_after = modeling_gemma4.create_sliding_window_causal_mask
assert sliding_after is sliding_before
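Taken together, the unit tests above pin down a wrapper roughly shaped like the sketch below (an assumed reconstruction from the assertions, not the axolotl.monkeypatch.gemma4_hybrid_mask source):
import copy
from transformers.models.gemma4 import modeling_gemma4
_PATCH_APPLIED = False
def _patch_gemma4_hybrid_mask_sketch():
    global _PATCH_APPLIED
    if _PATCH_APPLIED:
        return False  # idempotent: never wrap the wrapper
    original = modeling_gemma4.create_causal_mask
    def wrapper(config, *args, **kwargs):
        # Shallow-copy so the caller's config (also read by the
        # sliding-window mask builder) still reports flash_attention_2.
        sdpa_config = copy.copy(config)
        sdpa_config._attn_implementation = "sdpa"
        return original(sdpa_config, *args, **kwargs)
    wrapper._axolotl_original = original
    modeling_gemma4.create_causal_mask = wrapper
    _PATCH_APPLIED = True
    return True
def _unpatch_gemma4_hybrid_mask_sketch():
    global _PATCH_APPLIED
    original = getattr(modeling_gemma4.create_causal_mask, "_axolotl_original", None)
    if original is not None:
        modeling_gemma4.create_causal_mask = original
    _PATCH_APPLIED = False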
# ---------------------------------------------------------------------------
# Integration tests with a tiny randomly-initialized Gemma4TextModel.
#
# These do NOT load real 26B weights. They build a ~350k-param Gemma 4 text
# model with 2 layers (one sliding, one full_attention), apply the hybrid
# attention path end-to-end, and run a forward pass with a padded
# attention_mask at a long-ish seq len. The invariant we're pinning is that
# the full_attention layer does not crash with the
# "Target sizes: [B, H, S, S]. Tensor sizes: [B, S]"
# error — the exact failure that blew up the Gemma 4 MoE 26B pilot at ~7k
# tokens in the FSDP2 training run.
# ---------------------------------------------------------------------------
def _build_tiny_gemma4_text_model():
"""Return a tiny randomly-initialized Gemma4TextModel with mixed layers."""
import torch
from transformers.models.gemma4.configuration_gemma4 import Gemma4TextConfig
from transformers.models.gemma4.modeling_gemma4 import Gemma4TextModel
cfg = Gemma4TextConfig(
vocab_size=128,
hidden_size=64,
intermediate_size=128,
num_hidden_layers=2,
num_attention_heads=2,
num_key_value_heads=2,
head_dim=32,
layer_types=["sliding_attention", "full_attention"],
sliding_window=64,
max_position_embeddings=2048,
hidden_size_per_layer_input=16,
vocab_size_per_layer_input=128,
)
# The tiny model itself stays on sdpa; the pilot's fa2-at-model-level
# setting is exercised separately below against the real create_causal_mask.
cfg._attn_implementation = "sdpa"
torch.manual_seed(42)
model = Gemma4TextModel(cfg).eval()
return model, cfg
def _apply_hybrid_attn_inline(model, cfg):
"""Replicate what ``patch_manager._apply_gemma_hybrid_attention`` does
to a model, without needing a full PatchManager / pydantic cfg."""
import copy
from axolotl.monkeypatch.gemma4_hybrid_mask import patch_gemma4_hybrid_mask
for layer_idx, layer in enumerate(model.layers):
if cfg.layer_types[layer_idx] != "sliding_attention":
attn = getattr(layer, "self_attn", None)
if attn is not None and hasattr(attn, "config"):
sdpa_cfg = copy.copy(attn.config)
sdpa_cfg._attn_implementation = "sdpa"
attn.config = sdpa_cfg
patch_gemma4_hybrid_mask()
def test_tiny_gemma4_long_context_forward_does_not_crash(restore_gemma4_module):
"""End-to-end invariant: with the hybrid attn patch applied, a tiny
Gemma4TextModel runs a forward at long context (1024 tokens) with
real padding in the attention mask, producing the expected output
shape. This exercises the actual code path that crashed the pilot
without needing a real 26B checkpoint or CUDA."""
import torch
model, cfg = _build_tiny_gemma4_text_model()
_apply_hybrid_attn_inline(model, cfg)
B, S = 2, 1024
input_ids = torch.randint(0, cfg.vocab_size, (B, S))
attn_mask = torch.ones(B, S, dtype=torch.long)
# Pad positions in the second row. Without padding, SDPA falls back to
# ``is_causal=True`` with ``mask=None`` — we need a materialized 4D
# mask to exercise the actual bug site.
attn_mask[1, S // 2 :] = 0
with torch.no_grad():
out = model(input_ids=input_ids, attention_mask=attn_mask)
assert out.last_hidden_state.shape == (B, S, cfg.hidden_size)
assert torch.isfinite(out.last_hidden_state).all()
def test_patched_create_causal_mask_returns_4d_for_real_config(
restore_gemma4_module,
):
"""Hit the REAL ``create_causal_mask`` (not a mock) via the wrapper
and verify the returned mask is a 4D tensor — which is the shape the
SDPA-patched global layers need. Without the patch and with a
caller-supplied FA2 config this would return a 2D mask and the layer
would crash at long context."""
import torch
from transformers.cache_utils import DynamicCache
from transformers.models.gemma4.configuration_gemma4 import Gemma4TextConfig
from axolotl.monkeypatch.gemma4_hybrid_mask import patch_gemma4_hybrid_mask
patch_gemma4_hybrid_mask()
modeling_gemma4 = restore_gemma4_module
cfg = Gemma4TextConfig(
vocab_size=128,
hidden_size=64,
num_hidden_layers=2,
num_attention_heads=2,
num_key_value_heads=2,
head_dim=32,
layer_types=["sliding_attention", "full_attention"],
sliding_window=64,
max_position_embeddings=2048,
hidden_size_per_layer_input=16,
vocab_size_per_layer_input=128,
)
# Simulate the pilot: caller says flash_attention_2, but global layers
# were switched to SDPA per-layer. Without the patch, create_causal_mask
# would return an FA2 2D mask here and the SDPA layer would crash.
cfg._attn_implementation = "flash_attention_2"
B, S = 2, 1024
inputs_embeds = torch.zeros((B, S, cfg.hidden_size), dtype=torch.float32)
attention_mask = torch.ones((B, S), dtype=torch.long)
attention_mask[1, S // 2 :] = 0 # force the 4D materialized path
position_ids = torch.arange(S).unsqueeze(0).expand(B, -1)
past_key_values = DynamicCache(config=cfg)
mask = modeling_gemma4.create_causal_mask(
config=cfg,
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
past_key_values=past_key_values,
position_ids=position_ids,
)
assert mask is not None
assert isinstance(mask, torch.Tensor)
assert mask.dim() == 4, (
f"expected a 4D SDPA-format mask, got {mask.dim()}D "
f"shape={tuple(mask.shape)}. The full_attention global layers need "
"this shape or they crash at long context."
)
assert mask.shape[0] == B
assert mask.shape[-1] == S
assert mask.shape[-2] == S
# Caller's config must be untouched — other code paths still read it.
assert cfg._attn_implementation == "flash_attention_2"

View File

@@ -111,7 +111,7 @@ def fixture_argilla_chat_dataset():
@pytest.fixture(name="phi3_tokenizer")
@enable_hf_offline
def fixture_phi3_tokenizer():
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-128k-instruct")
return tokenizer
@@ -214,8 +214,8 @@ class TestAssistantDPOChatTemplatePhi3:
+ "<|user|>\ngoodbye<|end|>\n"
+ "<|assistant|>\n"
)
assert result["chosen"] == "goodbye<|end|>\n<|endoftext|>"
assert result["rejected"] == "party on<|end|>\n<|endoftext|>"
assert result["chosen"] == "goodbye<|end|>"
assert result["rejected"] == "party on<|end|>"
class TestAssistantDPOChatTemplateGemma:
@@ -290,8 +290,8 @@ class TestArgillaChatDPOChatTemplate:
)
result = transform_fn(argilla_chat_dataset[0], tokenizer=phi3_tokenizer)
assert result["prompt"] == "<|user|>\nhello<|end|>\n" + "<|assistant|>\n"
assert result["chosen"] == "goodbye<|end|>\n<|endoftext|>"
assert result["rejected"] == "party on<|end|>\n<|endoftext|>"
assert result["chosen"] == "goodbye<|end|>"
assert result["rejected"] == "party on<|end|>"
class TestDPOChatTemplateToolRole:

View File

@@ -491,7 +491,8 @@ class TestEfficientMerge:
out_features = 4
alpha = 4
base = torch.randn(num_experts, in_features, out_features)
# PEFT ParamWrapper treats non-transposed 3D weights as (experts, out, in)
base = torch.randn(num_experts, out_features, in_features)
lora_a = torch.randn(r * num_experts, in_features)
lora_b = torch.randn(out_features, r * num_experts)
@@ -507,7 +508,7 @@ class TestEfficientMerge:
scale = alpha / r
wa = lora_a.reshape(num_experts, r, in_features)
wb = lora_b.reshape(out_features, r, num_experts)
manual_delta = torch.einsum("o r e, e r i -> e i o", wb, wa) * scale
manual_delta = torch.einsum("o r e, e r i -> e o i", wb, wa) * scale
for e in range(num_experts):
assert torch.allclose(merged[e], base[e] + manual_delta[e], atol=1e-5), (
f"Expert {e} mismatch"

View File

@@ -5,6 +5,8 @@ Covers:
- save_strategy: 'best' requires metric_for_best_model
- streaming=True with val_set_size > 0 is rejected
- lora_target_modules with invalid regex patterns is rejected
- GRPO: generation batch size must be divisible by num_generations,
num_generations >= 2, and effective_gbs >= num_generations * world_size
"""
import pytest
@@ -117,3 +119,136 @@ class TestLoraTargetModulesRegexValidator:
)
with pytest.raises(ValueError, match="invalid regex pattern"):
validate_config(cfg)
class TestGRPOBatchSizeValidator:
"""GRPO requires (mb*GA) % num_generations == 0 and num_generations >= 2.
These call the @model_validator(mode="before") classmethod directly on a
plain dict — same input shape it receives during full Pydantic validation,
just without dragging in unrelated fields (datasets / model loading / etc.)
that aren't relevant to what's under test. The validator is registered on
``RLValidationMixin`` (which ``AxolotlInputConfig`` inherits) so this is the
same code path ``axolotl train`` exercises.
"""
@staticmethod
def _check(data):
from axolotl.utils.schemas.validation import RLValidationMixin
return RLValidationMixin.check_grpo_batch_size_divisibility(data)
def test_divisible_passes(self):
data = {
"rl": "grpo",
"micro_batch_size": 1,
"gradient_accumulation_steps": 4,
"trl": {"num_generations": 4},
}
# Should return data unchanged (no exception)
out = self._check(data)
assert out["trl"]["num_generations"] == 4
def test_non_divisible_raises(self):
data = {
"rl": "grpo",
"micro_batch_size": 1,
"gradient_accumulation_steps": 2,
"trl": {"num_generations": 4},
}
with pytest.raises(ValueError, match="num_generations"):
self._check(data)
def test_non_divisible_error_includes_fix_hint(self):
data = {
"rl": "grpo",
"micro_batch_size": 1,
"gradient_accumulation_steps": 3,
"trl": {"num_generations": 4},
}
with pytest.raises(ValueError, match="gradient_accumulation_steps: 4"):
self._check(data)
def test_num_generations_one_raises(self):
data = {
"rl": "grpo",
"micro_batch_size": 1,
"gradient_accumulation_steps": 4,
"trl": {"num_generations": 1},
}
with pytest.raises(ValueError, match=r"num_generations >= 2"):
self._check(data)
def test_explicit_generation_batch_size_divisible_passes(self):
data = {
"rl": "grpo",
"micro_batch_size": 1,
"gradient_accumulation_steps": 1,
"trl": {"num_generations": 4, "generation_batch_size": 8},
}
out = self._check(data)
assert out["trl"]["generation_batch_size"] == 8
def test_explicit_generation_batch_size_non_divisible_raises(self):
data = {
"rl": "grpo",
"micro_batch_size": 1,
"gradient_accumulation_steps": 1,
"trl": {"num_generations": 4, "generation_batch_size": 6},
}
with pytest.raises(ValueError, match="trl.generation_batch_size"):
self._check(data)
def test_non_grpo_skips_check(self):
# Anything other than rl=grpo should pass through untouched, even
# with non-divisible batch sizes — they're irrelevant to other RL
# methods that don't use group-relative advantages.
data = {
"rl": "dpo",
"micro_batch_size": 1,
"gradient_accumulation_steps": 3,
"trl": {"num_generations": 4},
}
assert self._check(data) is data
def test_no_rl_set_skips_check(self):
data = {
"micro_batch_size": 1,
"gradient_accumulation_steps": 3,
}
assert self._check(data) is data
def test_grpo_without_num_generations_skips_check(self):
# If num_generations isn't set, TRL uses its own default — we don't
# have enough info to validate, so the validator must short-circuit
# rather than guess.
data = {
"rl": "grpo",
"micro_batch_size": 1,
"gradient_accumulation_steps": 3,
"trl": {},
}
out = self._check(data)
assert out["rl"] == "grpo"
def test_multi_rank_group_size_check(self):
data = {
"rl": "grpo",
"micro_batch_size": 1,
"gradient_accumulation_steps": 4, # gbs=4
"world_size": 2, # need gbs >= 4*2 = 8
"trl": {"num_generations": 4},
}
with pytest.raises(ValueError, match=r"world_size=2"):
self._check(data)
def test_multi_rank_group_size_satisfied(self):
data = {
"rl": "grpo",
"micro_batch_size": 1,
"gradient_accumulation_steps": 8, # gbs=8 >= 4*2
"world_size": 2,
"trl": {"num_generations": 4},
}
out = self._check(data)
assert out["gradient_accumulation_steps"] == 8
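Read together, these cases pin a validator of roughly the following shape. Field names come from the test dicts; the suggested-fix arithmetic and message wording are inferred from the match patterns, not copied from the schema code.
import math
def _check_grpo_batch_size_sketch(data):
    if data.get("rl") != "grpo":
        return data
    trl = data.get("trl") or {}
    num_generations = trl.get("num_generations")
    if not num_generations:
        return data  # TRL's own default applies; nothing to validate against
    if num_generations < 2:
        raise ValueError("GRPO requires num_generations >= 2 for group-relative advantages")
    mbs = data.get("micro_batch_size", 1)
    ga = data.get("gradient_accumulation_steps", 1)
    explicit_gbs = trl.get("generation_batch_size")
    gbs = explicit_gbs or mbs * ga
    if gbs % num_generations:
        if explicit_gbs:
            raise ValueError("trl.generation_batch_size must be divisible by num_generations")
        suggested_ga = math.ceil(ga / num_generations) * num_generations
        raise ValueError(
            f"micro_batch_size * gradient_accumulation_steps ({gbs}) must be divisible by "
            f"num_generations ({num_generations}); try gradient_accumulation_steps: {suggested_ga}"
        )
    world_size = data.get("world_size", 1) or 1
    if world_size > 1 and gbs < num_generations * world_size:
        raise ValueError(
            f"effective generation batch size {gbs} is smaller than num_generations * "
            f"world_size ({num_generations} * {world_size}) with world_size={world_size}"
        )
    return data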