update create_optimizer for updated api

fix labels
fix for tokenizers change
2026-02-19 23:49:32 -05:00 · 2026-02-19 23:44:46 -05:00 · 2026-02-19 21:52:44 -05:00 · 2026-02-19 18:34:13 -05:00
134 changed files with 509 additions and 13633 deletions
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -70,11 +70,6 @@ You can skip certain CI checks by including specific keywords in your commit mes

 axolotl uses [{codestyle}]({URLofCodestyle}) as its code style guide. Please ensure that your code follows these guidelines.

-Use the pre-commit linter to ensure that your code is formatted consistently.
-```bash
-pre-commit run --all-files
-```
-
 ### Commit Messages

 Write clear and concise commit messages that briefly describe the changes made in each commit. Use the imperative mood and start with a capitalized verb, e.g., "Add new feature" or "Fix bug in function".
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -51,30 +51,14 @@ jobs:
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-base"
            platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.10.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
+          - cuda: "129"
+            cuda_version: 12.9.1
            cudnn_version: ""
            python_version: "3.12"
-            pytorch: 2.10.0
+            pytorch: 2.9.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-base"
            platforms: "linux/amd64,linux/arm64"
-#          - cuda: "129"
-#            cuda_version: 12.9.1
-#            cudnn_version: ""
-#            python_version: "3.12"
-#            pytorch: 2.9.1
-#            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-#            dockerfile: "Dockerfile-base"
-#            platforms: "linux/amd64,linux/arm64"
          - cuda: "130"
            cuda_version: 13.0.0
            cudnn_version: ""
@@ -91,14 +75,6 @@ jobs:
            torch_cuda_arch_list: "9.0+PTX"
            dockerfile: "Dockerfile-base"
            platforms: "linux/amd64,linux/arm64"
-          - cuda: "130"
-            cuda_version: 13.0.0
-            cudnn_version: ""
-            python_version: "3.12"
-            pytorch: 2.10.0
-            torch_cuda_arch_list: "9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
 #          - cuda: "128"
 #            cuda_version: 12.8.1
 #            cudnn_version: ""
@@ -181,30 +157,14 @@ jobs:
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.10.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
+          - cuda: "129"
+            cuda_version: 12.9.1
            cudnn_version: ""
            python_version: "3.12"
-            pytorch: 2.10.0
+            pytorch: 2.9.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64,linux/arm64"
-#          - cuda: "129"
-#            cuda_version: 12.9.1
-#            cudnn_version: ""
-#            python_version: "3.12"
-#            pytorch: 2.9.1
-#            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-#            dockerfile: "Dockerfile-uv-base"
-#            platforms: "linux/amd64,linux/arm64"
          - cuda: "130"
            cuda_version: 13.0.0
            cudnn_version: ""
@@ -221,14 +181,6 @@ jobs:
            torch_cuda_arch_list: "9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64,linux/arm64"
-          - cuda: "130"
-            cuda_version: 13.0.0
-            cudnn_version: ""
-            python_version: "3.12"
-            pytorch: 2.10.0
-            torch_cuda_arch_list: "9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -34,28 +34,16 @@ jobs:
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
            is_latest: true
-          - cuda: 128
-            cuda_version: 12.8.1
+          - cuda: 129
+            cuda_version: 12.9.1
            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-#          - cuda: 129
-#            cuda_version: 12.9.1
-#            python_version: "3.12"
-#            pytorch: 2.9.1
-#            axolotl_extras:
-#            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
          - cuda: 130
            cuda_version: 13.0.0
-            python_version: "3.12"
-            pytorch: 2.10.0
+            python_version: "3.11"
+            pytorch: 2.9.1
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
    runs-on: axolotl-gpu-runner
@@ -98,77 +86,6 @@ jobs:
            ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
          labels: ${{ steps.metadata.outputs.labels }}

-  build-axolotl-uv:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-            is_latest: true
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-    runs-on: axolotl-gpu-runner
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Docker metadata
-        id: metadata
-        uses: docker/metadata-action@v5
-        with:
-          images: |
-            axolotlai/axolotl-uv
-          tags: |
-            type=ref,event=branch
-            type=pep440,pattern={{version}}
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-      # guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/
-      - name: Build and export to Docker
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          platforms: ${{ matrix.platforms }}
-          build-args: |
-            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
-            CUDA=${{ matrix.cuda }}
-            PYTORCH_VERSION=${{ matrix.pytorch }}
-            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
-            AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}
-          file: ./docker/Dockerfile-uv
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: |
-            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
-            ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
-          labels: ${{ steps.metadata.outputs.labels }}
-
  build-axolotl-cloud:
    needs: build-axolotl
    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
@@ -195,28 +112,16 @@ jobs:
            axolotl_extras:
            is_latest: true
            platforms: "linux/amd64,linux/arm64"
-          - cuda: 128
-            cuda_version: 12.8.1
+          - cuda: 129
+            cuda_version: 12.9.1
            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-#          - cuda: 129
-#            cuda_version: 12.9.1
-#            python_version: "3.12"
-#            pytorch: 2.9.1
-#            axolotl_extras:
-#            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
          - cuda: 130
            cuda_version: 13.0.0
-            python_version: "3.12"
-            pytorch: 2.10.0
+            python_version: "3.11"
+            pytorch: 2.9.1
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
    runs-on: axolotl-gpu-runner
@@ -254,73 +159,6 @@ jobs:
             ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
          labels: ${{ steps.metadata.outputs.labels }}

-  build-axolotl-cloud-uv:
-    needs: build-axolotl-uv
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
-    # this job needs to be run on self-hosted GPU runners...
-    strategy:
-      matrix:
-        include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            is_latest: true
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-    runs-on: axolotl-gpu-runner
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Docker metadata
-        id: metadata
-        uses: docker/metadata-action@v5
-        with:
-          images: |
-            axolotlai/axolotl-cloud-uv
-          tags: |
-            type=ref,event=branch
-            type=pep440,pattern={{version}}
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Build
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          platforms: ${{ matrix.platforms }}
-          build-args: |
-            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-            CUDA=${{ matrix.cuda }}
-          file: ./docker/Dockerfile-cloud-uv
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: |
-             ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-             ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
-          labels: ${{ steps.metadata.outputs.labels }}
-
  build-axolotl-cloud-no-tmux:
    needs: build-axolotl
    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -8,7 +8,6 @@ on:
      - 'setup.py'
      - 'pyproject.toml'
      - '.github/workflows/multi-gpu-e2e.yml'
-      - 'scripts/cutcrossentropy_install.py'
      - 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
      - 'src/axolotl/utils/distributed.py'
  workflow_dispatch:
@@ -36,13 +35,19 @@ jobs:
            pytorch: 2.8.0
            axolotl_extras: fbgemm-gpu
            num_gpus: 2
-#          - cuda: 129
-#            cuda_version: 12.9.1
-#            python_version: "3.12"
-#            pytorch: 2.9.1
-#            axolotl_extras: "fbgemm-gpu"
-#            num_gpus: 2
-#            dockerfile: "Dockerfile-uv.jinja"
+          - cuda: 128
+            cuda_version: 12.8.1
+            python_version: "3.11"
+            pytorch: 2.9.1
+            axolotl_extras: "fbgemm-gpu"
+            num_gpus: 2
+          - cuda: 129
+            cuda_version: 12.9.1
+            python_version: "3.12"
+            pytorch: 2.9.1
+            axolotl_extras: "fbgemm-gpu"
+            num_gpus: 2
+            dockerfile: "Dockerfile-uv.jinja"
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.11"
@@ -50,13 +55,6 @@ jobs:
            axolotl_extras:
 #            axolotl_extras: fbgemm-gpu
            num_gpus: 2
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.10.0
-            axolotl_extras: "fbgemm-gpu"
-            num_gpus: 2
-            dockerfile: "Dockerfile-uv.jinja"
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
    steps:
--- a/.github/workflows/preview-docs.yml
+++ b/.github/workflows/preview-docs.yml
@@ -14,8 +14,14 @@ on:
      - .github/workflows/preview-docs.yml

 permissions:
-  contents: read
+  checks: write
+  contents: write
+  deployments: write
+  issues: write
+  discussions: write
+  pages: write
  pull-requests: write
+  statuses: write

 jobs:
  preview:
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -3,10 +3,6 @@ on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * *'  # Runs at 00:00 UTC every day
-  pull_request:
-    types: [opened, synchronize, reopened, ready_for_review]
-    paths:
-      - '.github/workflows/tests-nightly.yml'

 jobs:
  pre-commit:
@@ -22,26 +18,15 @@ jobs:
        env:
          SKIP: no-commit-to-branch

-  prime-cdn-s3-cache:
-    name: Prefetch S3 once to prime the CDN cache
-    runs-on: ubuntu-latest
-    if: ${{ !github.event.pull_request.draft }}
-    timeout-minutes: 10
-    steps:
-      - name: Restore Cache from S3
-        id: hf-cache-restore-s3
-        run: |
-          curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
-
  pytest:
    name: PyTest
    runs-on: ubuntu-latest
-    needs: [prime-cdn-s3-cache]
    strategy:
      fail-fast: false
+      max-parallel: 2
      matrix:
-        python_version: ["3.12"]  # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
-        pytorch_version: ["2.8.0", "2.9.1", "2.10.0"]
+        python_version: ["3.11"]
+        pytorch_version: ["2.8.0", "2.9.0", "2.9.1"]
    timeout-minutes: 20

    steps:
@@ -52,7 +37,7 @@ jobs:
        id: hf-cache-restore-s3
        run: |
          mkdir -p /home/runner/.cache/huggingface/hub
-          curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/  --use-compress-program unzstd
+          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/  --use-compress-program unzstd

      - name: Setup Python
        uses: actions/setup-python@v5
@@ -63,7 +48,7 @@ jobs:
      - name: upgrade pip
        run: |
          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==26.0 setuptools==78.1.1 wheel
+          pip3 install --upgrade packaging==26.0 setuptools==75.8.0 wheel

      - name: Install PyTorch
        run: |
@@ -117,23 +102,16 @@ jobs:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
-            pytorch: 2.9.1
+            pytorch: 2.8.0
            num_gpus: 1
            axolotl_extras:
            nightly_build: "true"
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
-            pytorch: 2.10.0
-            num_gpus: 1
-            axolotl_extras:
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.12"
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
-            dockerfile: "Dockerfile-uv.jinja"
            nightly_build: "true"
    steps:
      - name: Checkout
@@ -154,7 +132,6 @@ jobs:
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -46,32 +46,21 @@ jobs:
        env:
          SKIP: no-commit-to-branch

-  prime-cdn-s3-cache:
-    name: Prefetch S3 once to prime the CDN cache
-    runs-on: ubuntu-latest
-    if: ${{ !github.event.pull_request.draft }}
-    timeout-minutes: 10
-    steps:
-      - name: Restore Cache from S3
-        id: hf-cache-restore-s3
-        run: |
-          curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
-
  pytest:
    name: PyTest
    runs-on: ubuntu-latest
    if: ${{ !github.event.pull_request.draft }}
-    needs: [prime-cdn-s3-cache]
+#    needs: [preload-cache]
    strategy:
      fail-fast: false
      matrix:
-        python_version: ["3.12"]  # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
-        pytorch_version: ["2.8.0", "2.9.1", "2.10.0"]
-#        exclude:
-#          - python_version: "3.14"
-#            pytorch_version: "2.8.0"
-#          - python_version: "3.14"
-#            pytorch_version: "2.9.1"
+        python_version: ["3.11", "3.12"]
+        pytorch_version: ["2.8.0", "2.9.0", "2.9.1"]
+        exclude:
+          - python_version: "3.12"
+            pytorch_version: "2.8.0"
+          - python_version: "3.12"
+            pytorch_version: "2.9.0"
    timeout-minutes: 20

    steps:
@@ -86,7 +75,7 @@ jobs:
        id: hf-cache-restore-s3
        run: |
          mkdir -p ~/.cache/huggingface/hub
-          curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/  --use-compress-program unzstd --strip-components=1
+          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/  --use-compress-program unzstd --strip-components=1
          ls -ltr ~/.cache/huggingface/hub/

      - name: Setup Python
@@ -157,18 +146,17 @@ jobs:
    name: PyTest from Source Dist
    runs-on: ubuntu-latest
    if: ${{ !github.event.pull_request.draft }}
-    needs: [prime-cdn-s3-cache]
    strategy:
      fail-fast: false
      matrix:
-        python_version: ["3.12"]  # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
-        pytorch_version: ["2.8.0", "2.9.1", "2.10.0"]
-#        exclude:
-#          - python_version: "3.14"
-#            pytorch_version: "2.8.0"
-#          - python_version: "3.14"
-#            pytorch_version: "2.9.1"
-    timeout-minutes: 30
+        python_version: ["3.11", "3.12"]
+        pytorch_version: ["2.8.0", "2.9.0", "2.9.1"]
+        exclude:
+          - python_version: "3.12"
+            pytorch_version: "2.8.0"
+          - python_version: "3.12"
+            pytorch_version: "2.9.0"
+    timeout-minutes: 20

    steps:
      - name: cleanup node
@@ -182,7 +170,7 @@ jobs:
        id: hf-cache-restore-s3
        run: |
          mkdir -p ~/.cache/huggingface/hub
-          curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/  --use-compress-program unzstd --strip-components=1
+          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/  --use-compress-program unzstd --strip-components=1
          ls -ltr ~/.cache/huggingface/hub/

      - name: Setup Python
@@ -276,8 +264,8 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 130
-            cuda_version: 13.0.0
+          - cuda: 129
+            cuda_version: 12.9.1
            python_version: "3.12"
            pytorch: 2.9.1
            num_gpus: 1
@@ -338,12 +326,6 @@ jobs:
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.10.0
-            num_gpus: 1
-            axolotl_extras:
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.11"
@@ -387,9 +369,9 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
+          - cuda: 129
+            cuda_version: 12.9.1
+            python_version: "3.12"
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -11,7 +11,7 @@ repos:
    -   id: no-commit-to-branch
        args: ['--branch', 'main']
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.15.4
+    rev: v0.14.10
    hooks:
    -   id: ruff
        args: [--fix]
@@ -26,7 +26,7 @@ repos:
            'pydantic>=2.5.3',
        ]
 -   repo: https://github.com/PyCQA/bandit
-    rev: 1.9.4
+    rev: 1.9.2
    hooks:
    -   id: bandit
        args: [
--- a/README.md
+++ b/README.md
@@ -29,23 +29,8 @@

 ## 🎉 Latest Updates

- 2026/03:
-  - New model support has been added in Axolotl for [Qwen3.5, Qwen3.5 MoE](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen3.5), [GLM-4.7-Flash](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm47-flash), [GLM-4.6V](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm46v), and [GLM-4.5-Air](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm45).
-  - [MoE expert quantization](https://docs.axolotl.ai/docs/expert_quantization.html) support (via `quantize_moe_experts: true`) greatly reduces VRAM when training MoE models (FSDP2 compat).
- 2026/02:
-  - [ScatterMoE LoRA](https://github.com/axolotl-ai-cloud/axolotl/pull/3410) support. LoRA fine-tuning directly on MoE expert weights using custom Triton kernels.
-  - Axolotl now has support for [SageAttention](https://github.com/axolotl-ai-cloud/axolotl/pull/2823) and [GDPO](https://github.com/axolotl-ai-cloud/axolotl/pull/3353) (Generalized DPO).
- 2026/01:
-  - New integration for [EAFT](https://github.com/axolotl-ai-cloud/axolotl/pull/3366) (Entropy-Aware Focal Training), weights loss by entropy of the top-k logit distribution, and [Scalable Softmax](https://github.com/axolotl-ai-cloud/axolotl/pull/3338), improves long context in attention.
- 2025/12:
-  - Axolotl now includes support for [Kimi-Linear](https://docs.axolotl.ai/docs/models/kimi-linear.html), [Plano-Orchestrator](https://docs.axolotl.ai/docs/models/plano.html), [MiMo](https://docs.axolotl.ai/docs/models/mimo.html), [InternVL 3.5](https://docs.axolotl.ai/docs/models/internvl3_5.html), [Olmo3](https://docs.axolotl.ai/docs/models/olmo3.html), [Trinity](https://docs.axolotl.ai/docs/models/trinity.html), and [Ministral3](https://docs.axolotl.ai/docs/models/ministral3.html).
-  - [Distributed Muon Optimizer](https://github.com/axolotl-ai-cloud/axolotl/pull/3264) support has been added for FSDP2 pretraining.
+- 2025/12: Axolotl now includes support for [Kimi-Linear](https://docs.axolotl.ai/docs/models/kimi-linear.html), [Plano-Orchestrator](https://docs.axolotl.ai/docs/models/plano.html), [MiMo](https://docs.axolotl.ai/docs/models/mimo.html), [InternVL 3.5](https://docs.axolotl.ai/docs/models/internvl3_5.html), [Olmo3](https://docs.axolotl.ai/docs/models/olmo3.html), [Trinity](https://docs.axolotl.ai/docs/models/trinity.html), and [Ministral3](https://docs.axolotl.ai/docs/models/ministral3.html).
 - 2025/10: New model support has been added in Axolotl for: [Qwen3 Next](https://docs.axolotl.ai/docs/models/qwen3-next.html), [Qwen2.5-vl, Qwen3-vl](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen2_5-vl), [Qwen3, Qwen3MoE](https://docs.axolotl.ai/docs/models/qwen3.html), [Granite 4](https://docs.axolotl.ai/docs/models/granite4.html), [HunYuan](https://docs.axolotl.ai/docs/models/hunyuan.html), [Magistral 2509](https://docs.axolotl.ai/docs/models/magistral/vision.html), [Apertus](https://docs.axolotl.ai/docs/models/apertus.html), and [Seed-OSS](https://docs.axolotl.ai/docs/models/seed-oss.html).
-
-<details>
-
-<summary>Expand older updates</summary>
-
 - 2025/09: Axolotl now has text diffusion training. Read more [here](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations/diffusion).
 - 2025/08: QAT has been updated to include NVFP4 support. See [PR](https://github.com/axolotl-ai-cloud/axolotl/pull/3107).
 - 2025/07:
@@ -54,10 +39,15 @@
  - FP8 finetuning with fp8 gather op is now possible in Axolotl via `torchao`. Get started [here](https://docs.axolotl.ai/docs/mixed_precision.html#sec-fp8)!
  - [Voxtral](https://docs.axolotl.ai/docs/models/voxtral.html), [Magistral 1.1](https://docs.axolotl.ai/docs/models/magistral.html), and [Devstral](https://docs.axolotl.ai/docs/models/devstral.html) with mistral-common tokenizer support has been integrated in Axolotl!
  - TiledMLP support for single-GPU to multi-GPU training with DDP, DeepSpeed and FSDP support has been added to support Arctic Long Sequence Training. (ALST). See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst) for using ALST with Axolotl!
- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [docs](https://docs.axolotl.ai/docs/models/magistral.html) to start training your own Magistral models with Axolotl!
 - 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
- 2025/04: Llama 4 support has been added in Axolotl. See [docs](https://docs.axolotl.ai/docs/models/llama-4.html) to start training your own Llama 4 models with Axolotl's linearized version!
+
+<details>
+
+<summary>Expand older updates</summary>
+
 - 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning.
+- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [docs](https://docs.axolotl.ai/docs/models/magistral.html) to start training your own Magistral models with Axolotl!
+- 2025/04: Llama 4 support has been added in Axolotl. See [docs](https://docs.axolotl.ai/docs/models/llama-4.html) to start training your own Llama 4 models with Axolotl's linearized version!
 - 2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the [docs](https://docs.axolotl.ai/docs/multimodal.html) to fine-tune your own!
 - 2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the [docs](https://docs.axolotl.ai/docs/lora_optims.html) to give it a try.
 - 2025/02: Axolotl has added GRPO support. Dive into our [blog](https://huggingface.co/blog/axolotl-ai-co/training-llms-w-interpreter-feedback-wasm) and [GRPO example](https://github.com/axolotl-ai-cloud/grpo_code) and have some fun!
@@ -72,10 +62,10 @@ Axolotl is a free and open-source tool designed to streamline post-training and
 Features:

 - **Multiple Model Support**: Train various models like GPT-OSS, LLaMA, Mistral, Mixtral, Pythia, and many more models available on the Hugging Face Hub.
- **Multimodal Training**: Fine-tune vision-language models (VLMs) including LLaMA-Vision, Qwen2-VL, Pixtral, LLaVA, SmolVLM2, GLM-4.6V, InternVL 3.5, Gemma 3n, and audio models like Voxtral with image, video, and audio support.
- **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO, GDPO), and Reward Modelling (RM) / Process Reward Modelling (PRM).
+- **Multimodal Training**: Fine-tune vision-language models (VLMs) including LLaMA-Vision, Qwen2-VL, Pixtral, LLaVA, SmolVLM2, and audio models like Voxtral with image, video, and audio support.
+- **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO), and Reward Modelling (RM) / Process Reward Modelling (PRM).
 - **Easy Configuration**: Re-use a single YAML configuration file across the full fine-tuning pipeline: dataset preprocessing, training, evaluation, quantization, and inference.
- **Performance Optimizations**: [Multipacking](https://docs.axolotl.ai/docs/multipack.html), [Flash Attention](https://github.com/Dao-AILab/flash-attention), [Xformers](https://github.com/facebookresearch/xformers), [Flex Attention](https://pytorch.org/blog/flexattention/), [SageAttention](https://github.com/thu-ml/SageAttention), [Liger Kernel](https://github.com/linkedin/Liger-Kernel), [Cut Cross Entropy](https://github.com/apple/ml-cross-entropy/tree/main), [ScatterMoE](https://docs.axolotl.ai/docs/custom_integrations.html#kernels-integration), [Sequence Parallelism (SP)](https://docs.axolotl.ai/docs/sequence_parallelism.html), [LoRA optimizations](https://docs.axolotl.ai/docs/lora_optims.html), [Multi-GPU training (FSDP1, FSDP2, DeepSpeed)](https://docs.axolotl.ai/docs/multi-gpu.html), [Multi-node training (Torchrun, Ray)](https://docs.axolotl.ai/docs/multi-node.html), and many more!
+- **Performance Optimizations**: [Multipacking](https://docs.axolotl.ai/docs/multipack.html), [Flash Attention](https://github.com/Dao-AILab/flash-attention), [Xformers](https://github.com/facebookresearch/xformers), [Flex Attention](https://pytorch.org/blog/flexattention/), [Liger Kernel](https://github.com/linkedin/Liger-Kernel), [Cut Cross Entropy](https://github.com/apple/ml-cross-entropy/tree/main), [Sequence Parallelism (SP)](https://docs.axolotl.ai/docs/sequence_parallelism.html), [LoRA optimizations](https://docs.axolotl.ai/docs/lora_optims.html), [Multi-GPU training (FSDP1, FSDP2, DeepSpeed)](https://docs.axolotl.ai/docs/multi-gpu.html), [Multi-node training (Torchrun, Ray)](https://docs.axolotl.ai/docs/multi-node.html), and many more!
 - **Flexible Dataset Handling**: Load from local, HuggingFace, and cloud (S3, Azure, GCP, OCI) datasets.
 - **Cloud Ready**: We ship [Docker images](https://hub.docker.com/u/axolotlai) and also [PyPI packages](https://pypi.org/project/axolotl/) for use on cloud platforms and local hardware.

--- a/2
+++ b/2
@@ -1 +1 @@
-0.16.0.dev0
+0.15.0.dev0
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -331,7 +331,6 @@ website:
            - docs/sequence_parallelism.qmd
            - docs/gradient_checkpointing.qmd
            - docs/nd_parallelism.qmd
-            - docs/expert_quantization.qmd

        - section: "Troubleshooting"
          contents:
--- a/cicd/Dockerfile-uv.jinja
+++ b/cicd/Dockerfile-uv.jinja
@@ -31,9 +31,8 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
    fi

-RUN uv pip install packaging==26.0 setuptools==78.1.1
+RUN uv pip install packaging==26.0 setuptools==75.8.0
 RUN uv pip install torchvision
-RUN uv pip uninstall causal_conv1d
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -32,8 +32,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
    fi

-RUN pip install packaging==26.0 setuptools==78.1.1 psutil
-RUN pip uninstall -y causal_conv1d
+RUN pip install packaging==26.0 setuptools==75.8.0 psutil
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -3,12 +3,6 @@ set -e

 python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"

-# curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C "${HF_HOME}/hub/"  --use-compress-program unzstd --strip-components=1
-hf download "NousResearch/Meta-Llama-3-8B"
-hf download "NousResearch/Meta-Llama-3-8B-Instruct"
-hf download "microsoft/Phi-4-reasoning"
-hf download "microsoft/Phi-3.5-mini-instruct"
-
 # Run unit tests with initial coverage report
 pytest -v --durations=10 -n8 \
  --ignore=tests/e2e/ \
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -22,7 +22,6 @@ RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
 WORKDIR /workspace/axolotl

 # If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
-RUN pip uninstall -y causal_conv1d
 RUN if [ "$TARGETARCH" = "arm64" ]; then \
        BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \
    else \
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -59,18 +59,34 @@ RUN git lfs install --skip-repo && \
    pip3 install -U --no-cache-dir pydantic==1.10.10 && \
    pip3 cache purge

-# Map Python version (e.g., 3.12 -> cp312)
-RUN PYTHON_CP="cp$(echo $PYTHON_VERSION | tr -d '.')" && \
-    # Map PyTorch version (e.g., 2.9.1 -> torch2.9, 2.10.0 -> torch2.10)
-    TORCH_TAG="torch$(echo $PYTORCH_VERSION | grep -oP '^\d+\.\d+')" && \
-    # Map architecture
-    case "$TARGETARCH" in \
-        amd64) ARCH_TAG="x86_64" ;; \
-        arm64) ARCH_TAG="aarch64" ;; \
-        *) echo "Unsupported architecture: $TARGETARCH"; exit 1 ;; \
-    esac && \
-    WHL_VERSION="v0.7.16" && \
-    WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-linux_${ARCH_TAG}.whl" && \
-    wget -nv "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}" && \
-    pip3 install --no-cache-dir "${WHL_FILE}" && \
-    rm "${WHL_FILE}"
+RUN case "$PYTORCH_VERSION" in \
+        2.9.[0-9]*) \
+            if [ "$CUDA" = "128" ]; then \
+                if [ "$TARGETARCH" = "amd64" ]; then \
+                    WHL_FILE="flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl"; \
+                    WHL_VERSION="v0.5.4"; \
+                elif [ "$TARGETARCH" = "arm64" ]; then \
+                    WHL_FILE="flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_aarch64.whl"; \
+                    WHL_VERSION="v0.6.4"; \
+                else \
+                    echo "Unsupported architecture: $TARGETARCH"; exit 1; \
+                fi; \
+                wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}; \
+                pip3 install --no-cache-dir ${WHL_FILE}; \
+                rm ${WHL_FILE}; \
+            elif [ "$CUDA" = "130" ]; then \
+                if [ "$TARGETARCH" = "amd64" ]; then \
+                    WHL_FILE="flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_x86_64.whl"; \
+                    WHL_VERSION="v0.5.4"; \
+                elif [ "$TARGETARCH" = "arm64" ]; then \
+                    WHL_FILE="flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_aarch64.whl"; \
+                    WHL_VERSION="v0.6.4"; \
+                else \
+                    echo "Unsupported architecture: $TARGETARCH"; exit 1; \
+                fi; \
+                wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}; \
+                pip3 install --no-cache-dir ${WHL_FILE}; \
+                rm ${WHL_FILE}; \
+            fi \
+            ;; \
+    esac
--- a/docker/Dockerfile-cloud-uv
+++ b/docker/Dockerfile-cloud-uv
@@ -1,30 +0,0 @@
-ARG BASE_TAG=main
-FROM axolotlai/axolotl-uv:$BASE_TAG
-
-ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
-ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub"
-ENV HF_HOME="/workspace/data/huggingface-cache/hub"
-ENV HF_HUB_ENABLE_HF_TRANSFER="1"
-
-EXPOSE 8888
-EXPOSE 22
-
-COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
-COPY scripts/motd /etc/motd
-
-RUN uv pip install jupyterlab notebook ipywidgets && \
-    jupyter lab clean
-RUN apt update && \
-    apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \
-    rm -rf /var/cache/apt/archives && \
-    rm -rf /var/lib/apt/lists/* && \
-    mkdir -p ~/.ssh && \
-    chmod 700 ~/.ssh && \
-    printf "\n[[ -z \"\$TMUX\"  ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
-    printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
-    chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \
-    chmod +x /root/cloud-entrypoint.sh && \
-    echo 'set-option -g history-limit 5000' >> ~/.tmux.conf
-
-ENTRYPOINT ["/root/cloud-entrypoint.sh"]
-CMD ["sleep", "infinity"]
--- a/docker/Dockerfile-uv
+++ b/docker/Dockerfile-uv
@@ -1,48 +0,0 @@
-ARG BASE_TAG=main-base
-FROM axolotlai/axolotl-base-uv:$BASE_TAG
-
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
-ARG AXOLOTL_EXTRAS=""
-ARG AXOLOTL_ARGS=""
-ARG CUDA="118"
-ARG PYTORCH_VERSION="2.1.2"
-ARG TARGETARCH
-
-ENV PYTORCH_VERSION=$PYTORCH_VERSION
-
-RUN apt-get update && \
-    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs && \
-    rm -rf /var/cache/apt/archives && \
-    rm -rf /var/lib/apt/lists/*
-
-WORKDIR /workspace
-
-RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
-
-WORKDIR /workspace/axolotl
-
-# If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
-RUN uv pip uninstall causal_conv1d
-RUN if [ "$TARGETARCH" = "arm64" ]; then \
-        BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \
-    else \
-        BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \
-    fi && \
-    if [ "$AXOLOTL_EXTRAS" != "" ]; then \
-        uv pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
-    else \
-        uv pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
-    fi && \
-    python scripts/unsloth_install.py --uv | sh && \
-    python scripts/cutcrossentropy_install.py --uv | sh && \
-    uv pip install pytest && \
-    uv cache clean
-
-# fix so that git fetch/pull from remote works with shallow clone
-RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
-    git config --get remote.origin.fetch && \
-    git config --global credential.helper store
-
-COPY .axolotl-complete.bash /root/.axolotl-complete.bash
-RUN chmod +x /root/.axolotl-complete.bash && \
-    echo 'source /root/.axolotl-complete.bash' >> ~/.bashrc
--- a/docker/Dockerfile-uv-base
+++ b/docker/Dockerfile-uv-base
@@ -6,7 +6,6 @@ ARG TARGETARCH

 FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder

-ARG TARGETARCH
 ARG PYTHON_VERSION="3.11"
 ARG PYTORCH_VERSION="2.6.0"
 ARG CUDA="126"
@@ -40,18 +39,28 @@ RUN if [ "$TARGETARCH" = "amd64" ]; then \
        uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"; \
    fi

-# Map Python version (e.g., 3.12 -> cp312)
-RUN PYTHON_CP="cp$(echo $PYTHON_VERSION | tr -d '.')" && \
-    # Map PyTorch version (e.g., 2.9.1 -> torch2.9, 2.10.0 -> torch2.10)
-    TORCH_TAG="torch$(echo $PYTORCH_VERSION | grep -oP '^\d+\.\d+')" && \
-    # Map architecture
-    case "$TARGETARCH" in \
-        amd64) ARCH_TAG="x86_64" ;; \
-        arm64) ARCH_TAG="aarch64" ;; \
-        *) echo "Unsupported architecture: $TARGETARCH"; exit 1 ;; \
-    esac && \
-    WHL_VERSION="v0.7.16" && \
-    WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-linux_${ARCH_TAG}.whl" && \
-    wget -nv "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}" && \
-    uv pip install --no-cache-dir "${WHL_FILE}" && \
-    rm "${WHL_FILE}"
+RUN case "$PYTORCH_VERSION" in \
+        2.9.[0-9]*) \
+            if [ "$TARGETARCH" = "amd64" ]; then \
+                if [ "$CUDA" = "128" ]; then \
+                    wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.5.4/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
+                    uv pip install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
+                    rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
+                elif [ "$CUDA" = "130" ]; then \
+                    wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.5.4/flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_x86_64.whl; \
+                    uv pip install --no-cache-dir flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_x86_64.whl; \
+                    rm flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_x86_64.whl; \
+                fi \
+            elif [ "$TARGETARCH" = "arm64" ]; then \
+                if [ "$CUDA" = "128" ]; then \
+                    wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.6.4/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_aarch64.whl; \
+                    uv pip install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_aarch64.whl; \
+                    rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_aarch64.whl; \
+                elif [ "$CUDA" = "130" ]; then \
+                    wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.6.4/flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_aarch64.whl; \
+                    uv pip install --no-cache-dir flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_aarch64.whl; \
+                    rm flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_aarch64.whl; \
+                fi \
+            fi \
+            ;; \
+    esac
--- a/docs/expert_quantization.qmd
+++ b/docs/expert_quantization.qmd
@@ -1,67 +0,0 @@
---
-title: "MoE Expert Quantization"
-description: "Reduce VRAM usage when training MoE model adapters by quantizing expert weights on load"
---
-
-Transformers v5 changed MoE expert layers from `nn.Linear` to fused `nn.Parameter` (3D+ tensors).
-This means `bitsandbytes` can no longer quantize them during model loading, resulting in all expert
-weights being loaded in full bf16 precision and causing massive VRAM usage.
-
-`quantize_moe_experts` solves this by quantizing expert weights during model loading.
-It intercepts the weight loading process, quantizes each expert tensor on the fly, and
-immediately frees the original bf16 tensor from VRAM. This dramatically reduces peak memory.
-For example, GLM-4.7-Flash QLoRA drops from ~127GiB to ~23GiB reserved memory.
-
-## Usage
-
-Enable expert quantization in your Axolotl config:
-
-```yaml
-quantize_moe_experts: true
-```
-
-This works with both 4-bit (QLoRA) and 8-bit (LoRA) quantization.
-
-### Expert LoRA targeting
-
-You can optionally apply LoRA adapters directly to expert weights using `lora_target_parameters`:
-
-```yaml
-lora_target_parameters:
-  - mlp.experts.gate_up_proj
-  - mlp.experts.down_proj
-  # - mlp.gate.weight  # router
-```
-
-::: {.callout-note}
-`lora_dropout` must be `0` when using `lora_target_parameters`.
-:::
-
-## Requirements
-
- Requires (`adapter: lora` and `load_in_8bit: true`) or (`adapter: qlora` and `load_in_4bit: true`)
- CUDA GPUs only (not tested with ROCm or other backends)
- FSDP2 compatible for distributed training
-
-## Limitations
-
- `lora_target_linear` is not compatible with `quantize_moe_experts`. See [Expert LoRA targeting](#expert-lora-targeting) instead.
- `cpu_ram_efficient_loading` hangs / takes long time with FSDP2 + QLoRA.
- Total model parameter count may display incorrectly (trainable param count is correct).
- FSDP LoRA (8-bit) may have a large initial VRAM spike at the first 1-2 steps, which then drops. QLoRA does not exhibit this.
- FSDP2 may use more VRAM per GPU than single GPU training due to not all layers being properly sharded across ranks.
- Model loading takes longer due to on-demand quantization, even on consecutive runs.
- DeepSpeed has not been tested.
-
-## Implementation details
-
-The quantization is applied by patching transformers to intercept weight loading.
-When a 3D+ CUDA tensor with "expert" in its name is detected:
-
- **4-bit mode:** Uses bitsandbytes NF4 parametrization (configurable via `bnb_4bit_quant_type`).
- **8-bit mode:** Uses a custom row-wise int8 parametrization with bitsandbytes dequantization.
-
-The original bf16 tensor is freed immediately after quantization. Multiple sub-patches are applied to
-transformers, PEFT and accelerate FSDP2 to support these parametrized expert modules.
-
-For full implementation details, see [PR #3439](https://github.com/axolotl-ai-cloud/axolotl/pull/3439).
--- a/docs/optimizations.qmd
+++ b/docs/optimizations.qmd
@@ -66,15 +66,6 @@ Provides efficient Triton kernels to improve training speed and reduce memory us

 - **Learn more:** [Custom Integrations - Liger Kernels](custom_integrations.qmd#liger-kernels)

-### Expert Kernels
-
-Optimized kernel implementations for Mixture of Experts (MoE) model training.
-
- **ScatterMoE**: Triton-based MoE kernels with fused LoRA support.
- **SonicMoE**: CUTLASS-based MoE kernels for NVIDIA Hopper and Blackwell GPUs.
-
- **Learn more:** [Custom Integrations - Kernels Integration](custom_integrations.qmd#kernels-integration)
-
 ## Long Context Models

 Techniques to train models on sequences longer than their original context window.
@@ -140,10 +131,3 @@ Simulates quantization effects during training, helping the model adapt and pote
 Allows you to finetune LoRA adapters on top of a model that has already been quantized using the GPTQ method.

 - **Example:** [GPTQ LoRA Example](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/llama-2/gptq-lora.yml)
-
-### MoE Expert Quantization
-
-Quantizes MoE expert weights on load to reduce VRAM when training MoE models with adapters. Required for Transformers v5+ MoE models where experts use fused `nn.Parameter` tensors.
-
- **Config:** `quantize_moe_experts: true`
- **Learn more:** [MoE Expert Quantization](expert_quantization.qmd)
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -40,7 +40,7 @@
    "%%capture\n",
    "# This step can take ~5-10 minutes to install dependencies\n",
    "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
-    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@e8ad129\""
+    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0d4ce4b\""
   ]
  },
  {
--- a/examples/glm45/README.md
+++ b/examples/glm45/README.md
@@ -1,72 +0,0 @@
-# Finetune Z.ai's GLM-4.5-Air with Axolotl
-
-[GLM-4.5-Air](https://huggingface.co/zai-org/GLM-4.5-Air) is a MoE model by Z.ai.
-
-This guide shows how to fine-tune it with Axolotl.
-
-## Getting started
-
-1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
-
-2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
-
-3. Run the finetuning example:
-
-```bash
-# QLoRA (1x80GB @ ~63.4GiB/GPU)
-axolotl train examples/glm45/glm-45-air-qlora.yaml
-```
-
-### Dataset
-
-In addition to the standard OpenAI Messages format, GLM-4.5 supports an extra parameter for thinking in the assistant section.
-
-```json
-{
-    "role": "assistant",
-    "reasoning_content": "...",  // or have </think>...</think> in `content`
-    "content": "..."
-}
-```
-
-Make sure you set the below extra attributes if needed:
-
-```yaml
-datasets:
-  - path: ...
-    type: chat_template
-    message_property_mappings:
-      role: role
-      content: content
-
-    #   tool_calls: tool_calls  # uncomment if using tools
-    #   reasoning_content: reasoning_content  # uncomment if have reasoning
-
-# Uncomment if training on tool role (you would rarely if ever need this)
-# eot_tokens:
-#   - <|observation|>
-```
-
-### Tips
-
- The role name for tools in this template is `tool`.
- You will see this Axolotl WARNING — this is expected as the template does not use EOS:
-  ```
-  EOS token '<|endoftext|>' not found in chat_template. Please check if your template/EOS token is correct.
-  ```
- You can run a full finetuning by removing `adapter: qlora`, `load_in_4bit: true`, and `quantize_moe_experts: true` from the config.
- **LoRA kernels**: Incompatible with this model. Must be explicitly disabled (`lora_*_kernel: false`).
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
-
-## Optimization Guides
-
-Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
-
-## Related Resources
-
- [GLM-4.5-Air on HuggingFace](https://huggingface.co/zai-org/GLM-4.5-Air)
- [GLM-4.5 Blog](https://z.ai/blog/glm-4.5)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
--- a/examples/glm45/glm-45-air-qlora.yaml
+++ b/examples/glm45/glm-45-air-qlora.yaml
@@ -1,64 +0,0 @@
-base_model: zai-org/GLM-4.5-Air
-
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
-load_in_8bit: false
-load_in_4bit: true
-
-quantize_moe_experts: true # important
-
-datasets:
-  - path: fozziethebeat/alpaca_messages_2k_test
-    type: chat_template
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.1
-output_dir: ./outputs/lora-out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 2048
-sample_packing: true
-
-lora_r: 16
-lora_alpha: 8
-lora_dropout: 0
-lora_target_modules:
-  - q_proj
-  - v_proj
-  - k_proj
-  - o_proj
-
-# lora_target_parameters:
-#   - mlp.experts.gate_up_proj
-#   - mlp.experts.down_proj
-
-lora_mlp_kernel: false
-lora_qkv_kernel: false
-lora_o_kernel: false
-
-gradient_accumulation_steps: 2
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/glm47-flash/README.md
+++ b/examples/glm47-flash/README.md
@@ -1,65 +0,0 @@
-# Finetune Z.ai's GLM-4.7-Flash with Axolotl
-
-[GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash) is a 30B-A3B MoE model by Z.ai.
-
-This guide shows how to fine-tune it with Axolotl.
-
-## Getting started
-
-1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
-
-2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
-
-3. Run the finetuning example:
-
-```bash
-# QLoRA
-# - no target experts (1x48GB @ ~24GiB/GPU)
-# - target experts (1x48GB @ ~34GiB/GPU)
-axolotl train examples/glm47-flash/qlora.yaml
-
-# QLoRA FSDP2 no target experts (2x48GB @ ~29GiB/GPU)
-axolotl train examples/glm47-flash/qlora_fsdp.yaml
-```
-
-```bash
-# LoRA
-# - no target experts (1x48GB @ ~35GiB/GPU)
-# - target experts (1x48GB @ OOM. Projected ~45-50GiB/GPU)
-axolotl train examples/glm47-flash/lora.yaml
-
-# LoRA FSDP2 no target experts (2x48GB @ ~43GiB/GPU)
-axolotl train examples/glm47-flash/lora_fsdp.yaml
-```
-
-### MoE Expert Quantization & Expert LoRA
-
-This model quantize expert weights on load. To learn about expert quantization, expert LoRA targeting, and related limitations, see the [MoE Expert Quantization](https://docs.axolotl.ai/docs/expert_quantization.html) docs.
-
-## Limitations
-
- **lora_target_linear**: Incompatible for this model.
- **LoRA kernels**: Incompatible with this model due to non-standard attention projections (DSA). Must be explicitly disabled (`lora_*_kernel: false`).
-
-
-### TIPS
-
- For inference, the official Z.ai team recommends these default settings (most tasks):
-  - `temperature: 1.0`
-  - `top_p: 0.95`
-  - `max_new_tokens: 131072`
- You can run a full finetuning by removing `adapter: qlora`, `load_in_4bit: true`, and `quantize_moe_experts: true` from the config. This is heavy, so we have not tested this.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
-
-## Optimization Guides
-
-Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
-
-## Related Resources
-
- [GLM-4.7-Flash on HuggingFace](https://huggingface.co/zai-org/GLM-4.7-Flash)
- [GLM-4.7 Blog](https://z.ai/blog/glm-4.7)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
--- a/examples/glm47-flash/lora.yaml
+++ b/examples/glm47-flash/lora.yaml
@@ -1,65 +0,0 @@
-base_model: zai-org/GLM-4.7-Flash
-
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
-load_in_8bit: true
-quantize_moe_experts: true
-
-datasets:
-  - path: fozziethebeat/alpaca_messages_2k_test
-    type: chat_template
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.1
-output_dir: ./outputs/glm4.7-flash-lora-8bit-out
-
-adapter: lora
-lora_model_dir:
-
-sequence_len: 2048
-sample_packing: true
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0
-lora_target_modules:
-  - q_proj
-  - v_proj
-  - k_proj
-  - o_proj
-
-# Uncomment to also target MoE expert weights:
-# lora_target_parameters:
-#   - mlp.experts.gate_up_proj
-#   - mlp.experts.down_proj
-
-# LoRA kernels incompatible with DSA attention
-lora_mlp_kernel: false
-lora_qkv_kernel: false
-lora_o_kernel: false
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_torch_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
--- a/examples/glm47-flash/lora_fsdp.yaml
+++ b/examples/glm47-flash/lora_fsdp.yaml
@@ -1,75 +0,0 @@
-base_model: zai-org/GLM-4.7-Flash
-
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
-load_in_8bit: true
-quantize_moe_experts: true
-
-datasets:
-  - path: fozziethebeat/alpaca_messages_2k_test
-    type: chat_template
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.1
-output_dir: ./outputs/glm4.7-flash-lora-8bit-fsdp-out
-
-adapter: lora
-lora_model_dir:
-
-sequence_len: 2048
-sample_packing: true
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0
-lora_target_modules:
-  - q_proj
-  - v_proj
-  - k_proj
-  - o_proj
-
-# Uncomment to also target MoE expert weights:
-# lora_target_parameters:
-#   - mlp.experts.gate_up_proj
-#   - mlp.experts.down_proj
-
-# LoRA kernels incompatible with DSA attention
-lora_mlp_kernel: false
-lora_qkv_kernel: false
-lora_o_kernel: false
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_torch_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: false
-
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
-
-fsdp_config:
-  fsdp_version: 2
-  offload_params: false
-  cpu_ram_efficient_loading: false
-  auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  transformer_layer_cls_to_wrap: Glm4MoeLiteDecoderLayer
-  state_dict_type: FULL_STATE_DICT
-  sharding_strategy: FULL_SHARD
-  reshard_after_forward: true
-  activation_checkpointing: true
--- a/examples/glm47-flash/qlora.yaml
+++ b/examples/glm47-flash/qlora.yaml
@@ -1,65 +0,0 @@
-base_model: zai-org/GLM-4.7-Flash
-
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
-load_in_4bit: true
-quantize_moe_experts: true
-
-datasets:
-  - path: fozziethebeat/alpaca_messages_2k_test
-    type: chat_template
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.1
-output_dir: ./outputs/glm4.7-flash-qlora-out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 2048
-sample_packing: true
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0
-lora_target_modules:
-  - q_proj
-  - v_proj
-  - k_proj
-  - o_proj
-
-# Uncomment to also target MoE expert weights:
-# lora_target_parameters:
-#   - mlp.experts.gate_up_proj
-#   - mlp.experts.down_proj
-
-# LoRA kernels incompatible with DSA attention
-lora_mlp_kernel: false
-lora_qkv_kernel: false
-lora_o_kernel: false
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_torch_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
--- a/examples/glm47-flash/qlora_fsdp.yaml
+++ b/examples/glm47-flash/qlora_fsdp.yaml
@@ -1,75 +0,0 @@
-base_model: zai-org/GLM-4.7-Flash
-
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
-load_in_4bit: true
-quantize_moe_experts: true
-
-datasets:
-  - path: fozziethebeat/alpaca_messages_2k_test
-    type: chat_template
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.1
-output_dir: ./outputs/glm4.7-flash-qlora-fsdp-out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 2048
-sample_packing: true
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0
-lora_target_modules:
-  - q_proj
-  - v_proj
-  - k_proj
-  - o_proj
-
-# Uncomment to also target MoE expert weights:
-# lora_target_parameters:
-#   - mlp.experts.gate_up_proj
-#   - mlp.experts.down_proj
-
-# LoRA kernels incompatible with DSA attention
-lora_mlp_kernel: false
-lora_qkv_kernel: false
-lora_o_kernel: false
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_torch_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: false
-
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
-
-fsdp_config:
-  fsdp_version: 2
-  offload_params: false
-  cpu_ram_efficient_loading: false
-  auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  transformer_layer_cls_to_wrap: Glm4MoeLiteDecoderLayer
-  state_dict_type: FULL_STATE_DICT
-  sharding_strategy: FULL_SHARD
-  reshard_after_forward: true
-  activation_checkpointing: true
--- a/examples/llama-3/3b-qat-mxfp4.yaml
+++ b/examples/llama-3/3b-qat-mxfp4.yaml
@@ -1,65 +0,0 @@
-base_model: meta-llama/Llama-3.2-3B
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: false
-strict: false
-
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-
-liger_rope: true
-liger_rms_norm: true
-liger_glu_activation: true
-liger_layer_norm: true
-liger_fused_linear_cross_entropy: true
-
-datasets:
-  - path: yahma/alpaca-cleaned
-    type: alpaca
-    split: train[:95%]
-
-output_dir: ./outputs/qat_out/
-dataset_prepared_path: ./outputs/dataset_prepared
-
-sequence_len: 2048
-flash_attention: true
-
-qat:
-  activation_dtype: mxfp4
-  weight_dtype: mxfp4
-  group_size: 32
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_checkpointing: true
-activation_offloading: true
-gradient_accumulation_steps: 4
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_torch_8bit
-
-cosine_constant_lr_ratio: 0
-cosine_min_lr_ratio: 1.0
-learning_rate: 2e-5
-save_only_model: true
-bf16: true
-
-resume_from_checkpoint:
-logging_steps: 1
-
-evals_per_epoch: 1
-saves_per_epoch: 1
-
-warmup_ratio: 0.1
-weight_decay: 0.0
-
-special_tokens:
-  pad_token: <|finetune_right_pad_id|>
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/qwen3-next/README.md
+++ b/examples/qwen3-next/README.md
@@ -6,13 +6,30 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations

 ## Getting started

-1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
+1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). Qwen3-Next support is only available on nightly, so you need to either install from main or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).

-2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
+    Here is an example of how to install from main using pip:
+
+```bash
+# Ensure you have PyTorch installed (PyTorch 2.6.0 minimum)
+git clone https://github.com/axolotl-ai-cloud/axolotl.git
+cd axolotl
+
+pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install --no-build-isolation -e '.[flash-attn]'
+
+# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
+python scripts/cutcrossentropy_install.py | sh
+```
+
+2. Install the pinned Transformers commit for Qwen3-Next:
+```bash
+pip3 uninstall -y transformers && pip3 install "git+https://github.com/huggingface/transformers.git@b9282355bea846b54ed850a066901496b19da654"
+```

 3. Install FLA for improved performance
 ```bash
-pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1
+pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.3.2
 ```

 4. Run the finetuning example:
@@ -21,7 +38,7 @@ pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1
 axolotl train examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml
 ```

-This config uses about ~47 GiB (no target experts) and ~71GiB (target experts) VRAM.
+This config uses about 45.62 GiB VRAM.

 Let us know how it goes. Happy finetuning! 🚀

--- a/examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml
+++ b/examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml
@@ -9,8 +9,6 @@ plugins:
 load_in_8bit: false
 load_in_4bit: true

-quantize_moe_experts: true
-
 datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
@@ -27,7 +25,7 @@ sample_packing: true

 lora_r: 16
 lora_alpha: 8
-lora_dropout: 0
+lora_dropout: 0.05
 lora_target_modules:
  - linear_attn.in_proj_ba
  - linear_attn.in_proj_qkvz
@@ -36,19 +34,12 @@ lora_target_modules:
  - shared_expert.down_proj
  - shared_expert.gate_proj
  - shared_expert_gate
+  - mlp.gate
  - q_proj
  - v_proj
  - k_proj
  - o_proj

-# lora_target_parameters:
-#   - mlp.experts.gate_up_proj
-#   - mlp.experts.down_proj
-
-lora_mlp_kernel: false
-lora_qkv_kernel: false
-lora_o_kernel: false
-
 wandb_project:
 wandb_entity:
 wandb_watch:
--- a/examples/qwen3.5/122b-a10b-moe-qlora.yaml
+++ b/examples/qwen3.5/122b-a10b-moe-qlora.yaml
@@ -1,71 +0,0 @@
-base_model: Qwen/Qwen3.5-122B-A10B
-
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-strict: false
-
-chat_template: qwen3_5
-datasets:
-  - path: mlabonne/FineTome-100k
-    type: chat_template
-    split: train[:20%]
-    field_messages: conversations
-    message_property_mappings:
-      role: from
-      content: value
-val_set_size: 0.0
-output_dir: ./outputs/out
-dataset_prepared_path: last_run_prepared
-
-sequence_len: 2048
-sample_packing: true
-
-load_in_4bit: true
-quantize_moe_experts: true
-adapter: qlora
-lora_r: 16
-lora_alpha: 32
-lora_dropout: 0
-lora_target_modules:
-  - q_proj
-  - k_proj
-  - v_proj
-  - o_proj
-
-#lora_target_parameters:
-#   - mlp.experts.gate_up_proj
-#   - mlp.experts.down_proj
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 2
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_torch_4bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: true
-
-
-lora_mlp_kernel: false
-lora_qkv_kernel: false
-lora_o_kernel: false
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 4
-saves_per_epoch: 1
-weight_decay: 0.0
-special_tokens:
--- a/examples/qwen3.5/27b-qlora.yaml
+++ b/examples/qwen3.5/27b-qlora.yaml
@@ -1,72 +0,0 @@
-base_model: Qwen/Qwen3.5-27B
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-# Note: Qwen3.5 is an early-fusion VLM (image+text). This config fine-tunes
-# the text-only path. For multimodal (image+text) fine-tuning, add image
-# columns to your dataset following axolotl's multimodal dataset format.
-
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-strict: false
-
-chat_template: qwen3_5
-datasets:
-  - path: mlabonne/FineTome-100k
-    type: chat_template
-    split: train[:20%]
-    field_messages: conversations
-    message_property_mappings:
-      role: from
-      content: value
-val_set_size: 0.0
-output_dir: ./outputs/out
-dataset_prepared_path: last_run_prepared
-
-sequence_len: 2048
-sample_packing: true
-
-load_in_4bit: true
-adapter: qlora
-lora_r: 16
-lora_alpha: 32
-lora_target_modules:
-  - q_proj
-  - k_proj
-  - v_proj
-  - o_proj
-  - down_proj
-  - up_proj
-  # Uncomment below to also target the linear attention projections.
-  # These use separate in_proj_qkv / in_proj_z / out_proj (Qwen3.5-specific).
-  # - linear_attn.in_proj_qkv
-  # - linear_attn.in_proj_z
-  # - linear_attn.out_proj
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 2
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_torch_4bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: true
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 4
-saves_per_epoch: 1
-weight_decay: 0.0
-special_tokens:
--- a/examples/qwen3.5/35b-a3b-moe-qlora.yaml
+++ b/examples/qwen3.5/35b-a3b-moe-qlora.yaml
@@ -1,70 +0,0 @@
-base_model: Qwen/Qwen3.5-35B-A3B
-
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-strict: false
-
-chat_template: qwen3_5
-datasets:
-  - path: mlabonne/FineTome-100k
-    type: chat_template
-    split: train[:20%]
-    field_messages: conversations
-    message_property_mappings:
-      role: from
-      content: value
-val_set_size: 0.0
-output_dir: ./outputs/out
-dataset_prepared_path: last_run_prepared
-
-sequence_len: 2048
-sample_packing: true
-
-load_in_4bit: true
-quantize_moe_experts: true
-adapter: qlora
-lora_r: 16
-lora_alpha: 32
-lora_dropout: 0
-lora_target_modules:
-  - q_proj
-  - k_proj
-  - v_proj
-  - o_proj
-
-#lora_target_parameters:
-#   - mlp.experts.gate_up_proj
-#   - mlp.experts.down_proj
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 2
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_torch_4bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: true
-
-lora_mlp_kernel: false
-lora_qkv_kernel: false
-lora_o_kernel: false
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 4
-saves_per_epoch: 1
-weight_decay: 0.0
-special_tokens:
--- a/examples/qwen3.5/7b-lora-vision.yaml
+++ b/examples/qwen3.5/7b-lora-vision.yaml
@@ -1,72 +0,0 @@
-base_model: Qwen/Qwen3.5-7B
-processor_type: AutoProcessor
-
-# Qwen3.5-7B and above are early-fusion VLMs (Qwen3_5ForConditionalGeneration).
-# Vision and text tokens are processed together by the same transformer layers.
-# Note: Qwen3.5-2B is a text-only model — the smallest VLM is Qwen3.5-7B.
-
-# These 3 lines are required for vision/multimodal training
-skip_prepare_dataset: true
-remove_unused_columns: false
-sample_packing: false
-
-chat_template: qwen3_5
-datasets:
-  - path: HuggingFaceH4/llava-instruct-mix-vsft
-    type: chat_template
-    split: train[:1%]
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-adapter: lora
-lora_model_dir:
-
-sequence_len: 8192
-pad_to_sequence_len: false
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-# Targets the language model attention and MLP layers.
-# Qwen3.5 is early-fusion: all layers (including those seeing vision tokens) share
-# the same transformer stack, so standard attention targets work for both modalities.
-lora_target_modules:
-  - q_proj
-  - k_proj
-  - v_proj
-  - o_proj
-  - down_proj
-  - up_proj
-  # Uncomment to also target the linear attention (GatedDeltaNet) projections:
-  # - linear_attn.in_proj_qkv
-  # - linear_attn.in_proj_z
-  # - linear_attn.out_proj
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: true
-tf32: true
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-logging_steps: 1
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
--- a/examples/qwen3.5/README.md
+++ b/examples/qwen3.5/README.md
@@ -1,61 +0,0 @@
-# Finetune Qwen3.5 with Axolotl
-
-[Qwen3.5](https://huggingface.co/collections/Qwen/qwen35-68452f3bc6e4b7cfb4e1c803) is a hybrid architecture model series combining Gated DeltaNet linear attention with standard Transformer attention. Models from 7B onwards are early-fusion vision-language models (`Qwen3_5ForConditionalGeneration`), meaning vision and text tokens are processed through the same transformer stack. The 2B variant is text-only.
-
-Available configs:
-
-| Config | Model | Type |
-|---|---|---|
-| `27b-qlora.yaml` | Qwen3.5-27B | Dense VLM, text-only path |
-| `35b-a3b-moe-qlora.yaml` | Qwen3.5-35B-A3B | MoE, text-only path |
-| `122b-a10b-moe-qlora.yaml` | Qwen3.5-122B-A10B | MoE, text-only path |
-| `7b-lora-vision.yaml` | Qwen3.5-7B | Vision+text (multimodal) |
-
-## Getting started
-
-1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
-
-2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
-
-3. Install FLA for sample packing support with the Gated DeltaNet linear attention layers:
-```bash
-pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1
-```
-> FLA is required when `sample_packing: true`. Without it, training raises a `RuntimeError` on packed sequences. Vision configs use `sample_packing: false` so FLA is optional there.
-
-4. Run a finetuning example:
-
-```bash
-# Dense 27B text-only (QLoRA, ~47 GiB VRAM with sample packing)
-axolotl train examples/qwen3.5/27b-qlora.yaml
-
-# MoE 35B-A3B text-only (QLoRA)
-axolotl train examples/qwen3.5/35b-a3b-moe-qlora.yaml
-
-# MoE 122B-A10B text-only (QLoRA)
-axolotl train examples/qwen3.5/122b-a10b-moe-qlora.yaml
-
-# 7B vision+text (LoRA, multimodal dataset)
-axolotl train examples/qwen3.5/7b-lora-vision.yaml
-```
-
-### TIPS
-
- For inference, you can experiment with `temperature: 0.7`, `top_p: 0.8`, `top_k: 20`, and `min_p: 0`.
- You can run a full finetuning by removing `adapter: qlora` and `load_in_4bit: true`. See [Multi-GPU](#optimization-guides) below.
- Read more on loading your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
- For **multimodal** finetuning, set `processor_type: AutoProcessor`, `skip_prepare_dataset: true`, and `remove_unused_columns: false` as shown in `7b-lora-vision.yaml`.
- The Gated DeltaNet linear attention layers (`linear_attn.*`) can optionally be added to `lora_target_modules` — they are commented out by default.
-
-## Optimization Guides
-
- [Optimizations Guide](https://docs.axolotl.ai/docs/optimizations.html)
-
-## Related Resources
-
- [Qwen3.5 Blog](https://qwenlm.github.io/blog/qwen3.5/)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
--- a/examples/trinity/README.md
+++ b/examples/trinity/README.md
@@ -8,15 +8,13 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations

 1. Install Axolotl following the main from the [installation guide](https://docs.axolotl.ai/docs/installation.html#sec-edge-build).

-2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
-
-3. Run the finetuning example:
+2. Run the finetuning example:

    ```bash
    axolotl train examples/trinity/trinity-nano-preview-qlora.yaml
    ```

-This config uses about 24.9 GiB VRAM (w/o CCE).
+This config uses about 24.9 GiB VRAM.

 Let us know how it goes. Happy finetuning! 🚀

@@ -31,6 +29,10 @@ Let us know how it goes. Happy finetuning! 🚀

 Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

+## Limitations
+
+**Cut Cross Entropy (CCE)**: Currently not supported. We plan to include CCE support for Trinity in the near future.
+
 ## Related Resources

 - [Trinity Blog](https://www.arcee.ai/blog/the-trinity-manifesto)
--- a/examples/trinity/trinity-nano-preview-qlora.yaml
+++ b/examples/trinity/trinity-nano-preview-qlora.yaml
@@ -1,4 +1,5 @@
 base_model: arcee-ai/Trinity-Nano-Preview
+trust_remote_code: true
 revision_of_model: 2ee94b0

 # Automatically upload checkpoint and final model to HF
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,18 +12,15 @@ packaging==26.0
 huggingface_hub>=1.1.7
 peft>=0.18.1
 tokenizers>=0.22.1
-transformers==5.3.0
-accelerate==1.13.0
+transformers @ git+https://github.com/winglian/transformers.git@refactor-inner-training-loop-reorder-only
+accelerate==1.12.0
 datasets==4.5.0
-deepspeed>=0.18.6,<0.19.0
-trl==0.29.0
-hf_xet==1.3.2
-kernels==0.12.2
+deepspeed>=0.18.3
+trl==0.28.0
+hf_xet==1.2.0
+kernels==0.11.5

-fla-core==0.4.1
-flash-linear-attention==0.4.1
-
-trackio>=0.16.1
+trackio>=0.13.0
 typing-extensions>=4.15.0

 optimum==1.16.2
--- a/scripts/cutcrossentropy_install.py
+++ b/scripts/cutcrossentropy_install.py
@@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else ""

 print(
    UNINSTALL_PREFIX
-    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@e8ad129"'
+    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0d4ce4b"'
 )
--- a/setup.py
+++ b/setup.py
@@ -26,18 +26,6 @@ def parse_requirements(extras_require_map):
    try:
        xformers_version = [req for req in _install_requires if "xformers" in req][0]
        install_xformers = platform.machine() != "aarch64"
-        if platform.machine() == "aarch64":
-            # skip on ARM64
-            skip_packages = [
-                "torchao",
-                "fla-core",
-                "flash-linear-attention",
-            ]
-            _install_requires = [
-                req
-                for req in _install_requires
-                if re.split(r"[>=<]", req)[0].strip() not in skip_packages
-            ]
        if "Darwin" in platform.system():
            # skip packages not compatible with OSX
            skip_packages = [
--- a/src/axolotl/cli/init.py
+++ b/src/axolotl/cli/init.py
@@ -6,6 +6,5 @@ from axolotl.logging_config import configure_logging

 os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
 os.environ.setdefault("HF_XET_HIGH_PERFORMANCE", "1")
-os.environ.setdefault("TRL_EXPERIMENTAL_SILENCE", "1")

 configure_logging()
--- a/src/axolotl/cli/config.py
+++ b/src/axolotl/cli/config.py
@@ -5,7 +5,7 @@ import os
 import tempfile
 from pathlib import Path
 from tempfile import NamedTemporaryFile
-from typing import Any, Optional, Union
+from typing import Union
 from urllib.parse import urlparse

 import requests
@@ -32,63 +32,6 @@ from axolotl.utils.wandb_ import setup_wandb_env_vars

 LOG = get_logger(__name__)

-
-def _coerce_value(value: Any, existing: Optional[Any] = None) -> Any:
-    """Coerce a string CLI value to its most likely Python type.
-
-    If an existing value is present in the config, its type is used to guide
-    casting.  Otherwise, YAML-style inference is applied: booleans, ints,
-    floats, and None literals are recognised automatically.
-
-    Args:
-        value: The raw value (typically a string from the CLI).
-        existing: An optional existing config value whose type guides coercion.
-
-    Returns:
-        The value cast to the inferred or expected type.
-    """
-    if not isinstance(value, str):
-        return value
-
-    # If the config already has a typed value, cast to match
-    if existing is not None:
-        if isinstance(existing, bool):
-            return value.lower() in ("true", "1", "yes")
-        if isinstance(existing, int):
-            try:
-                return int(value)
-            except (ValueError, TypeError):
-                return value
-        if isinstance(existing, float):
-            try:
-                return float(value)
-            except (ValueError, TypeError):
-                return value
-        # For other types (str, list, dict, etc.), return as-is
-        return value
-
-    # No existing value -- use YAML-style inference
-    lower = value.lower()
-    if lower in ("true", "yes"):
-        return True
-    if lower in ("false", "no"):
-        return False
-    if lower in ("null", "none", "~"):
-        return None
-
-    # Try int then float
-    try:
-        return int(value)
-    except ValueError:
-        pass
-    try:
-        return float(value)
-    except ValueError:
-        pass
-
-    return value
-
-
 API_KEY_FIELDS = {"comet_api_key"}

 TELEMETRY_MANAGER = TelemetryManager.get_instance()
@@ -265,37 +208,13 @@ def load_cfg(
    # If there are any options passed in the cli, if it is something that seems valid
    # from the yaml, then overwrite the value
    cfg_keys = cfg.keys()
-
-    # Separate nested (dot-notation) kwargs from flat kwargs
-    nested_kwargs: dict[str, dict[str, Any]] = {}
-    flat_kwargs: dict[str, Any] = {}
    for key, value in kwargs.items():
-        if "__" in key:
-            parent, child = key.split("__", 1)
-            nested_kwargs.setdefault(parent, {})[child] = value
-        else:
-            flat_kwargs[key] = value
-
-    # Apply flat kwargs
-    for key, value in flat_kwargs.items():
        # If not strict, allow writing to cfg even if it's not in the yml already
        if key in cfg_keys or not cfg.strict:
-            cfg[key] = _coerce_value(value, cfg.get(key))
-
-    # Apply nested kwargs (e.g., trl__beta -> cfg.trl.beta)
-    for parent, children in nested_kwargs.items():
-        if parent not in cfg_keys and cfg.strict:
-            continue
-        if cfg[parent] is None:
-            cfg[parent] = {}
-        if not isinstance(cfg[parent], dict):
-            LOG.warning(
-                "Overwriting non-dict value for '%s' with nested CLI overrides", parent
-            )
-            cfg[parent] = {}
-        for child_key, child_value in children.items():
-            existing_child = cfg[parent].get(child_key)
-            cfg[parent][child_key] = _coerce_value(child_value, existing_child)
+            if isinstance(cfg[key], bool):
+                cfg[key] = bool(value)
+            else:
+                cfg[key] = value

    try:
        device_props = torch.cuda.get_device_properties("cuda")
--- a/src/axolotl/cli/merge_lora.py
+++ b/src/axolotl/cli/merge_lora.py
@@ -26,7 +26,7 @@ def do_merge_lora(*, cfg: DictDefault) -> None:
    model, tokenizer, processor = load_model_and_tokenizer(cfg=cfg)

    LOG.info("Running merge of LoRA with base model...")
-    model = model.merge_and_unload(progressbar=True, safe_merge=True)
+    model = model.merge_and_unload(progressbar=True)
    try:
        model.to(dtype=cfg.torch_dtype)
    except ValueError as e:
@@ -71,7 +71,6 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs) -> None:
        merge_lora=True,
        load_in_8bit=False,
        load_in_4bit=False,
-        quantize_moe_experts=False,
        flash_attention=False,
        context_parallel_size=None,
        deepspeed=None,
--- a/src/axolotl/cli/utils/args.py
+++ b/src/axolotl/cli/utils/args.py
@@ -2,7 +2,7 @@

 import dataclasses
 from functools import wraps
-from types import NoneType, UnionType
+from types import NoneType
 from typing import Any, Callable, Type, Union, get_args, get_origin

 import click
@@ -20,8 +20,7 @@ def _strip_optional_type(field_type: type | str | None):
        If the input type is `Union[T, None]` or `Optional[T]`, returns `T`. Otherwise
            returns the input type unchanged.
    """
-    is_union = get_origin(field_type) is Union or isinstance(field_type, UnionType)
-    if is_union and type(None) in get_args(field_type):
+    if get_origin(field_type) is Union and type(None) in get_args(field_type):
        field_type = next(
            t for t in get_args(field_type) if not isinstance(t, NoneType)
        )
@@ -88,70 +87,10 @@ def add_options_from_dataclass(config_class: Type[Any]) -> Callable:
    return decorator


-def _is_pydantic_model(field_type: type) -> bool:
-    """Check if a type is a Pydantic BaseModel subclass."""
-    try:
-        return isinstance(field_type, type) and issubclass(field_type, BaseModel)
-    except TypeError:
-        return False
-
-
-def _get_field_description(field) -> str | None:
-    """Get description from a Pydantic field, checking both .description and json_schema_extra."""
-    if field.description:
-        return field.description
-    if field.json_schema_extra and isinstance(field.json_schema_extra, dict):
-        return field.json_schema_extra.get("description")
-    return None
-
-
-def _add_nested_model_options(
-    function: Callable, parent_name: str, model_class: Type[BaseModel]
-) -> Callable:
-    """
-    Add Click options for all fields of a nested Pydantic model using dot-notation.
-
-    Note: Only single-level nesting is supported (e.g., ``--trl.beta``).
-    Deeper nesting (e.g., ``--trl.scheduler.warmup``) is not handled.
-
-    Args:
-        function: Click command function to add options to.
-        parent_name: Parent field name (e.g., "trl").
-        model_class: Nested Pydantic model class.
-
-    Returns:
-        Function with added Click options.
-    """
-    for sub_name, sub_field in reversed(model_class.model_fields.items()):
-        sub_type = _strip_optional_type(sub_field.annotation)
-        # Use dot notation: --parent.sub_field
-        cli_name = f"{parent_name}.{sub_name}".replace("_", "-")
-        # The kwarg name uses double-underscore as separator
-        param_name = f"{parent_name}__{sub_name}"
-        description = _get_field_description(sub_field)
-
-        if sub_type is bool:
-            option_name = f"--{cli_name}/--no-{cli_name}"
-            function = click.option(
-                option_name, param_name, default=None, help=description
-            )(function)
-        else:
-            option_name = f"--{cli_name}"
-            click_type = {str: str, int: int, float: float}.get(sub_type)
-            function = click.option(
-                option_name, param_name, default=None, type=click_type, help=description
-            )(function)
-
-    return function
-
-
 def add_options_from_config(config_class: Type[BaseModel]) -> Callable:
    """
    Create Click options from the fields of a Pydantic model.

-    For fields whose type is itself a Pydantic BaseModel, dot-notation CLI options are
-    generated for each sub-field (e.g., ``--trl.beta=0.1``).
-
    Args:
        config_class: PyDantic model with fields to parse from the CLI

@@ -164,11 +103,6 @@ def add_options_from_config(config_class: Type[BaseModel]) -> Callable:
        for name, field in reversed(config_class.model_fields.items()):
            field_type = _strip_optional_type(field.annotation)

-            # Handle nested Pydantic models with dot-notation options
-            if _is_pydantic_model(field_type):
-                function = _add_nested_model_options(function, name, field_type)
-                continue
-
            if field_type is bool:
                field_name = name.replace("_", "-")
                option_name = f"--{field_name}/--no-{field_name}"
--- a/src/axolotl/common/architectures.py
+++ b/src/axolotl/common/architectures.py
@@ -12,14 +12,10 @@ MOE_ARCH_BLOCK = {
    "mixtral": "MixtralSparseMoeBlock",
    "qwen2_moe": "Qwen2MoeSparseMoeBlock",
    "qwen3_moe": "Qwen3MoeSparseMoeBlock",
-    "qwen3_5_moe": "Qwen3_5MoeSparseMoeBlock",
    "qwen3_vl_moe": "Qwen3VLMoeTextSparseMoeBlock",
    "deepseek_v2": "DeepseekV2MoE",
    "deepseek_v3": "DeepseekV3MoE",
    "gpt_oss": "GptOssDecoderLayer",
    "lfm2_moe": "Lfm2MoeSparseMoeBlock",
    "afmoe": "AfmoeMoE",
-    "glm4_moe": "Glm4MoeDecoderLayer",
-    "glm4_moe_lite": "Glm4MoeLiteDecoderLayer",
-    "glm_moe_dsa": "GlmMoeDsaDecoderLayer",
 }
--- a/src/axolotl/core/builders/causal.py
+++ b/src/axolotl/core/builders/causal.py
@@ -122,12 +122,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            ColabCallback = colab_inference_post_train_callback(trainer)
            callbacks.append(ColabCallback(self.cfg))

-        if getattr(self.cfg, "generate_samples", False):
-            from axolotl.utils.callbacks.generation import SFTGenerationCallback
-
-            callbacks.append(SFTGenerationCallback(trainer))
-            LOG.info("SFT sample generation enabled")
-
        callbacks.extend(super().get_post_trainer_create_callbacks(trainer=trainer))
        return callbacks

--- a/src/axolotl/core/builders/rl.py
+++ b/src/axolotl/core/builders/rl.py
@@ -120,6 +120,11 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
        if self.cfg.use_wandb:
            training_args_kwargs["run_name"] = self.cfg.wandb_name

+        if self.cfg.max_prompt_len:
+            training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len
+        else:
+            training_args_kwargs["max_prompt_length"] = self.cfg.sequence_len
+
        training_args_cls = None
        blocklist_args_kwargs = []
        if self.cfg.rl is RLType.SIMPO:
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -26,7 +26,7 @@ from transformers import PreTrainedModel, Trainer
 from transformers.trainer import TRAINING_ARGS_NAME
 from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, has_length, seed_worker
 from transformers.utils import SAFE_WEIGHTS_NAME, is_peft_available
-from trl.experimental.utils import pad_to_length
+from trl.trainer.utils import pad_to_length
 from typing_extensions import override

 from axolotl.core.trainers.mixins import (
@@ -719,20 +719,13 @@ class AxolotlTrainer(
        output_dir = output_dir if output_dir is not None else self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        LOG.info(f"Saving model checkpoint to {output_dir}")
-
-        # fix for Context Parallel save: CP eval invalidates tensor storage
-        # pointers, so clone to CPU to get fresh valid storage for safetensors
-        if (
-            state_dict is not None
-            and self.axolotl_cfg
-            and self.axolotl_cfg.context_parallel_size
-            and self.axolotl_cfg.context_parallel_size > 1
-        ):
+        if state_dict is None:
+            state_dict = self.accelerator.get_state_dict(self.model)
+        if state_dict is not None:
            state_dict = {
-                k: v.detach().cpu() if isinstance(v, torch.Tensor) else v
+                k: v.clone() if isinstance(v, torch.Tensor) else v
                for k, v in state_dict.items()
            }
-
        supported_classes = (
            (PreTrainedModel,)
            if not is_peft_available()
@@ -743,7 +736,6 @@ class AxolotlTrainer(
        if not isinstance(self.model, supported_classes):
            if state_dict is None:
                state_dict = self.model.state_dict()
-
            if isinstance(
                self.accelerator.unwrap_model(self.model, keep_torch_compile=False),
                supported_classes,
@@ -753,7 +745,6 @@ class AxolotlTrainer(
                ).save_pretrained(
                    output_dir,
                    state_dict=state_dict,
-                    is_main_process=self.accelerator.is_main_process,
                )
            else:
                LOG.info(
@@ -781,7 +772,11 @@ class AxolotlTrainer(
            LOG.info(
                "Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`"
            )
-            self.data_collator.tokenizer.save_pretrained(output_dir)
-
+            save_jinja_files = True
+            if self.axolotl_cfg:
+                save_jinja_files = self.axolotl_cfg.tokenizer_save_jinja_files
+            self.data_collator.tokenizer.save_pretrained(
+                output_dir, save_jinja_files=save_jinja_files
+            )
        # Good practice: save your training arguments together with the trained model
        torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
--- a/src/axolotl/core/trainers/dpo/init.py
+++ b/src/axolotl/core/trainers/dpo/init.py
@@ -25,13 +25,17 @@ class DPOStrategy:
        # Label smoothing is not compatible with IPO
        if cfg.rl is RLType.DPO and cfg.dpo_label_smoothing:
            training_args_kwargs["label_smoothing"] = cfg.dpo_label_smoothing
+        training_args_kwargs["max_completion_length"] = None
        training_args_kwargs["max_length"] = cfg.sequence_len
+        training_args_kwargs["generate_during_eval"] = cfg.dpo_generate_during_eval
        if cfg.dpo_use_weighting is not None:
            training_args_kwargs["use_weighting"] = cfg.dpo_use_weighting
        if cfg.dpo_padding_free is not None:
            training_args_kwargs["padding_free"] = cfg.dpo_padding_free
        if cfg.dpo_norm_loss is not None:
            training_args_kwargs["dpo_norm_loss"] = cfg.dpo_norm_loss
+        if cfg.dpo_use_logits_to_keep is not None:
+            training_args_kwargs["use_logits_to_keep"] = cfg.dpo_use_logits_to_keep
        if cfg.dpo_use_liger_kernel is not None:
            training_args_kwargs["use_liger_kernel"] = cfg.dpo_use_liger_kernel
        return training_args_kwargs
--- a/src/axolotl/core/trainers/dpo/trainer.py
+++ b/src/axolotl/core/trainers/dpo/trainer.py
@@ -103,10 +103,10 @@ class AxolotlDPOTrainer(
    ) -> dict[str, torch.Tensor]:
        if self.args.dpo_norm_loss:
            # fmt: off
-            loss_type: list[str] = self.loss_type  # type: ignore[has-type]
+            loss_type: str = self.loss_type  # type: ignore[has-type]
            # fmt: on
            # concatenated_forward handles avg token logprob for ipo case already
-            self.loss_type = ["ipo"]
+            self.loss_type = "ipo"
            res = super().concatenated_forward(model, batch, is_ref_model=is_ref_model)
            self.loss_type = loss_type
            return res
--- a/src/axolotl/core/trainers/mixins/scheduler.py
+++ b/src/axolotl/core/trainers/mixins/scheduler.py
@@ -25,7 +25,7 @@ class SchedulerMixin(Trainer):
    args = None  # type: "AxolotlTrainingArguments"  # type: ignore[name-defined]

    def create_scheduler(
-        self, num_training_steps: int, optimizer: None | torch.optim.Optimizer = None
+        self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
    ) -> LRScheduler:
        """
        Set up the scheduler. The optimizer of the trainer must have been set up either before this method is called or
@@ -45,13 +45,6 @@ class SchedulerMixin(Trainer):
            and self.args.cosine_min_lr_ratio is not None
        )

-        if optimizer is None:
-            if self.optimizer is None:
-                raise ValueError(
-                    "Optimizer must be set before calling create_scheduler or passed as an argument."
-                )
-            optimizer = self.optimizer
-
        # fmt: off
        if self.lr_scheduler is None:  # type: ignore
            # fmt: on
--- a/src/axolotl/integrations/cut_cross_entropy/README.md
+++ b/src/axolotl/integrations/cut_cross_entropy/README.md
@@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh

 - If you are installing from pip
 ```bash
-pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@e8ad129"
+pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0d4ce4b"
 ```

 ## Usage
@@ -31,7 +31,6 @@ plugins:

 ## Supported Models

- afmoe
 - apertus
 - arcee
 - cohere
@@ -52,7 +51,6 @@ plugins:
 - glm4v
 - glm4v_moe
 - glm_image
- glm_moe_dsa
 - gpt_oss
 - granite
 - granitemoe
@@ -78,19 +76,14 @@ plugins:
 - olmo
 - olmo2
 - olmo3
- olmoe
 - phi
 - phi3
 - phi4_multimodal
 - qwen2
- qwen2_5_vl
 - qwen2_moe
 - qwen2_vl
+- qwen2_5_vl
 - qwen3
- qwen3_5
- qwen3_5_text
- qwen3_5_moe
- qwen3_5_moe_text
 - qwen3_moe
 - qwen3_next
 - qwen3_vl
--- a/src/axolotl/integrations/cut_cross_entropy/init.py
+++ b/src/axolotl/integrations/cut_cross_entropy/init.py
@@ -35,7 +35,7 @@ LOG = get_logger(__name__)

 _CCE_INSTALL_MESSAGE = (
    "Please install Axolotl's fork of cut_cross_entropy with transformers support using "
-    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@e8ad129"`'
+    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0d4ce4b"`'
 )


--- a/src/axolotl/integrations/kernels/README.md
+++ b/src/axolotl/integrations/kernels/README.md
@@ -10,7 +10,7 @@ class ExpertsInterface(GeneralInterface):
    }
 ```

-In our custom integration, we add support for **ScatterMoE** and **SonicMoE**, which are more efficient and faster than `grouped_mm`.
+In our custom integration, we add support for **ScatterMoE**, which is even more efficient and faster than `grouped_mm`.

 ## Usage

@@ -21,57 +21,23 @@ plugins:
  - axolotl.integrations.kernels.KernelsPlugin

 use_kernels: true
-
-# Choose one (mutually exclusive):
 use_scattermoe: true
-# OR
-use_sonicmoe: true
 ```

-**Important:** Setting `experts_implementation` is incompatible with custom kernel options.
-
-### SonicMoE installation
-
-**Prerequisites:**
- NVIDIA Hopper (H100, H200) or Blackwell (B200, GB200) GPU
- CUDA 12.9+ (13.0+ for B300)
- PyTorch 2.7+ (2.9.1 recommended)
- For B300: Triton 3.6.0
-
-```bash
-pip install --ignore-requires-python --no-deps "sonic-moe @ git+https://github.com/Dao-AILab/sonic-moe.git@116e2df0a41874f77fa0ad269ce7df3f0cfcb956" && pip install nvidia-cutlass-dsl==4.4.0 quack-kernels==0.2.5
-```
-
-See the [SonicMoE installation guide](https://github.com/Dao-AILab/sonic-moe?tab=readme-ov-file#-installation) for the latest prerequisite details.
-
-**Note:** Blackwell support is in upstream beta. On Blackwell GPUs, Axolotl automatically sets `USE_QUACK_GEMM=1` to enable the Blackwell kernels.
+**Important:** Setting `experts_implementation` is incompatible with `use_scattermoe`.

 ## How It Works

 The `KernelsPlugin` runs before model loading and:

-### ScatterMoE
-1. Registers the ScatterMoE kernel from the local `libs/scattermoe_lora` package (includes fused LoRA support via Triton kernels).
+1. Registers the ScatterMoE kernel from the [`axolotl-ai-co/scattermoe`](https://huggingface.co/axolotl-ai-co/scattermoe) Hub repo.
 2. Patches the model's `SparseMoeBlock` forward method with the optimized ScatterMoE implementation.

-### SonicMoE
-1. Resolves the model's MoE block class(es) from `constants.py`.
-2. Patches the forward method with SonicMoE's optimized kernels and registers a weight converter for the interleaved gate/up projection format.
-3. Supports both softmax->topk and sigmoid->topk routing strategies.
-
-Both paths use the shared `resolve_moe_block_classes` utility in `constants.py` for model-type-to-class resolution.
-
-#### Supported Models
-
-See `constants.py` for the full list of supported model types (Qwen2-MoE, Qwen3-MoE, OLMoE, Mixtral, DeepSeek-V3, GLM-MoE, MiniMax, etc.).
+This works for any MoE model in transformers that uses a `SparseMoeBlock` class (Mixtral, Qwen2-MoE, OLMoE, etc.).

 ## Limitations

-ScatterMoE uses a softmax -> topk routing, so results may be different for some model architectures as baseline (GPT-OSS, etc). Incompatible with `GLM_MOE_DSA` (GLM 5) and `GLM4_MOE_LITE` (GLM 4.7 Flash) at the moment.
-
-SonicMoE supports both softmax->topk and sigmoid->topk routing, covering a wider range of architectures.
-
-ScatterMoE does not work for GLM4.7 Flash (glm4_moe_lite) atm.
+ScatterMoE uses softmax -> topk routing, so results may differ from the baseline for some model architectures (GPT-OSS, GLM_MOE_DSA).

 ## Note on MegaBlocks

--- a/src/axolotl/integrations/kernels/args.py
+++ b/src/axolotl/integrations/kernels/args.py
@@ -6,18 +6,7 @@ LOG = get_logger(__name__)


 class KernelsArgs(BaseModel):
-    use_scattermoe: bool | None = None
-    use_sonicmoe: bool | None = None
-
-    @model_validator(mode="before")
-    @classmethod
-    def check_mutually_exclusive(cls, data):
-        if data.get("use_scattermoe") and data.get("use_sonicmoe"):
-            raise ValueError(
-                "Cannot use both ScatterMoE and SonicMoE simultaneously. "
-                "Please set only one of `use_scattermoe` or `use_sonicmoe` to true."
-            )
-        return data
+    use_scattermoe: bool | None = True

    @model_validator(mode="before")
    @classmethod
@@ -44,16 +33,3 @@ class KernelsArgs(BaseModel):
            data["experts_implementation"] = "eager"

        return data
-
-    @model_validator(mode="before")
-    @classmethod
-    def disable_mlp_kernel(cls, data):
-        if data.get("use_scattermoe") is True or data.get("use_sonicmoe") is True:
-            if data.get("lora_mlp_kernel") is True:
-                LOG.warning(
-                    "Disabling lora_mlp_kernel when using custom MoE kernels due to compatibility issues."
-                )
-                data["lora_mlp_kernel"] = False
-            data["mlp_kernel"] = False
-
-        return data
--- a/src/axolotl/integrations/kernels/constants.py
+++ b/src/axolotl/integrations/kernels/constants.py
@@ -1,68 +0,0 @@
-"""
-Supported MoE block mappings for kernel integrations.
-
-Maps model_type to the SparseMoeBlock class name(s) in transformers.
-Used by both ScatterMoE and SonicMoE kernel paths.
-
-Values can be a single class name (str) or a list of class names for models
-with multiple MoE block types (e.g. qwen3_omni_moe has Thinker + Talker).
-"""
-
-import importlib
-
-SPARSE_MOE_BLOCK = {
-    # softmax -> topk routing
-    "qwen2_moe": "Qwen2MoeSparseMoeBlock",
-    "qwen3_moe": "Qwen3MoeSparseMoeBlock",
-    "qwen3_5_moe": "Qwen3_5MoeSparseMoeBlock",
-    "qwen3_next": "Qwen3NextSparseMoeBlock",
-    "qwen3_vl_moe": "Qwen3VLMoeTextSparseMoeBlock",
-    # qwen3_omni_moe: Thinker (standard) + Talker (shared experts + shared_expert_gate)
-    "qwen3_omni_moe": [
-        "Qwen3OmniMoeThinkerTextSparseMoeBlock",
-        "Qwen3OmniMoeTalkerTextSparseMoeBlock",
-    ],
-    "olmoe": "OlmoeSparseMoeBlock",
-    "mixtral": "MixtralSparseMoeBlock",
-    "minimax": "MiniMaxSparseMoeBlock",
-    # sigmoid -> topk routing (with group-based expert selection)
-    "glm_moe_dsa": "GlmMoeDsaMoE",
-    "deepseek_v3": "DeepseekV3MoE",
-    "glm4_moe": "Glm4MoeMoE",
-    "glm4_moe_lite": "Glm4MoeLiteMoE",
-    "glm4v_moe": "Glm4vMoeTextMoE",
-    # sigmoid -> topk routing (no group selection)
-    "minimax_m2": "MiniMaxM2SparseMoeBlock",
-    # Models below need custom routing (not yet implemented):
-    # "ernie4_5_moe": "Ernie4_5_MoeSparseMoeBlock",  # softmax->topk, e_score_correction_bias between softmax and topk
-    # "deepseek_v2": "DeepseekV2Moe",  # softmax->topk, group_limited_greedy, different attr names (num_group)
-    # "hunyuan_v1_moe": "HunYuanMoEV1Moe",  # softmax->topk, gate.wg (not gate.weight), scatter routing
-    # "gpt_oss": "GptOssMLP",  # topk->softmax, transposed layout [E,H,2*I], custom GLU, expert biases
-}
-
-
-def resolve_moe_block_classes(model_type: str):
-    """Resolve all MoE block classes from transformers for the given model type.
-
-    Returns a list of classes (one for most models, multiple for models with
-    distinct MoE block types like qwen3_omni_moe).
-    """
-    entry = SPARSE_MOE_BLOCK.get(model_type)
-    if entry is None:
-        raise ValueError(
-            f"Unsupported MoE model type '{model_type}'. "
-            f"Supported types: {list(SPARSE_MOE_BLOCK.keys())}"
-        )
-
-    cls_names = entry if isinstance(entry, list) else [entry]
-    module_path = f"transformers.models.{model_type}.modeling_{model_type}"
-    module = importlib.import_module(module_path)
-
-    classes = []
-    for cls_name in cls_names:
-        moe_cls = getattr(module, cls_name, None)
-        if moe_cls is None:
-            raise ValueError(f"Could not find class '{cls_name}' in '{module_path}'")
-        classes.append(moe_cls)
-
-    return classes
--- a/src/axolotl/integrations/kernels/libs/init.py
+++ b/src/axolotl/integrations/kernels/libs/init.py
--- a/src/axolotl/integrations/kernels/libs/scattermoe_lora/init.py
+++ b/src/axolotl/integrations/kernels/libs/scattermoe_lora/init.py
@@ -1,18 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# Copyright (c) Axolotl AI
-# Licensed under the Apache License, Version 2.0
-
-from . import layers
-from .lora_ops import ParallelExperts
-from .parallel_experts import flatten_sort_count, parallel_linear
-from .parallel_linear_lora import ScatterMoELoRA, parallel_linear_lora
-
-__all__ = [
-    "layers",
-    "ParallelExperts",
-    "flatten_sort_count",
-    "parallel_linear",
-    "ScatterMoELoRA",
-    "parallel_linear_lora",
-    "lora_ops",
-]
--- a/src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/init.py
+++ b/src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/init.py
@@ -1,12 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-#
-# Original work Copyright (c) Shawn Tan and ScatterMoE Contributors
-# Adapted from https://github.com/shawntan/scattermoe
-# See https://github.com/shawntan/scattermoe/blob/main/LICENSE
-#
-# Modifications and LoRA adaptation Copyright (c) Axolotl AI
-# Licensed under the Apache License, Version 2.0
-
-from . import lora_ops, ops
-
-__all__ = ["ops", "lora_ops"]
--- a/src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/lora_ops.py
+++ b/src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/lora_ops.py
--- a/src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/ops.py
+++ b/src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/ops.py
@@ -1,645 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# Adapted from https://github.com/shawntan/scattermoe
-# Copyright (c) Shawn Tan and ScatterMoE Contributors
-# Licensed under the Apache License, Version 2.0
-# See https://github.com/shawntan/scattermoe/blob/main/LICENSE
-
-from typing import Optional
-
-import torch
-import triton
-import triton.language as tl
-
-BLOCK_M = 128
-ALLOW_TF32 = True
-
-
-@triton.jit
-def _compute_expert_block(
-    E_idx,
-    E_mask,
-    M_in_idx,
-    N_block,
-    N_mask,
-    X_ptr,
-    stride_xm,
-    stride_xk,
-    W_ptr,
-    stride_we,
-    stride_wk,
-    stride_wn,
-    K,
-    acc,
-    no_k_mask,
-    BLOCK_K,
-    allow_tf32=True,
-):
-    K_block = tl.arange(0, BLOCK_K)
-    X_blk_ptrs = X_ptr + M_in_idx[:, None] * stride_xm + K_block[None, :] * stride_xk
-    W_blk_ptrs = (
-        W_ptr
-        + K_block[:, None] * stride_wk
-        + N_block[None, :] * stride_wn
-        + E_idx * stride_we
-    )
-    iters = tl.cdiv(K, BLOCK_K)
-
-    for K_block_id in range(iters):
-        if no_k_mask:
-            x = tl.load(X_blk_ptrs, mask=E_mask[:, None])
-            w = tl.load(W_blk_ptrs, mask=N_mask[None, :])
-        else:
-            K_mask = (K_block_id * BLOCK_K + K_block) < K
-            x = tl.load(X_blk_ptrs, mask=E_mask[:, None] & K_mask[None, :])
-            w = tl.load(W_blk_ptrs, mask=K_mask[:, None] & N_mask[None, :])
-
-        X_blk_ptrs += BLOCK_K * stride_xk
-        W_blk_ptrs += BLOCK_K * stride_wk
-        acc = tl.dot(x, w, acc, allow_tf32=allow_tf32)
-    return acc
-
-
-def _scatter2scatter_configs():
-    return [
-        triton.Config({"BLOCK_N": 128, "BLOCK_K": 32}, num_stages=4, num_warps=4),
-    ]
-
-
-@triton.autotune(
-    configs=_scatter2scatter_configs(),
-    key=["M", "N", "K"],
-)
-@triton.heuristics(
-    {
-        "NO_K_MASK": lambda args: (args["K"] % args["BLOCK_K"]) == 0,
-        "NO_N_MASK": lambda args: (args["N"] % args["BLOCK_N"]) == 0,
-    }
-)
-@triton.jit
-def _scatter2scatter(
-    X_ptr,
-    stride_xm: tl.constexpr,
-    stride_xk: tl.constexpr,
-    W_ptr,
-    stride_we,
-    stride_wk: tl.constexpr,
-    stride_wn: tl.constexpr,
-    Y_ptr,
-    stride_ym: tl.constexpr,
-    stride_yn: tl.constexpr,
-    B_ptr,
-    stride_be: tl.constexpr,
-    stride_bn: tl.constexpr,
-    grouped_idx_ptr,
-    expert_idxs_ptr,
-    # block_start_idx_ptr,
-    FAN_OUT: tl.constexpr,
-    M,
-    K: tl.constexpr,
-    N: tl.constexpr,
-    E: tl.constexpr,
-    BLOCK_M: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    BLOCK_K: tl.constexpr,
-    ACC_TYPE: tl.constexpr,
-    # OUT_M,
-    allow_tf32: tl.constexpr,
-    x_grouped: tl.constexpr,
-    y_grouped: tl.constexpr,
-    NO_K_MASK: tl.constexpr,
-    NO_N_MASK: tl.constexpr,
-):
-    pid = tl.program_id(axis=0)
-
-    N_BLOCK_COUNT = tl.cdiv(N, BLOCK_N)
-    M_block_id = pid // N_BLOCK_COUNT
-    N_block_id = pid % N_BLOCK_COUNT
-
-    M_block = M_block_id * BLOCK_M + tl.arange(0, BLOCK_M)
-    N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
-    N_mask = N_block < N
-    M_boundary_mask = M_block < (FAN_OUT * M)
-    E_idxs = tl.load(expert_idxs_ptr + M_block, mask=M_boundary_mask, other=E)
-
-    no_k_mask = K % BLOCK_K == 0
-
-    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
-    E_first_idx = tl.min(E_idxs)
-    E_last_idx = tl.minimum(tl.max(E_idxs), E - 1)
-    M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int32)
-    for E_idx in range(E_first_idx, E_last_idx + 1):
-        E_mask = E_idxs == E_idx
-        E_M_idx = M_idx
-        if x_grouped:
-            M_in_idx = M_block
-        else:
-            M_in_idx = E_M_idx // FAN_OUT
-        acc = _compute_expert_block(
-            E_idx,
-            E_mask,
-            M_in_idx,
-            N_block,
-            N_mask,
-            X_ptr,
-            stride_xm,
-            stride_xk,
-            W_ptr,
-            stride_we,
-            stride_wk,
-            stride_wn,
-            K,
-            acc,
-            no_k_mask,
-            BLOCK_K,
-            allow_tf32=allow_tf32,
-        )
-
-    if B_ptr is not None:
-        B_blk_ptrs = B_ptr + E_idxs[:, None] * stride_be + N_block[None, :] * stride_bn
-        acc += tl.load(B_blk_ptrs, mask=M_boundary_mask[:, None] & N_mask[None, :])
-
-    if y_grouped:
-        M_out_idx = M_block
-    else:
-        M_out_idx = M_idx
-    Y_blk_ptrs = Y_ptr + (M_out_idx[:, None] * stride_ym + N_block[None, :] * stride_yn)
-    tl.store(Y_blk_ptrs, acc, mask=M_boundary_mask[:, None] & N_mask[None, :])
-
-
-def scatter2scatter(
-    X,
-    W,
-    sorted_expert_idxs,
-    sorted_scattered_idxs,
-    k,
-    b=None,
-    x_grouped=False,
-    y_grouped=False,
-    out=None,
-):
-    assert sorted_scattered_idxs.size(0) == sorted_expert_idxs.size(0)
-    assert sorted_scattered_idxs.size(0) == X.size(0) * k
-    # Pre-kernel setup
-    y_dim = W.size(-1)
-    L_scattered = sorted_expert_idxs.size(0)
-    if out is None:
-        output = torch.empty((L_scattered, y_dim), device=X.device, dtype=X.dtype)
-    else:
-        assert out.size(0) == L_scattered and out.size(1) == y_dim
-        output = out
-
-    scatter2scatter_compileable(
-        output,
-        W,
-        X,
-        k,
-        sorted_expert_idxs,
-        sorted_scattered_idxs,
-        b,
-        x_grouped,
-        y_grouped,
-    )
-    return output
-
-
-@torch.library.custom_op("scattermoe::scatter2scatter", mutates_args={"output"})
-def scatter2scatter_compileable(
-    output: torch.Tensor,
-    W: torch.Tensor,
-    X: torch.Tensor,
-    k: int,
-    sorted_expert_idxs: torch.Tensor,
-    sorted_scattered_idxs: torch.Tensor,
-    b: Optional[torch.Tensor],
-    x_grouped: bool,
-    y_grouped: bool,
-) -> None:
-    def grid(META):
-        grid_num = (
-            triton.cdiv(sorted_expert_idxs.size(0), META["BLOCK_M"])
-            * triton.cdiv(META["N"], META["BLOCK_N"]),
-        )
-        return grid_num
-
-    if b is None:
-        b = None
-        stride_be = stride_bn = 0
-    else:
-        stride_be, stride_bn = b.stride()
-
-    _scatter2scatter[grid](
-        # X_ptr, stride_xm, stride_xk,
-        X,
-        X.stride(0),
-        X.stride(1),
-        # W_ptr, stride_we, stride_wk, stride_wn,
-        W,
-        W.stride(0),
-        W.stride(1),
-        W.stride(2),
-        # Y_ptr, stride_ym, stride_yn,
-        output,
-        output.stride(0),
-        output.stride(1),
-        # B_ptr, stride_be, stride_bn
-        b,
-        stride_be,
-        stride_bn,
-        grouped_idx_ptr=sorted_scattered_idxs,
-        expert_idxs_ptr=sorted_expert_idxs,
-        # block_start_idx_ptr=padded_block_idxs,
-        FAN_OUT=k,
-        M=X.size(0),
-        K=X.size(1),
-        N=output.size(1),
-        E=W.size(0),
-        BLOCK_M=BLOCK_M,
-        ACC_TYPE=tl.float32,
-        allow_tf32=ALLOW_TF32,
-        x_grouped=x_grouped,
-        y_grouped=y_grouped,
-    )
-
-
-def _config_XtY():
-    return [
-        triton.Config(
-            {"BLOCK_N": 128, "BLOCK_K": 128, "BLOCK_M": 32}, num_stages=4, num_warps=4
-        ),
-    ]
-
-
-def group_bwd_W(DY, X, expert_offsets, E, has_bias=False):
-    DWt = torch.zeros((E, DY.size(-1), X.size(-1)), device=DY.device, dtype=DY.dtype)
-    DW = DWt.permute(0, 2, 1)
-    if has_bias:
-        Db = torch.zeros((E, DY.size(-1)), device=DY.device, dtype=DY.dtype)
-    else:
-        Db = None
-    groupXtY_compileable(E, DW, Db, DY, X, expert_offsets)
-    return DW, Db
-
-
-@torch.library.custom_op("scattermoe::groupXtY", mutates_args={"DW", "Db"})
-def groupXtY_compileable(
-    E: int,
-    DW: torch.Tensor,
-    Db: Optional[torch.Tensor],
-    DY: torch.Tensor,
-    X: torch.Tensor,
-    expert_offsets: torch.Tensor,
-) -> None:
-    def grid(META):
-        grid = (
-            E * triton.cdiv(META["K"], META["BLOCK_K"]),
-            triton.cdiv(META["N"], META["BLOCK_N"]),
-        )
-        return grid
-
-    if Db is None:
-        stride_dbe = 0
-        stride_dbn = 0
-    else:
-        stride_dbe, stride_dbn = Db.stride()
-
-    _groupXtY[grid](
-        # DY_ptr, stride_dym, stride_dyk,
-        DY,
-        DY.stride(0),
-        DY.stride(1),
-        # X_ptr, stride_xm, stride_xn,
-        X,
-        X.stride(0),
-        X.stride(1),
-        # DW_ptr, stride_dwe, stride_dwk, stride_dwn,
-        DW,
-        DW.stride(0),
-        DW.stride(1),
-        DW.stride(2),
-        # Db_ptr, stride_dwe, stride_dbn,
-        Db,
-        stride_dbe,
-        stride_dbn,
-        # expert_offsets_ptr,
-        expert_offsets,
-        # K: tl.constexpr, N: tl.constexpr,
-        M=DY.size(0),
-        N=DY.size(-1),
-        K=X.size(-1),
-        # ACC_TYPE: tl.constexpr,
-        ACC_TYPE=tl.float32,
-        allow_tf32=ALLOW_TF32,
-    )
-
-
-@triton.autotune(
-    configs=_config_XtY(),
-    key=["M", "N", "K"],
-)
-@triton.heuristics(
-    {
-        "NO_K_MASK": lambda args: (args["K"] % args["BLOCK_K"]) == 0,
-        "NO_N_MASK": lambda args: (args["N"] % args["BLOCK_N"]) == 0,
-    }
-)
-@triton.jit
-def _groupXtY(
-    DY_ptr,
-    stride_dym,
-    stride_dyk,
-    X_ptr,
-    stride_xm,
-    stride_xn,
-    DW_ptr,
-    stride_dwe,
-    stride_dwk,
-    stride_dwn,
-    Db_ptr,
-    stride_dbe,
-    stride_dbn,
-    expert_offsets_ptr,
-    M,
-    K: tl.constexpr,
-    N: tl.constexpr,
-    BLOCK_M: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    BLOCK_K: tl.constexpr,
-    ACC_TYPE: tl.constexpr,
-    allow_tf32: tl.constexpr,
-    NO_K_MASK: tl.constexpr,
-    NO_N_MASK: tl.constexpr,
-):
-    pid0 = tl.program_id(axis=0)
-    pid1 = tl.program_id(axis=1)
-    num0 = tl.num_programs(0)
-    num1 = tl.num_programs(1)
-    # pid1, pid0 = tl.swizzle2d(pid1, pid0, num1, num0, 128)
-    pid0, pid1 = tl.swizzle2d(pid0, pid1, num0, num1, 4)
-
-    K_BLOCK_COUNT = tl.cdiv(K, BLOCK_K)
-    E_idx = pid0 // K_BLOCK_COUNT
-    K_block_id = pid0 % K_BLOCK_COUNT
-    N_block_id = pid1
-
-    if E_idx == 0:
-        start_idx = 0
-    else:
-        start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32)
-    end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32)
-
-    if end_idx > start_idx:
-        M_block = tl.max_contiguous(start_idx + tl.arange(0, BLOCK_M), BLOCK_M)
-
-        K_block = K_block_id * BLOCK_K + tl.arange(0, BLOCK_K)
-        K_mask = K_block < K
-        K_block = tl.max_contiguous(tl.multiple_of(K_block % K, BLOCK_K), BLOCK_K)
-
-        N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
-        N_mask = N_block < N
-        N_block = tl.max_contiguous(tl.multiple_of(N_block % N, BLOCK_N), BLOCK_N)
-
-        M_idxs = M_block
-        xt_blk_ptrs = X_ptr + K_block[:, None] * stride_xn + M_idxs[None, :] * stride_xm
-        dy_blk_ptrs = (
-            DY_ptr + M_idxs[:, None] * stride_dym + N_block[None, :] * stride_dyk
-        )
-        if (Db_ptr is not None) and (K_block_id == 0):
-            _xty_and_bias(
-                E_idx,
-                start_idx,
-                end_idx,
-                M_block,
-                K_block,
-                K_mask,
-                N_block,
-                N_mask,
-                dy_blk_ptrs,
-                stride_dym,
-                xt_blk_ptrs,
-                stride_xm,
-                DW_ptr,
-                stride_dwe,
-                stride_dwk,
-                stride_dwn,
-                Db_ptr,
-                stride_dbe,
-                stride_dbn,
-                BLOCK_M,
-                BLOCK_N,
-                BLOCK_K,
-                ACC_TYPE,
-                allow_tf32,
-                NO_K_MASK,
-                NO_N_MASK,
-                compute_bias=True,
-            )
-        else:
-            _xty_and_bias(
-                E_idx,
-                start_idx,
-                end_idx,
-                M_block,
-                K_block,
-                K_mask,
-                N_block,
-                N_mask,
-                dy_blk_ptrs,
-                stride_dym,
-                xt_blk_ptrs,
-                stride_xm,
-                DW_ptr,
-                stride_dwe,
-                stride_dwk,
-                stride_dwn,
-                Db_ptr,
-                stride_dbe,
-                stride_dbn,
-                BLOCK_M,
-                BLOCK_N,
-                BLOCK_K,
-                ACC_TYPE,
-                allow_tf32,
-                NO_K_MASK,
-                NO_N_MASK,
-                compute_bias=False,
-            )
-
-
-@triton.jit
-def _xty_and_bias(
-    E_idx,
-    start_idx,
-    end_idx,
-    M_block,
-    K_block,
-    K_mask,
-    N_block,
-    N_mask,
-    dy_blk_ptrs,
-    stride_dym,
-    xt_blk_ptrs,
-    stride_xm,
-    DW_ptr,
-    stride_dwe,
-    stride_dwk,
-    stride_dwn,
-    Db_ptr,
-    stride_dbe,
-    stride_dbn,
-    BLOCK_M,
-    BLOCK_N,
-    BLOCK_K,
-    ACC_TYPE,
-    allow_tf32,
-    NO_K_MASK,
-    NO_N_MASK,
-    compute_bias: tl.constexpr,
-):
-    if compute_bias:
-        db_acc = tl.zeros((BLOCK_N,), dtype=ACC_TYPE)
-    else:
-        db_acc = None
-
-    acc = tl.zeros((BLOCK_K, BLOCK_N), dtype=ACC_TYPE)
-    iters = tl.cdiv(end_idx - start_idx, BLOCK_M)
-    for i in range(0, iters):
-        M_mask = (i * BLOCK_M + M_block) < end_idx
-        if NO_K_MASK:
-            xt = tl.load(xt_blk_ptrs, mask=M_mask[None, :])
-        else:
-            xt = tl.load(xt_blk_ptrs, mask=K_mask[:, None] & M_mask[None, :])
-        if NO_N_MASK:
-            dy = tl.load(dy_blk_ptrs, mask=M_mask[:, None])
-        else:
-            dy = tl.load(dy_blk_ptrs, mask=M_mask[:, None] & N_mask[None, :])
-
-        acc += tl.dot(xt, dy, out_dtype=ACC_TYPE, allow_tf32=allow_tf32)
-
-        xt_blk_ptrs += BLOCK_M * stride_xm
-        dy_blk_ptrs += BLOCK_M * stride_dym
-
-        if compute_bias:
-            db_acc += tl.sum(dy, axis=0)
-
-    DW_blk_ptrs = (
-        DW_ptr
-        + E_idx * stride_dwe
-        + K_block[:, None] * stride_dwk
-        + N_block[None, :] * stride_dwn
-    )
-    acc = acc.to(DW_blk_ptrs.dtype.element_ty)
-    tl.store(DW_blk_ptrs, acc, mask=K_mask[:, None] & N_mask[None, :])
-    if compute_bias:
-        Db_blk_ptrs = Db_ptr + E_idx * stride_dbe + N_block * stride_dbn
-        tl.store(Db_blk_ptrs, db_acc, mask=N_mask)
-
-
-def _config_grouping():
-    return [
-        triton.Config({"BLOCK_N": 256, "BLOCK_K": 128}, num_stages=4, num_warps=4),
-        # triton.Config({'BLOCK_N': 128, 'BLOCK_K': 64}, num_stages=4, num_warps=4),
-        # triton.Config({'BLOCK_N': 64, 'BLOCK_K': 32}, num_stages=4, num_warps=4),
-    ]
-
-
-def group(A, sorted_expert_idxs, coeff=None, fan_out=1, out=None):
-    N = sorted_expert_idxs.size(0)
-    K = A.size(1)
-    assert A.size(0) * fan_out == N
-    if out is not None:
-        Y = out
-    else:
-        Y = torch.empty((N, K), dtype=A.dtype, device=A.device)
-    group_compileable(A, K, N, Y, coeff, coeff is not None, fan_out, sorted_expert_idxs)
-    return Y
-
-
-@torch.library.custom_op("scattermoe::group", mutates_args={"Y"})
-def group_compileable(
-    A: torch.Tensor,
-    K: int,
-    N: int,
-    Y: torch.Tensor,
-    coeff: Optional[torch.Tensor],
-    has_coeff: bool,
-    fan_out: int,
-    sorted_expert_idxs: torch.Tensor,
-) -> None:
-    def grid(META):
-        grid_num = (triton.cdiv(META["N"], META["BLOCK_N"]),)
-        return grid_num
-
-    _group[grid](
-        # A_ptr, stride_an, stride_ai,
-        A,
-        A.stride(0),
-        A.stride(1),
-        has_coeff,
-        coeff,
-        fan_out,
-        # Y_ptr, stride_yn, stride_yk,
-        Y,
-        Y.stride(0),
-        Y.stride(1),
-        # grouped_idx_ptr,
-        sorted_expert_idxs,
-        # N: tl.constexpr, K: tl.constexpr,
-        N,
-        K,
-    )
-
-
-@triton.autotune(configs=_config_grouping(), key=["K"])
-@triton.heuristics({"NO_K_MASK": lambda args: (args["K"] % args["BLOCK_K"]) == 0})
-@triton.jit
-def _group(
-    src_ptr,
-    stride_sn,
-    stride_sk,
-    has_coeff: tl.constexpr,
-    coeff_ptr,
-    FAN_OUT: tl.constexpr,
-    tgt_ptr,
-    stride_tn,
-    stride_ti,
-    grouped_idx_ptr,
-    N,
-    K: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    BLOCK_K: tl.constexpr,
-    NO_K_MASK: tl.constexpr,
-):
-    pid = tl.program_id(axis=0)
-
-    N_block_id = pid
-    N_blk = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
-    N_mask = N_blk < N
-    N_blk = tl.max_contiguous(tl.multiple_of(N_blk % N, BLOCK_N), BLOCK_N)
-    N_idx = tl.load(grouped_idx_ptr + N_blk, mask=N_mask, other=0)
-
-    K_blk = tl.arange(0, BLOCK_K)
-    src_blk_ptrs = (
-        src_ptr + (N_idx // FAN_OUT)[:, None] * stride_sn + K_blk[None, :] * stride_sk
-    )
-    tgt_blk_ptrs = tgt_ptr + N_blk[:, None] * stride_tn + K_blk[None, :] * stride_ti
-
-    if has_coeff:
-        c = tl.load(coeff_ptr + N_idx, mask=N_mask)[:, None]
-
-    iters = tl.cdiv(K, BLOCK_K)
-    for i in range(0, iters):
-        if NO_K_MASK or i < iters - 1:
-            block = tl.load(src_blk_ptrs, mask=N_mask[:, None])
-            if has_coeff:
-                block *= c
-            tl.store(tgt_blk_ptrs, block, mask=N_mask[:, None])
-
-        else:
-            K_mask = (i * BLOCK_K + K_blk) < K
-            mask = N_mask[:, None] & K_mask[None, :]
-            block = tl.load(src_blk_ptrs, mask=mask)
-            if has_coeff:
-                block *= c
-            tl.store(tgt_blk_ptrs, block, mask=mask)
-        src_blk_ptrs += BLOCK_K * stride_sk
-        tgt_blk_ptrs += BLOCK_K * stride_ti
--- a/src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/single.py
+++ b/src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/single.py
@@ -1,98 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# Adapted from https://github.com/shawntan/scattermoe
-# Copyright (c) Shawn Tan and ScatterMoE Contributors
-# Licensed under the Apache License, Version 2.0
-# See https://github.com/shawntan/scattermoe/blob/main/LICENSE
-
-import torch
-import triton
-import triton.language as tl
-
-
-@triton.jit
-def _single2scatter(
-    X_ptr,
-    stride_xm,
-    stride_xk,
-    W_ptr,
-    stride_we,
-    stride_wk,
-    stride_wn,
-    Y_ptr,
-    stride_ym,
-    stride_yn,
-    expert_idxs_ptr,
-    FAN_OUT: tl.constexpr,
-    K: tl.constexpr,
-    N: tl.constexpr,
-    E: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    BLOCK_K: tl.constexpr,
-    ACC_TYPE: tl.constexpr,
-):
-    pid0 = tl.program_id(axis=0)
-    pid1 = tl.program_id(axis=1)
-
-    N_block_id = pid0
-    if FAN_OUT == 1:
-        in_idx = pid1
-    else:
-        in_idx = 0
-    out_idx = pid1
-
-    K_block = tl.arange(0, BLOCK_K)
-    N_block = tl.max_contiguous(
-        tl.multiple_of((N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)) % N, BLOCK_N),
-        BLOCK_N,
-    )
-    E_idx = tl.load(expert_idxs_ptr + pid1)
-    X_blk_ptrs = X_ptr + in_idx * stride_xm + K_block[:, None] * stride_xk
-    W_blk_ptrs = (
-        W_ptr
-        + E_idx * stride_we
-        + K_block[:, None] * stride_wk
-        + N_block[None, :] * stride_wn
-    )
-    N_mask = N_block < N
-    acc = tl.zeros((1, BLOCK_N), dtype=ACC_TYPE)
-    for _K_block_id in range(0, tl.cdiv(K, BLOCK_K)):
-        K_mask = K_block < K
-        x = tl.load(X_blk_ptrs, mask=K_mask[:, None], other=0.0)
-        w = tl.load(W_blk_ptrs, mask=K_mask[:, None] & N_mask[None, :], other=0.0)
-        acc += tl.sum(x * w, axis=0)[None, :]
-        X_blk_ptrs += BLOCK_K * stride_xk
-        W_blk_ptrs += BLOCK_K * stride_wk
-        K_block += BLOCK_K
-    Y_blk_ptrs = Y_ptr + out_idx * stride_ym + N_block[None, :] * stride_yn
-    tl.store(Y_blk_ptrs, acc, mask=N_mask[None, :])
-
-
-def single2scatter(X, W, expert_idxs):
-    E, xdim, ydim = W.size()
-    k = expert_idxs.size(1)
-    assert X.size(0) == k or X.size(0) == 1
-    Y = torch.empty((k, ydim), device=X.device, dtype=X.dtype)
-    BLOCK_N = 128
-    BLOCK_K = 128
-    grid = triton.cdiv(ydim, BLOCK_N), k
-    _single2scatter[grid](
-        X,
-        X.stride(0),
-        X.stride(1),
-        W,
-        W.stride(0),
-        W.stride(1),
-        W.stride(2),
-        Y,
-        Y.stride(0),
-        Y.stride(1),
-        expert_idxs,
-        FAN_OUT=Y.size(0) // X.size(0),
-        K=xdim,
-        N=ydim,
-        E=E,
-        BLOCK_N=BLOCK_N,
-        BLOCK_K=BLOCK_K,
-        ACC_TYPE=tl.float32,
-    )
-    return Y
--- a/src/axolotl/integrations/kernels/libs/scattermoe_lora/layers.py
+++ b/src/axolotl/integrations/kernels/libs/scattermoe_lora/layers.py
@@ -1,439 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-#
-# Original work Copyright (c) Shawn Tan and ScatterMoE Contributors
-# Adapted from https://github.com/shawntan/scattermoe
-# See https://github.com/shawntan/scattermoe/blob/main/LICENSE
-#
-# Modifications and LoRA adaptation Copyright (c) Axolotl AI
-# Licensed under the Apache License, Version 2.0
-
-"""
-ScatterMoE layer replacements for HuggingFace MoE architectures.
-
-Provides drop-in forward replacements that use ScatterMoE kernels for
-acceleration. When used via the HF ``kernels`` library
-(``replace_kernel_forward_from_hub``), these classes replace the forward
-method of the original MoE block.
-
-LoRA support
------------
-When peft wraps parameters via ``target_parameters``, the ``self.experts``
-submodule becomes a chain of ``ParamWrapper`` objects and the ``self.gate``
-router may also become a ``ParamWrapper``.  The ``HFScatterMoEGatedMLP``
-forward detects this and automatically:
-
-1. Unwraps ``self.gate`` to the base router, applying gate LoRA delta
-2. Unwraps ``self.experts`` to the base ``OlmoeExperts`` module
-3. Extracts LoRA A/B weights and scaling from each wrapper
-4. Converts B layout from peft rank-major to scattermoe expert-major
-5. Routes to ``parallel_linear_lora`` for fused LoRA computation
-6. Passes through ``self.shared_expert`` / ``self.shared_expert_gate``
-   (peft wraps their linear layers with standard LoRA, no special handling)
-"""
-
-import torch
-from torch import nn
-from torch.nn import functional as F
-
-from .parallel_experts import flatten_sort_count, parallel_linear
-from .parallel_linear_lora import get_lora_params_from_wrapper, parallel_linear_lora
-
-# =============================================================================
-# LoRA layout conversion utilities (peft <-> scattermoe)
-# =============================================================================
-
-
-def peft_lora_B_to_scattermoe(peft_B, num_experts, rank):
-    """Convert peft rank-major lora_B ``[out, E*r]`` to scattermoe
-    expert-major ``[N, r*E]``.
-
-    peft reshapes B to ``[out, r, E]`` (rank-major).
-    scattermoe slices B as ``[:, e*r:(e+1)*r]`` (expert-major).
-    """
-    N = peft_B.shape[0]
-    return (
-        peft_B.reshape(N, rank, num_experts)
-        .permute(0, 2, 1)
-        .contiguous()
-        .reshape(N, num_experts * rank)
-    )
-
-
-def peft_lora_to_scattermoe(peft_A, peft_B, num_experts, rank):
-    """Convert peft LoRA weights to scattermoe layout (with A<->B swap).
-
-    peft operates on the parameter in its native storage layout ``[E, dim1, dim2]``
-    where ``in_features=dim1, out_features=dim2``.  ScatterMoE transposes the
-    parameter (``W = param.transpose(2, 1)``) giving ``[E, dim2, dim1]`` with
-    ``K=dim2, N=dim1``.  Because of this transposition, peft's A and B roles
-    are swapped relative to scattermoe's convention.
-
-    peft gives:
-        lora_A ``[r*E, dim1]``, lora_B ``[dim2, r*E]``
-
-    scattermoe needs:
-        lora_A ``[r*E, K=dim2]``, lora_B ``[N=dim1, r*E]``
-
-    This function swaps A<->B and converts B from rank-major to expert-major.
-    Uses vectorized tensor operations (no Python loop over experts).
-
-    Works for **both** gate_up_proj and down_proj since the transposition
-    issue is the same for any parameter.
-    """
-    peft_B_em = peft_lora_B_to_scattermoe(peft_B, num_experts, rank)
-
-    dim1 = peft_A.shape[1]  # peft in_features -> scattermoe N
-    dim2 = peft_B_em.shape[0]  # peft out_features -> scattermoe K
-
-    # smoe_A: per expert, transpose B_e [dim2, r] -> [r, dim2]
-    # [dim2, E*r] -> [dim2, E, r] -> [E, r, dim2] -> [E*r, dim2]
-    smoe_A = (
-        peft_B_em.reshape(dim2, num_experts, rank)
-        .permute(1, 2, 0)
-        .contiguous()
-        .reshape(rank * num_experts, dim2)
-    )
-
-    # smoe_B: per expert, transpose A_e [r, dim1] -> [dim1, r]
-    # [E*r, dim1] -> [E, r, dim1] -> [dim1, E, r] -> [dim1, E*r]
-    smoe_B = (
-        peft_A.reshape(num_experts, rank, dim1)
-        .permute(2, 0, 1)
-        .contiguous()
-        .reshape(dim1, num_experts * rank)
-    )
-
-    return smoe_A, smoe_B
-
-
-def peft_down_proj_lora_to_scattermoe(peft_A, peft_B, num_experts, rank):
-    """Deprecated alias for :func:`peft_lora_to_scattermoe`."""
-    return peft_lora_to_scattermoe(peft_A, peft_B, num_experts, rank)
-
-
-# =============================================================================
-# ParamWrapper unwrapping
-# =============================================================================
-
-
-def _unwrap_gate_lora(gate_module):
-    """Unwrap peft ``ParamWrapper`` on the router gate.
-
-    When peft targets ``gate.weight``, ``self.gate`` becomes::
-
-        ParamWrapper(weight)
-          -> base_layer: OlmoeTopKRouter (the real module)
-
-    This function detects the wrapping and returns the base router, its
-    weight tensor, and an optional LoRA delta tensor.
-
-    Returns:
-        (base_gate, gate_weight, gate_lora_delta_or_None)
-
-        ``base_gate`` is the original router module (with ``.top_k``,
-        ``.num_experts``, ``.norm_topk_prob``).
-        ``gate_weight`` is the base router weight (may be a DTensor under FSDP).
-        ``gate_lora_delta_or_None`` is the LoRA delta tensor if LoRA is active,
-        else ``None``.  Kept separate to avoid mixing DTensor + Tensor in an add.
-    """
-    if hasattr(gate_module, "base_layer") and hasattr(gate_module, "lora_A"):
-        base_gate = gate_module.base_layer
-        lora_A, lora_B, scaling = get_lora_params_from_wrapper(gate_module)
-        if lora_A is not None:
-            # gate weight: [num_experts, hidden_size]
-            # lora_A: [r, hidden_size], lora_B: [num_experts, r]
-            # delta = scaling * B @ A = [num_experts, hidden_size]
-            delta = scaling * (lora_B @ lora_A)
-            return base_gate, base_gate.weight, delta
-        else:
-            return base_gate, base_gate.weight, None
-    else:
-        # No wrapping — gate is the original module
-        return gate_module, gate_module.weight, None
-
-
-def _convert_smoe_lora(lora_A, lora_B, num_experts, rank, scaling):
-    """Convert peft LoRA weights to scattermoe layout."""
-    smoe_A, smoe_B = peft_lora_to_scattermoe(lora_A, lora_B, num_experts, rank)
-    return (smoe_A, smoe_B, scaling)
-
-
-def _unwrap_experts_lora(experts_module):
-    """Walk a peft ``ParamWrapper`` chain on ``self.experts``.
-
-    When peft targets ``experts.gate_up_proj`` and ``experts.down_proj`` via
-    ``target_parameters``, ``self.experts`` becomes a nested chain::
-
-        ParamWrapper(down_proj)
-          -> base_layer: ParamWrapper(gate_up_proj)
-              -> base_layer: OlmoeExperts (the real module)
-
-    This function walks the chain, collects LoRA params keyed by
-    ``parameter_name``, and returns the base experts module.
-
-    Returns:
-        (base_experts, gup_lora, down_lora)
-
-        Each ``*_lora`` is either ``(smoe_A, smoe_B, scaling)`` or ``None``.
-        A/B are already in scattermoe layout.
-    """
-    # Collect ParamWrapper layers by their parameter_name
-    wrappers = {}
-    module = experts_module
-    while hasattr(module, "base_layer") and hasattr(module, "lora_A"):
-        param_name = getattr(module, "parameter_name", None)
-        if param_name is not None:
-            wrappers[param_name] = module
-        module = module.base_layer
-
-    base_experts = module
-
-    if not wrappers:
-        return base_experts, None, None
-
-    # Determine num_experts from base module
-    num_experts = getattr(base_experts, "num_experts", None)
-    if num_experts is None:
-        # Fallback: infer from parameter shape
-        gup = getattr(base_experts, "gate_up_proj", None)
-        if gup is not None:
-            num_experts = gup.shape[0]
-
-    # Extract gate_up_proj LoRA (needs A<->B swap due to transposition)
-    gup_lora = None
-    gup_wrapper = wrappers.get("gate_up_proj")
-    if gup_wrapper is not None:
-        lora_A, lora_B, scaling = get_lora_params_from_wrapper(gup_wrapper)
-        if lora_A is not None:
-            rank = lora_A.shape[0] // num_experts
-            gup_lora = _convert_smoe_lora(lora_A, lora_B, num_experts, rank, scaling)
-
-    # Extract down_proj LoRA (needs A<->B swap due to transposition)
-    down_lora = None
-    down_wrapper = wrappers.get("down_proj")
-    if down_wrapper is not None:
-        lora_A, lora_B, scaling = get_lora_params_from_wrapper(down_wrapper)
-        if lora_A is not None:
-            rank = lora_A.shape[0] // num_experts
-            down_lora = _convert_smoe_lora(lora_A, lora_B, num_experts, rank, scaling)
-
-    return base_experts, gup_lora, down_lora
-
-
-# =============================================================================
-# Layer classes
-# =============================================================================
-
-
-class ScatterMoEGatedMLP(nn.Module):
-    def forward(self, layer_input):
-        """
-        Forward pass of the mixture of experts layer.
-
-        Args:
-            layer_input (Tensor):
-                Input tensor.
-
-        Returns:
-            Tensor:
-                Output tensor.
-        """
-        bsz, length, emb_size = layer_input.size()
-        layer_input = layer_input.reshape(-1, emb_size)
-        # compute the top_k routing decision
-        router_logits = self.router.layer(layer_input)
-        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
-        routing_weights, selected_experts = torch.topk(
-            routing_weights, self.router.top_k, dim=-1
-        )
-        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
-        routing_weights = routing_weights.to(layer_input.dtype)
-        sorted_expert_idxs, sorted_scattered_idxs, expert_offsets = flatten_sort_count(
-            selected_experts, num_experts=self.router.num_experts
-        )
-
-        # compute experts
-        gates, h = parallel_linear(
-            layer_input,
-            self.input_linear.weight.transpose(2, 1),
-            self.router.top_k,
-            sorted_expert_idxs,
-            sorted_scattered_idxs,
-            expert_offsets,
-            grouped_in=False,
-            grouped_out=True,
-        ).chunk(2, dim=-1)
-        h = self.activation(gates) * h
-        layer_output = parallel_linear(
-            h,
-            self.output_linear.weight.transpose(2, 1),
-            1,
-            sorted_expert_idxs,
-            sorted_scattered_idxs,
-            expert_offsets,
-            grouped_in=True,
-            grouped_out=False,
-            gates=routing_weights,
-        )
-        layer_output = layer_output.view(bsz, length, emb_size)
-        return layer_output
-
-
-class HFScatterMoEGatedMLP(nn.Module):
-    """
-    ScatterMoE-accelerated forward pass for HF MoEs (OLMoE / Qwen2MoE).
-
-    Used as a kernel layer via the HF ``kernels`` library.  The ``forward``
-    method replaces the original ``OlmoeSparseMoeBlock.forward``.
-
-    Supports both full-parameter training and LoRA fine-tuning:
-
-    * **Full-param**: uses ``parallel_linear`` (base ScatterMoE kernel)
-    * **LoRA**: detects peft ``ParamWrapper`` on ``self.experts``, extracts
-      adapter weights, and uses ``parallel_linear_lora`` (fused kernel)
-    """
-
-    @staticmethod
-    def forward(self: nn.Module, layer_input: torch.Tensor):
-        """
-        Forward pass using ScatterMoE kernels.
-
-        Args:
-            self: The MoeSparseMoeBlock module containing:
-                - self.gate: Router (or peft ParamWrapper wrapping it)
-                - self.experts: Experts module (or peft ParamWrapper chain)
-                - self.shared_expert: Optional shared expert (e.g. Qwen2MoE)
-                - self.shared_expert_gate: Optional shared expert gate
-            layer_input: Input tensor [batch_size, seq_len, hidden_size]
-
-        Returns:
-            Tensor: [batch_size, seq_len, hidden_size]
-        """
-        batch_size, sequence_length, hidden_dim = layer_input.shape
-        hidden_states_flat = layer_input.view(-1, hidden_dim)
-
-        # ====================================================================
-        # Shared Expert (if present, e.g. Qwen2MoE)
-        # ====================================================================
-        # peft wraps individual linear layers inside shared_expert with
-        # standard LoRA — calling forward() handles this transparently.
-        if hasattr(self, "shared_expert") and self.shared_expert is not None:
-            shared_expert_output = self.shared_expert(hidden_states_flat)
-            # shared_expert_gate may also be peft-wrapped (standard LoRA
-            # on nn.Linear), its forward() applies LoRA automatically.
-            shared_expert_gate_output = F.sigmoid(
-                self.shared_expert_gate(hidden_states_flat)
-            )
-            shared_expert_output = shared_expert_output * shared_expert_gate_output
-        else:
-            shared_expert_output = None
-
-        # ====================================================================
-        # Router Computation (with optional gate LoRA)
-        # ====================================================================
-        base_gate, gate_weight, gate_lora_delta = _unwrap_gate_lora(self.gate)
-        router_logits = F.linear(hidden_states_flat, gate_weight)
-        if gate_lora_delta is not None:
-            router_logits = router_logits + F.linear(
-                hidden_states_flat, gate_lora_delta
-            )
-        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
-
-        top_k = base_gate.top_k
-        num_experts = base_gate.num_experts
-        routing_weights, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
-
-        if base_gate.norm_topk_prob:
-            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
-        routing_weights = routing_weights.to(hidden_states_flat.dtype)
-
-        sorted_expert_idxs, sorted_scattered_idxs, expert_offsets = flatten_sort_count(
-            selected_experts, num_experts=num_experts
-        )
-
-        # ====================================================================
-        # Detect LoRA (peft ParamWrapper) and extract adapter weights
-        # ====================================================================
-        experts, gup_lora, down_lora = _unwrap_experts_lora(self.experts)
-
-        # ====================================================================
-        # Gate + Up projection
-        # ====================================================================
-        gate_up_W = experts.gate_up_proj.transpose(2, 1)  # [E, hidden, 2*inter]
-
-        if gup_lora is not None:
-            gup_A, gup_B, gup_scaling = gup_lora
-            gup = parallel_linear_lora(
-                hidden_states_flat,
-                gate_up_W,
-                top_k,
-                sorted_expert_idxs,
-                sorted_scattered_idxs,
-                expert_offsets,
-                lora_A=gup_A,
-                lora_B=gup_B,
-                scaling=gup_scaling,
-                grouped_in=False,
-                grouped_out=True,
-                use_fused_dX=True,
-                use_fused_gather=True,
-            )
-        else:
-            gup = parallel_linear(
-                hidden_states_flat,
-                gate_up_W,
-                top_k,
-                sorted_expert_idxs,
-                sorted_scattered_idxs,
-                expert_offsets,
-                grouped_in=False,
-                grouped_out=True,
-            )
-
-        gates, h = gup.chunk(2, dim=-1)
-        h = experts.act_fn(gates) * h
-
-        # ====================================================================
-        # Down projection
-        # ====================================================================
-        down_W = experts.down_proj.transpose(2, 1)  # [E, inter, hidden]
-
-        if down_lora is not None:
-            down_A, down_B, down_scaling = down_lora
-            expert_output = parallel_linear_lora(
-                h,
-                down_W,
-                1,
-                sorted_expert_idxs,
-                sorted_scattered_idxs,
-                expert_offsets,
-                lora_A=down_A,
-                lora_B=down_B,
-                scaling=down_scaling,
-                gates=routing_weights,
-                grouped_in=True,
-                grouped_out=False,
-                use_fused_dX=True,
-                use_fused_gather=True,
-            )
-        else:
-            expert_output = parallel_linear(
-                h,
-                down_W,
-                1,
-                sorted_expert_idxs,
-                sorted_scattered_idxs,
-                expert_offsets,
-                grouped_in=True,
-                grouped_out=False,
-                gates=routing_weights,
-            )
-
-        # ====================================================================
-        # Combine with shared expert and reshape
-        # ====================================================================
-        if shared_expert_output is not None:
-            expert_output = expert_output + shared_expert_output
-
-        expert_output = expert_output.view(batch_size, sequence_length, hidden_dim)
-        return expert_output
--- a/src/axolotl/integrations/kernels/libs/scattermoe_lora/lora_ops.py
+++ b/src/axolotl/integrations/kernels/libs/scattermoe_lora/lora_ops.py
@@ -1,99 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# Copyright (c) Axolotl AI
-# Licensed under the Apache License, Version 2.0
-
-"""
-ParallelExperts module with LoRA support.
-
-Provides a drop-in replacement for ScatterMoE's ParallelExperts that
-uses the fused LoRA kernel when adapter weights are attached.
-"""
-
-from typing import Optional
-
-import torch
-import torch.nn as nn
-
-from .parallel_linear_lora import parallel_linear_lora
-
-
-class ParallelExperts(nn.Module):
-    """
-    Parallel Experts with fused LoRA support.
-
-    Drop-in replacement for the original ParallelExperts. When LoRA parameters
-    are attached via set_lora(), the forward pass uses a fused kernel:
-        Y = X @ W + scaling * (X @ A^T) @ B^T
-    """
-
-    def __init__(
-        self,
-        num_experts: int,
-        input_size: int,
-        output_size: int,
-        bias: bool = False,
-    ) -> None:
-        super().__init__()
-        self.weight = nn.Parameter(torch.empty(num_experts, output_size, input_size))
-        if bias:
-            self.bias = nn.Parameter(torch.empty(num_experts, output_size))
-        else:
-            self.bias = None
-        self.num_experts = num_experts
-        self.input_size = input_size
-        self.output_size = output_size
-        self._lora_A: torch.Tensor | None = None
-        self._lora_B: torch.Tensor | None = None
-        self._lora_scaling: float | None = None
-        self.reset_parameters()
-
-    def reset_parameters(self) -> None:
-        nn.init.normal_(self.weight, std=0.02)
-        if self.bias is not None:
-            nn.init.zeros_(self.bias)
-
-    def extra_repr(self) -> str:
-        return (
-            f"num_experts={self.num_experts}, "
-            f"input_size={self.input_size}, "
-            f"output_size={self.output_size}"
-        )
-
-    def set_lora(self, lora_A: torch.Tensor, lora_B: torch.Tensor, scaling: float):
-        """Attach LoRA parameters for fused computation."""
-        self._lora_A = lora_A
-        self._lora_B = lora_B
-        self._lora_scaling = scaling
-
-    def clear_lora(self):
-        """Remove LoRA parameters."""
-        self._lora_A = None
-        self._lora_B = None
-        self._lora_scaling = None
-
-    def forward(
-        self,
-        inputs: torch.Tensor,
-        k: int,
-        sorted_expert_idxs: torch.Tensor,
-        sorted_scattered_idxs: torch.Tensor,
-        expert_offsets: torch.Tensor,
-        gates: Optional[torch.Tensor] = None,
-        grouped_in: bool = False,
-        grouped_out: bool = False,
-    ) -> torch.Tensor:
-        return parallel_linear_lora(
-            inputs,
-            self.weight.permute(0, 2, 1),  # [E, input, output]
-            k,
-            sorted_expert_idxs,
-            sorted_scattered_idxs,
-            expert_offsets,
-            lora_A=self._lora_A,
-            lora_B=self._lora_B,
-            scaling=self._lora_scaling if self._lora_scaling is not None else 1.0,
-            expert_biases=self.bias,
-            gates=gates,
-            grouped_in=grouped_in,
-            grouped_out=grouped_out,
-        )
--- a/src/axolotl/integrations/kernels/libs/scattermoe_lora/parallel_experts.py
+++ b/src/axolotl/integrations/kernels/libs/scattermoe_lora/parallel_experts.py
@@ -1,253 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# Adapted from https://github.com/shawntan/scattermoe
-# Copyright (c) Shawn Tan and ScatterMoE Contributors
-# Licensed under the Apache License, Version 2.0
-# See https://github.com/shawntan/scattermoe/blob/main/LICENSE
-
-from typing import Optional
-
-import torch
-import torch.nn as nn
-
-from . import kernels
-
-
-@torch.library.custom_op("scattermoe::bincount", mutates_args={})
-def compileable_bincount(x: torch.Tensor, minlength: int) -> torch.Tensor:
-    return x.bincount(minlength=minlength)
-
-
-@compileable_bincount.register_fake
-def _(x: torch.Tensor, minlength: int) -> torch.Tensor:
-    return torch.empty(minlength, dtype=torch.long, device=x.device)
-
-
-@torch.compile
-def flatten_sort_count(expert_idxs: torch.Tensor, num_experts: int):
-    with torch.no_grad():
-        flattened_expert_idxs = expert_idxs.flatten()
-        sorted_expert_idxs, sorted_scattered_idxs = torch.sort(flattened_expert_idxs)
-        expert_counts = compileable_bincount(
-            flattened_expert_idxs, minlength=num_experts
-        )
-        expert_offsets = expert_counts.cumsum(-1)
-        return sorted_expert_idxs, sorted_scattered_idxs, expert_offsets
-
-
-class ParallelLinear(torch.autograd.Function):
-    @staticmethod
-    def forward(
-        ctx,
-        x: torch.Tensor,
-        expert_weights: torch.Tensor,
-        k: int,
-        sorted_expert_idxs: torch.Tensor,
-        sorted_scattered_idxs: torch.Tensor,
-        expert_offsets: torch.Tensor,
-        expert_biases: Optional[torch.Tensor] = None,
-        gates: Optional[torch.Tensor] = None,
-        grouped_in: bool = False,
-        grouped_out: bool = False,
-    ):
-        with torch.device(x.device):
-            output = kernels.ops.scatter2scatter(
-                X=x,
-                W=expert_weights,
-                b=expert_biases,
-                k=k,
-                sorted_expert_idxs=sorted_expert_idxs,
-                sorted_scattered_idxs=sorted_scattered_idxs,
-                x_grouped=grouped_in,
-                y_grouped=grouped_out,
-            )
-            if gates is not None:
-                output_expanded = output.view(
-                    gates.size(0), gates.size(1), output.size(-1)
-                )
-                output = (gates.unsqueeze(1) @ output_expanded).squeeze(1)
-            else:
-                output_expanded = None
-
-            ctx.save_for_backward(
-                x,
-                expert_weights,
-                expert_biases,
-                sorted_expert_idxs,
-                sorted_scattered_idxs,
-                expert_offsets,
-                gates,
-                output_expanded,
-            )
-            ctx.grouped_in = grouped_in
-            ctx.grouped_out = grouped_out
-            ctx.k = k
-        return output
-
-    @staticmethod
-    def backward(ctx, grad_out: torch.Tensor):
-        with torch.device(grad_out.device):
-            (
-                x,
-                expert_weights,
-                expert_biases,
-                sorted_expert_idxs,
-                sorted_scattered_idxs,
-                expert_offsets,
-                gates,
-                output_expanded,
-            ) = ctx.saved_tensors
-            k = ctx.k
-            grouped_in = ctx.grouped_in
-            grouped_out = ctx.grouped_out
-
-            if gates is not None:
-                # calculate gates gradient
-                # d_gates = torch.bmm(output_expanded, grad_out[:, :, None]).squeeze(-1)
-                d_gates = (output_expanded @ grad_out.unsqueeze(-1)).squeeze(-1)
-                gates_flat = gates.flatten()
-                gate_fan = gates.size(1)
-                grouped_grad_out = output_expanded.flatten(
-                    0, 1
-                )  # reuse expanded buffer later
-            else:
-                d_gates = None
-                gates_flat = None
-                gate_fan = 1
-                grouped_grad_out = None
-
-            if grouped_out:
-                grouped_grad_out = grad_out
-            else:
-                grouped_grad_out = kernels.ops.group(
-                    grad_out,
-                    sorted_scattered_idxs,
-                    fan_out=gate_fan,
-                    coeff=gates_flat,
-                    out=grouped_grad_out,
-                )
-            if grouped_in:
-                grouped_x = x
-                d_expanded_input = None
-            else:
-                grouped_x = kernels.ops.group(x, sorted_scattered_idxs, fan_out=k)
-                d_expanded_input = grouped_x
-
-            d_weights, d_biases = kernels.ops.group_bwd_W(
-                DY=grouped_grad_out,
-                X=grouped_x,
-                expert_offsets=expert_offsets,
-                E=expert_weights.size(0),
-                has_bias=expert_biases is not None,
-            )
-
-            d_expanded_input = kernels.ops.scatter2scatter(
-                X=grouped_grad_out,
-                x_grouped=True,
-                W=expert_weights.permute(0, 2, 1),
-                sorted_expert_idxs=sorted_expert_idxs,
-                sorted_scattered_idxs=sorted_scattered_idxs,
-                k=1,
-                y_grouped=grouped_in,
-                out=d_expanded_input,  # Reuse grouped_x buffer
-            )
-
-            if k == 1:
-                d_input = d_expanded_input
-            else:
-                d_input = d_expanded_input.view(
-                    x.size(0), k, d_expanded_input.size(-1)
-                ).sum(-2)
-        return (
-            # x, expert_weights,
-            d_input,
-            d_weights,
-            # k, sorted_expert_idxs, sorted_scattered_idxs, expert_offsets,
-            None,
-            None,
-            None,
-            None,
-            # bias, gates
-            d_biases,
-            d_gates,
-            # grouped_in, grouped_out,
-            None,
-            None,
-        )
-
-
-def parallel_linear(
-    inputs,
-    expert_weights,
-    k,
-    sorted_expert_idxs,
-    sorted_scattered_idxs,
-    expert_offsets,
-    expert_biases=None,
-    gates=None,
-    grouped_in=False,
-    grouped_out=False,
-):
-    results = ParallelLinear.apply(
-        inputs,
-        expert_weights,
-        k,
-        sorted_expert_idxs,
-        sorted_scattered_idxs,
-        expert_offsets,
-        expert_biases,
-        gates,
-        grouped_in,
-        grouped_out,
-    )
-    return results
-
-
-class ParallelExperts(nn.Module):
-    def __init__(self, num_experts, input_size, output_size, bias=False) -> None:
-        super().__init__()
-        self.weight = nn.Parameter(torch.empty(num_experts, output_size, input_size))
-
-        if bias:
-            self.bias = nn.Parameter(torch.empty(num_experts, output_size))
-        else:
-            self.bias = None
-
-        self.num_experts = num_experts
-        self.input_size = input_size
-        self.output_size = output_size
-        self.reset_parameters()
-
-    def extra_repr(self):
-        return "num_experts={}, input_size={}, output_size={}".format(
-            self.num_experts, self.input_size, self.output_size
-        )
-
-    def reset_parameters(self) -> None:
-        nn.init.normal_(self.weight, std=0.02)
-        if self.bias is not None:
-            nn.init.zeros_(self.bias)
-
-    def forward(
-        self,
-        inputs,
-        k,
-        sorted_expert_idxs,
-        sorted_scattered_idxs,
-        expert_offsets,
-        gates=None,
-        grouped_in=False,
-        grouped_out=False,
-    ):
-        results = parallel_linear(
-            inputs,
-            self.weight.permute(0, 2, 1),
-            k,
-            sorted_expert_idxs,
-            sorted_scattered_idxs,
-            expert_offsets,
-            expert_biases=self.bias,
-            gates=gates,
-            grouped_in=grouped_in,
-            grouped_out=grouped_out,
-        )
-        return results
--- a/src/axolotl/integrations/kernels/libs/scattermoe_lora/parallel_linear_lora.py
+++ b/src/axolotl/integrations/kernels/libs/scattermoe_lora/parallel_linear_lora.py
@@ -1,480 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# Copyright (c) Axolotl AI
-# Licensed under the Apache License, Version 2.0
-
-"""
-ScatterMoE + LoRA Autograd Function
-====================================
-
-Provides the autograd function and Python interface for fused ScatterMoE + LoRA.
-
-Key design for LoRA training:
-  - Expert weights W are FROZEN (no gradient computed for W).
-  - Only LoRA adapter weights (A, B) receive gradients.
-  - The input gradient dX is still computed (needed for upstream layers).
-  - This avoids the expensive group_bwd_W computation entirely.
-
-Forward:
-  Y = X @ W + scaling * (X @ A^T) @ B^T
-
-Backward (W frozen):
-  dX = dY @ W^T + scaling * (dY @ B) @ A          (via scatter2scatter for base, separate for LoRA)
-  dA = scaling * (dY @ B)^T @ X                     (per-expert, on grouped data)
-  dB = scaling * dY^T @ (X @ A^T)                   (per-expert, on grouped data)
-"""
-
-from typing import Optional
-
-import torch
-
-from .kernels import ops as base_ops
-from .kernels.lora_ops import (
-    group_bwd_lora,
-    group_bwd_lora_fused,
-    scatter2scatter_lora,
-    scatter2scatter_lora_dX,
-)
-
-
-class ScatterMoELoRA(torch.autograd.Function):
-    """
-    Autograd function for fused ScatterMoE + LoRA with frozen expert weights.
-
-    This function is optimized for the LoRA fine-tuning scenario where:
-    - Expert weights W are frozen (requires_grad=False)
-    - Only LoRA A and B matrices receive gradients
-    - Input gradients are computed for upstream layer backprop
-    """
-
-    @staticmethod
-    def forward(
-        ctx,
-        x: torch.Tensor,
-        expert_weights: torch.Tensor,
-        k: int,
-        sorted_expert_idxs: torch.Tensor,
-        sorted_scattered_idxs: torch.Tensor,
-        expert_offsets: torch.Tensor,
-        lora_A: torch.Tensor,
-        lora_B: torch.Tensor,
-        scaling: float,
-        expert_biases: Optional[torch.Tensor] = None,
-        gates: Optional[torch.Tensor] = None,
-        grouped_in: bool = False,
-        grouped_out: bool = False,
-        use_fused_dX: bool = False,
-        use_fused_gather: bool = False,
-    ):
-        with torch.device(x.device):
-            # Fused forward: Y = X @ W + scaling * (X @ A^T) @ B^T
-            output = scatter2scatter_lora(
-                X=x,
-                W=expert_weights,
-                sorted_expert_idxs=sorted_expert_idxs,
-                sorted_scattered_idxs=sorted_scattered_idxs,
-                k=k,
-                lora_A=lora_A,
-                lora_B=lora_B,
-                scaling=scaling,
-                b=expert_biases,
-                x_grouped=grouped_in,
-                y_grouped=grouped_out,
-            )
-
-            # Handle gating (weighted combination of top-k expert outputs)
-            if gates is not None:
-                output_expanded = output.view(
-                    gates.size(0), gates.size(1), output.size(-1)
-                )
-                output = (gates.unsqueeze(1) @ output_expanded).squeeze(1)
-            else:
-                output_expanded = None
-
-            ctx.save_for_backward(
-                x,
-                lora_A,
-                lora_B,
-                sorted_expert_idxs,
-                sorted_scattered_idxs,
-                expert_offsets,
-                gates,
-                output_expanded,
-            )
-            # Store frozen weights as plain Python attributes instead of
-            # save_for_backward.  This avoids:
-            # 1. Version-check conflicts with FSDP unshard/reshard
-            # 2. Pinning all-gathered parameters via saved_tensors hooks
-            # 3. Interfering with activation offloading pack/unpack hooks
-            # Safe because expert_weights are frozen (requires_grad=False).
-            ctx.expert_weights = expert_weights
-            ctx.expert_biases = expert_biases
-            ctx.grouped_in = grouped_in
-            ctx.grouped_out = grouped_out
-            ctx.k = k
-            ctx.scaling = scaling
-            ctx.use_fused_dX = use_fused_dX
-            ctx.use_fused_gather = use_fused_gather
-
-        return output
-
-    @staticmethod
-    def backward(ctx, grad_out: torch.Tensor):
-        with torch.device(grad_out.device):
-            (
-                x,
-                lora_A,
-                lora_B,
-                sorted_expert_idxs,
-                sorted_scattered_idxs,
-                expert_offsets,
-                gates,
-                output_expanded,
-            ) = ctx.saved_tensors
-            expert_weights = ctx.expert_weights
-
-            k = ctx.k
-            scaling = ctx.scaling
-            grouped_in = ctx.grouped_in
-            grouped_out = ctx.grouped_out
-            E = expert_weights.size(0)
-
-            # ------------------------------------------------------------------
-            # Gate gradients (if using top-k gating with routing weights)
-            # ------------------------------------------------------------------
-            if gates is not None:
-                # d_gates[t, j] = output_expanded[t, j, :] . grad_out[t, :]
-                d_gates = (output_expanded @ grad_out.unsqueeze(-1)).squeeze(-1)
-                gates_flat = gates.flatten()
-                gate_fan = gates.size(1)
-                # Reuse output_expanded buffer for grouped_grad_out
-                grouped_grad_out = output_expanded.flatten(0, 1)
-            else:
-                d_gates = None
-                gates_flat = None
-                gate_fan = 1
-                grouped_grad_out = None
-
-            # ------------------------------------------------------------------
-            # LoRA gradients (dA, dB) and setup for dX
-            # ------------------------------------------------------------------
-            # Fused gather uses sorted_scattered_idxs for indirect X access
-            # in the Triton kernel, avoiding the group(x) allocation.
-            #
-            # can_fuse_gather: X is ungrouped and not too large for scatter loads
-            #   - When gates is None and grouped_out=False: both DY and X ungrouped
-            #   - When grouped_out=True (gate_up_proj): DY already grouped, X ungrouped
-            #     -> use dy_grouped=True in the fused kernel
-            M_total = sorted_scattered_idxs.size(0)
-            K_dim = x.size(-1)
-            N_dim = expert_weights.size(-1)
-            fuse_gather_workload = M_total * max(K_dim, N_dim)
-            _FUSE_GATHER_THRESHOLD = 2**24  # ~16M elements
-
-            can_fuse_gather = (
-                ctx.use_fused_gather
-                and not grouped_in  # X must be ungrouped for scatter access
-                and gates is None  # gate coeff requires multiplicative gather
-                and fuse_gather_workload < _FUSE_GATHER_THRESHOLD
-            )
-
-            if can_fuse_gather:
-                # ------------------------------------------------------------------
-                # Fused path: skip group(x) entirely
-                # ------------------------------------------------------------------
-                d_expanded_input = None
-
-                d_lora_A, d_lora_B = group_bwd_lora_fused(
-                    DY=grad_out,
-                    X=x,
-                    lora_A=lora_A,
-                    lora_B=lora_B,
-                    expert_offsets=expert_offsets,
-                    sorted_scattered_idxs=sorted_scattered_idxs,
-                    E=E,
-                    k=k,
-                    scaling=scaling,
-                    dy_grouped=grouped_out,
-                )
-
-                # Prepare grouped_grad_out for the dX path (needed by both
-                # the fused dX kernel when grouped_out=True, and the non-fused path)
-                if grouped_out:
-                    grouped_grad_out = grad_out
-                elif not ctx.use_fused_dX:
-                    grouped_grad_out = base_ops.group(
-                        grad_out,
-                        sorted_scattered_idxs,
-                        fan_out=gate_fan,
-                        coeff=gates_flat,
-                        out=grouped_grad_out,
-                    )
-            else:
-                # ------------------------------------------------------------------
-                # Original path: explicit group() calls
-                # ------------------------------------------------------------------
-                if grouped_out:
-                    grouped_grad_out = grad_out
-                else:
-                    grouped_grad_out = base_ops.group(
-                        grad_out,
-                        sorted_scattered_idxs,
-                        fan_out=gate_fan,
-                        coeff=gates_flat,
-                        out=grouped_grad_out,
-                    )
-
-                if grouped_in:
-                    grouped_x = x
-                    d_expanded_input = None
-                else:
-                    grouped_x = base_ops.group(x, sorted_scattered_idxs, fan_out=k)
-                    d_expanded_input = grouped_x  # Will be overwritten; reuse buffer
-
-                d_lora_A, d_lora_B = group_bwd_lora(
-                    DY=grouped_grad_out,
-                    X=grouped_x,
-                    lora_A=lora_A,
-                    lora_B=lora_B,
-                    expert_offsets=expert_offsets,
-                    E=E,
-                    scaling=scaling,
-                )
-
-            # ------------------------------------------------------------------
-            # Input gradient: dX = dY @ W^T + scaling * (dY @ B) @ A
-            # ------------------------------------------------------------------
-            if ctx.use_fused_dX:
-                if can_fuse_gather and not grouped_out:
-                    # Fully fused: read ungrouped DY via scatter pattern
-                    d_expanded_input = scatter2scatter_lora_dX(
-                        DY=grad_out,
-                        W=expert_weights,
-                        sorted_expert_idxs=sorted_expert_idxs,
-                        sorted_scattered_idxs=sorted_scattered_idxs,
-                        k=1,
-                        lora_A=lora_A,
-                        lora_B=lora_B,
-                        scaling=scaling,
-                        dy_grouped=False,
-                        dx_grouped=grouped_in,
-                        out=d_expanded_input,
-                    )
-                else:
-                    # Fused dX only: read from pre-grouped DY
-                    d_expanded_input = scatter2scatter_lora_dX(
-                        DY=grouped_grad_out,
-                        W=expert_weights,
-                        sorted_expert_idxs=sorted_expert_idxs,
-                        sorted_scattered_idxs=sorted_scattered_idxs,
-                        k=1,
-                        lora_A=lora_A,
-                        lora_B=lora_B,
-                        scaling=scaling,
-                        dy_grouped=True,
-                        dx_grouped=grouped_in,
-                        out=d_expanded_input,
-                    )
-            else:
-                # Original path: separate base scatter2scatter + LoRA Python loop
-                d_expanded_input = base_ops.scatter2scatter(
-                    X=grouped_grad_out,
-                    x_grouped=True,
-                    W=expert_weights.permute(0, 2, 1),  # [E, N, K]
-                    sorted_expert_idxs=sorted_expert_idxs,
-                    sorted_scattered_idxs=sorted_scattered_idxs,
-                    k=1,
-                    y_grouped=grouped_in,
-                    out=d_expanded_input,
-                )
-
-                # LoRA part: dX_lora = scaling * (dY @ B) @ A
-                if scaling != 0.0:
-                    d_input_lora_grouped = _compute_lora_input_grad(
-                        grouped_grad_out,
-                        lora_A,
-                        lora_B,
-                        expert_offsets,
-                        E,
-                        scaling,
-                    )
-                    if grouped_in:
-                        d_expanded_input.add_(d_input_lora_grouped)
-                    else:
-                        # Scatter-add LoRA gradient directly into d_expanded_input.
-                        # Avoids allocating a zeros_like + add result
-                        d_expanded_input[sorted_scattered_idxs] += d_input_lora_grouped
-
-            # Reduce over top-k if k > 1
-            if k == 1:
-                d_input = d_expanded_input
-            else:
-                d_input = d_expanded_input.view(
-                    x.size(0), k, d_expanded_input.size(-1)
-                ).sum(-2)
-
-            # W is frozen during LoRA training -- skip weight gradient
-            d_weights = (
-                torch.zeros_like(expert_weights)
-                if expert_weights.requires_grad
-                else None
-            )
-            d_biases = None
-
-        return (
-            d_input,
-            d_weights,
-            None,
-            None,
-            None,
-            None,  # k, sorted indices, offsets
-            d_lora_A,
-            d_lora_B,
-            None,  # lora_A, lora_B, scaling
-            d_biases,
-            d_gates,
-            None,
-            None,  # grouped_in, grouped_out
-            None,  # use_fused_dX
-            None,  # use_fused_gather
-        )
-
-
-def _compute_lora_input_grad(
-    grouped_grad_out: torch.Tensor,
-    lora_A: torch.Tensor,
-    lora_B: torch.Tensor,
-    expert_offsets: torch.Tensor,
-    E: int,
-    scaling: float,
-) -> torch.Tensor:
-    """
-    Compute the LoRA contribution to the input gradient:
-      dX_lora = scaling * (dY @ B) @ A
-
-    Uses PyTorch ops on expert-grouped data.
-    Each expert e: dX_e = scaling * (dY_e @ B_e) @ A_e
-    """
-    R = lora_A.size(0) // E
-    K = lora_A.size(1)
-    M_total = grouped_grad_out.size(0)
-
-    d_input_lora = torch.zeros(
-        (M_total, K), device=grouped_grad_out.device, dtype=grouped_grad_out.dtype
-    )
-
-    compute_dtype = grouped_grad_out.dtype
-
-    prev_offset = 0
-    for e in range(E):
-        curr_offset = expert_offsets[e].item()
-        if curr_offset > prev_offset:
-            dy_e = grouped_grad_out[prev_offset:curr_offset]  # [M_e, N]
-            a_e = lora_A[e * R : (e + 1) * R, :].to(compute_dtype)  # [r, K]
-            b_e = lora_B[:, e * R : (e + 1) * R].to(compute_dtype)  # [N, r]
-
-            # dX_e = scaling * (dY_e @ B_e) @ A_e
-            dy_b = dy_e @ b_e  # [M_e, r]
-            dx_e = scaling * (dy_b @ a_e)  # [M_e, K]
-            d_input_lora[prev_offset:curr_offset] = dx_e
-
-        prev_offset = curr_offset
-
-    return d_input_lora
-
-
-# =============================================================================
-# Helper: Extract LoRA params from PEFT ParamWrapper
-# =============================================================================
-
-
-def get_lora_params_from_wrapper(module) -> tuple:
-    """
-    Extract LoRA parameters from a PEFT ParamWrapper.
-
-    Returns:
-        (lora_A, lora_B, scaling) if LoRA is active, else (None, None, None)
-    """
-    if not hasattr(module, "lora_A") or not hasattr(module, "lora_B"):
-        return None, None, None
-
-    active_adapters = getattr(module, "active_adapters", ["default"])
-    if not active_adapters:
-        return None, None, None
-
-    adapter_name = active_adapters[0]
-
-    lora_A_dict = getattr(module, "lora_A", {})
-    lora_B_dict = getattr(module, "lora_B", {})
-    scaling_dict = getattr(module, "scaling", {})
-
-    if adapter_name not in lora_A_dict:
-        return None, None, None
-
-    lora_A = lora_A_dict[adapter_name].weight
-    lora_B = lora_B_dict[adapter_name].weight
-    scaling = scaling_dict[adapter_name]
-
-    return lora_A, lora_B, scaling
-
-
-# =============================================================================
-# Drop-in replacement for parallel_linear
-# =============================================================================
-
-
-def parallel_linear_lora(
-    inputs: torch.Tensor,
-    expert_weights: torch.Tensor,
-    k: int,
-    sorted_expert_idxs: torch.Tensor,
-    sorted_scattered_idxs: torch.Tensor,
-    expert_offsets: torch.Tensor,
-    lora_A: Optional[torch.Tensor] = None,
-    lora_B: Optional[torch.Tensor] = None,
-    scaling: float = 1.0,
-    expert_biases: Optional[torch.Tensor] = None,
-    gates: Optional[torch.Tensor] = None,
-    grouped_in: bool = False,
-    grouped_out: bool = False,
-    use_fused_dX: bool = False,
-    use_fused_gather: bool = False,
-):
-    """
-    Drop-in replacement for parallel_linear that supports LoRA.
-
-    If lora_A and lora_B are provided, uses fused LoRA kernel.
-    Otherwise falls back to standard scatter2scatter.
-    """
-    if lora_A is not None and lora_B is not None:
-        return ScatterMoELoRA.apply(
-            inputs,
-            expert_weights,
-            k,
-            sorted_expert_idxs,
-            sorted_scattered_idxs,
-            expert_offsets,
-            lora_A,
-            lora_B,
-            scaling,
-            expert_biases,
-            gates,
-            grouped_in,
-            grouped_out,
-            use_fused_dX,
-            use_fused_gather,
-        )
-    else:
-        from .parallel_experts import ParallelLinear
-
-        return ParallelLinear.apply(
-            inputs,
-            expert_weights,
-            k,
-            sorted_expert_idxs,
-            sorted_scattered_idxs,
-            expert_offsets,
-            expert_biases,
-            gates,
-            grouped_in,
-            grouped_out,
-        )
--- a/src/axolotl/integrations/kernels/plugin.py
+++ b/src/axolotl/integrations/kernels/plugin.py
@@ -1,59 +1,12 @@
-import importlib
-import os
-from pathlib import Path
-
-import torch
+from kernels import (
+    LayerRepository,
+    Mode,
+    register_kernel_mapping,
+    replace_kernel_forward_from_hub,
+)

 from axolotl.integrations.base import BasePlugin
-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
-
-
-def _check_sonicmoe_gpu_compat():
-    """Validate GPU compute capability for SonicMoE and configure env.
-
-    Supported: Hopper (sm_90), Blackwell (sm_100 - sm_103).
-    B300 (sm_103) additionally requires Triton 3.6.0.
-    """
-    if not torch.cuda.is_available():
-        return
-
-    cc = torch.cuda.get_device_capability()
-
-    if cc < (9, 0):
-        raise RuntimeError(
-            f"SonicMoE requires Hopper (sm_90) or Blackwell (sm_100+) GPU, "
-            f"but detected sm_{cc[0]}{cc[1]}."
-        )
-
-    if cc > (10, 3):
-        raise RuntimeError(
-            f"SonicMoE does not yet support sm_{cc[0]}{cc[1]}. "
-            f"Supported: Hopper (sm_90) and Blackwell (sm_100 - sm_103)."
-        )
-
-    # Blackwell (sm_100+): enable QuACK GEMM kernels
-    if cc >= (10, 0):
-        os.environ.setdefault("USE_QUACK_GEMM", "1")
-        LOG.info(
-            f"Blackwell GPU (sm_{cc[0]}{cc[1]}) detected, enabling USE_QUACK_GEMM=1"
-        )
-
-    # B300 (sm_103): requires Triton 3.6.0
-    if cc == (10, 3):
-        triton_spec = importlib.util.find_spec("triton")
-        if triton_spec is None:
-            raise RuntimeError(
-                "B300 (sm_103) requires Triton 3.6.0, but Triton is not installed."
-            )
-        import triton
-
-        triton_version = tuple(int(x) for x in triton.__version__.split(".")[:2])
-        if triton_version != (3, 6):
-            raise RuntimeError(
-                f"B300 (sm_103) requires Triton 3.6.x, but found {triton.__version__}."
-            )
+from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix


 class KernelsPlugin(BasePlugin):
@@ -64,45 +17,18 @@ class KernelsPlugin(BasePlugin):
        if cfg.use_scattermoe:
            self._register_kernels()
            self._kernelize_model(cfg.model_config_type)
-        elif cfg.use_sonicmoe:
-            if not importlib.util.find_spec("sonicmoe"):
-                raise RuntimeError(
-                    "SonicMoE is not installed. See installation instructions at "
-                    "https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/integrations/kernels/README.md#sonicmoe-installation"
-                )
-
-            _check_sonicmoe_gpu_compat()
-
-            from axolotl.integrations.kernels.sonicmoe import patch_sonicmoe
-
-            LOG.info(
-                f"Applying SonicMoE patches for model type: {cfg.model_config_type}"
-            )
-            patch_sonicmoe(
-                cfg.model_config_type,
-                torch_compile=bool(getattr(cfg, "torch_compile", False)),
-            )

    def _register_kernels(self):
-        from kernels import (
-            LocalLayerRepository,
-            Mode,
-            register_kernel_mapping,
-        )
-
-        plugin_root = Path(__file__).parent
        register_kernel_mapping(
            {
                "HFScatterMoEParallelExperts": {
                    "cuda": {
-                        Mode.TRAINING: LocalLayerRepository(
-                            repo_path=plugin_root / "libs" / "scattermoe_lora",
-                            package_name="scattermoe_lora",
+                        Mode.TRAINING: LayerRepository(
+                            repo_id="axolotl-ai-co/scattermoe",
                            layer_name="HFScatterMoEGatedMLP",
                        ),
-                        Mode.INFERENCE: LocalLayerRepository(
-                            repo_path=plugin_root / "libs" / "scattermoe_lora",
-                            package_name="scattermoe_lora",
+                        Mode.INFERENCE: LayerRepository(
+                            repo_id="axolotl-ai-co/scattermoe",
                            layer_name="HFScatterMoEGatedMLP",
                        ),
                    },
@@ -111,11 +37,25 @@ class KernelsPlugin(BasePlugin):
        )

    def _kernelize_model(self, model_type: str):
-        from kernels import replace_kernel_forward_from_hub
+        if model_type == "olmoe":
+            from transformers.models.olmoe.modeling_olmoe import OlmoeSparseMoeBlock

-        from axolotl.integrations.kernels.constants import resolve_moe_block_classes
-
-        for model_moe_cls in resolve_moe_block_classes(model_type):
            replace_kernel_forward_from_hub(
-                model_moe_cls, "HFScatterMoEParallelExperts"
+                OlmoeSparseMoeBlock, "HFScatterMoEParallelExperts"
            )
+        else:
+            try:
+                model_moe_cls = get_model_moe_block(model_type)
+                replace_kernel_forward_from_hub(
+                    model_moe_cls, "HFScatterMoEParallelExperts"
+                )
+            except Exception as err:
+                raise ValueError(f"Unsupported model type: {model_type}") from err
+
+
+def get_model_moe_block(model_type: str):
+    module_path = f"transformers.models.{model_type}.modeling_{model_type}"
+    model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type)
+    module = __import__(module_path, fromlist=[f"{model_cls_prefix}SparseMoeBlock"])
+    model_cls = getattr(module, f"{model_cls_prefix}SparseMoeBlock")
+    return model_cls
--- a/src/axolotl/integrations/kernels/sonicmoe/__init__.py
+++ b/src/axolotl/integrations/kernels/sonicmoe/__init__.py
@@ -1,3 +0,0 @@
-from .patch import patch_sonicmoe
-
-__all__ = ["patch_sonicmoe"]
--- a/src/axolotl/integrations/kernels/sonicmoe/patch.py
+++ b/src/axolotl/integrations/kernels/sonicmoe/patch.py
@@ -1,213 +0,0 @@
-"""
-SonicMoE patching for SparseMoeBlock forward pass.
-
-Monkeypatches the SparseMoeBlock class for a given model type to use
-SonicMoE's optimized kernels. Two forward paths are supported:
-
-1. **General routing path** (routing_fn is not None):
-   Uses a custom routing function + ``moe_general_routing_inputs``.
-   Suitable for models with non-standard routing (softmax->topk, sigmoid->topk).
-
-2. **Fused topk->softmax path** (routing_fn is None):
-   Uses ``moe_TC_softmax_topk_layer`` which fuses routing + expert computation.
-   Suitable for models with simple topk->softmax routing.
-
-Weight format conversion (interleave/deinterleave) is handled by the
-WeightConverter system, so the forward assumes weights are already in
-interleaved format.
-
-Shared experts are handled generically: if the block has a ``shared_expert``
-or ``shared_experts`` attribute, its output is computed alongside the routed
-experts and added to the final output. An optional ``shared_expert_gate``
-applies sigmoid gating to the shared expert contribution.
-"""
-
-import torch
-import torch.nn.functional as F
-
-from axolotl.integrations.kernels.constants import resolve_moe_block_classes
-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
-
-
-def patch_sonicmoe(model_type: str, torch_compile: bool = False):
-    """Main entry point: patch SparseMoeBlock for SonicMoE support.
-
-    Args:
-        model_type: The HuggingFace model type (e.g. "qwen3_moe").
-        torch_compile: If True, wrap routing functions with torch.compile
-            for kernel fusion (fuses softmax+topk+renorm into fewer launches).
-    """
-    from .routing import get_model_moe_config
-    from .weight_converter import register_sonicmoe_weight_converter
-
-    routing_fn, activation, router_attr = get_model_moe_config(model_type)
-
-    if torch_compile and routing_fn is not None:
-        routing_fn = _try_compile_routing(routing_fn)
-
-    for moe_cls in resolve_moe_block_classes(model_type):
-        _patch_forward(moe_cls, routing_fn, activation, router_attr)
-    register_sonicmoe_weight_converter(model_type)
-
-
-def _try_compile_routing(routing_fn):
-    """Attempt to torch.compile the routing function, fall back to eager on failure."""
-    try:
-        compiled_fn = torch.compile(routing_fn, mode="reduce-overhead", dynamic=False)
-        LOG.info(f"torch.compile enabled for routing function: {routing_fn.__name__}")
-        return compiled_fn
-    except Exception as exc:  # pylint: disable=broad-except
-        LOG.warning(
-            f"torch.compile failed for routing function {routing_fn.__name__}, "
-            f"falling back to eager: {exc}"
-        )
-        return routing_fn
-
-
-def _patch_forward(moe_cls, routing_fn, activation, router_attr):
-    """Monkeypatch the SparseMoeBlock class with a SonicMoE forward.
-
-    The patched forward handles shared experts generically: if
-    ``self.shared_expert`` or ``self.shared_experts`` exists, it is computed
-    and added to the routed output. If ``self.shared_expert_gate`` also exists,
-    it applies sigmoid gating to the shared expert contribution (as in qwen2_moe).
-
-    Args:
-        moe_cls: The SparseMoeBlock class to patch.
-        routing_fn: Routing function (e.g. softmax_topk_routing), or None
-            for the fused moe_TC_softmax_topk_layer path.
-        activation: SonicMoE ActivationType enum value.
-        router_attr: Name of the router module attribute on the MoE block.
-    """
-    if hasattr(moe_cls, "_original_forward"):
-        LOG.info(f"{moe_cls.__name__}.forward already patched with SonicMoE, skipping")
-        return
-
-    original_forward = moe_cls.forward
-
-    if routing_fn is not None:
-        _make_general_forward(moe_cls, routing_fn, activation)
-    else:
-        _make_fused_forward(moe_cls, activation, router_attr)
-
-    moe_cls._original_forward = original_forward
-    LOG.info(f"Patched {moe_cls.__name__}.forward with SonicMoE implementation")
-
-
-def _make_general_forward(moe_cls, routing_fn, activation):
-    """Create forward using routing_fn + moe_general_routing_inputs."""
-
-    def sonicmoe_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        from sonicmoe import moe_general_routing_inputs
-
-        batch_size, sequence_length, hidden_dim = hidden_states.shape
-        hidden_states_flat = hidden_states.view(-1, hidden_dim)
-
-        # Shared expert (computed early, matching original model ordering)
-        shared_expert_output = _compute_shared_expert(self, hidden_states_flat)
-
-        # Routing
-        router_scores, token_indices, expert_indices, _router_logits = routing_fn(
-            hidden_states_flat, self
-        )
-
-        # Permute weights to SonicMoE layout:
-        #   gate_up: [E, 2*I, H] -> [2*I, H, E]
-        #   down:    [E, H, I]   -> [H, I, E]
-        gate_up_weight = self.experts.gate_up_proj.permute(1, 2, 0)
-        down_weight = self.experts.down_proj.permute(1, 2, 0)
-        E = gate_up_weight.shape[-1]
-
-        output, _ = moe_general_routing_inputs(
-            hidden_states_flat,
-            router_scores,
-            token_indices,
-            expert_indices,
-            gate_up_weight,
-            None,  # b1 (no gate/up bias)
-            down_weight,
-            None,  # b2 (no down bias)
-            E,
-            torch.cuda.current_stream().cuda_stream,
-            activation,
-            False,  # is_inference_mode
-        )
-
-        # Add shared expert contribution if present
-        if shared_expert_output is not None:
-            if hasattr(self, "shared_expert_gate"):
-                shared_expert_output = (
-                    F.sigmoid(self.shared_expert_gate(hidden_states_flat))
-                    * shared_expert_output
-                )
-            output = output + shared_expert_output
-
-        return output.view(batch_size, sequence_length, hidden_dim)
-
-    moe_cls.forward = sonicmoe_forward
-
-
-def _make_fused_forward(moe_cls, activation, router_attr):
-    """Create forward using moe_TC_softmax_topk_layer (topk -> softmax)."""
-
-    def sonicmoe_fused_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        from sonicmoe import moe_TC_softmax_topk_layer
-
-        batch_size, sequence_length, hidden_dim = hidden_states.shape
-        hidden_states_flat = hidden_states.view(-1, hidden_dim)
-
-        # Shared expert (computed early, matching original model ordering)
-        shared_expert_output = _compute_shared_expert(self, hidden_states_flat)
-
-        router = getattr(self, router_attr)
-
-        # Permute weights to SonicMoE layout:
-        #   gate_up: [E, 2*I, H] -> [2*I, H, E]
-        #   down:    [E, H, I]   -> [H, I, E]
-        gate_up_weight = self.experts.gate_up_proj.permute(1, 2, 0)
-        down_weight = self.experts.down_proj.permute(1, 2, 0)
-
-        output, _router_logits, _expert_freq = moe_TC_softmax_topk_layer(
-            hidden_states_flat,
-            router.weight,
-            gate_up_weight,
-            None,  # b1 (no gate/up bias)
-            down_weight,
-            None,  # b2 (no down bias)
-            router.top_k,
-            torch.cuda.current_stream().cuda_stream,
-            activation,
-            False,  # is_inference_mode
-        )
-
-        # Add shared expert contribution if present
-        if shared_expert_output is not None:
-            if hasattr(self, "shared_expert_gate"):
-                shared_expert_output = (
-                    F.sigmoid(self.shared_expert_gate(hidden_states_flat))
-                    * shared_expert_output
-                )
-            output = output + shared_expert_output
-
-        return output.view(batch_size, sequence_length, hidden_dim)
-
-    moe_cls.forward = sonicmoe_fused_forward
-
-
-def _compute_shared_expert(moe_block, hidden_states_flat):
-    """Compute shared expert output if the block has one.
-
-    Handles singular (qwen2_moe: ``shared_expert``), plural
-    (glm_moe_dsa/deepseek_v3: ``shared_experts``), and MLP
-    (hunyuan_v1_moe: ``shared_mlp``) attribute names.
-    """
-    shared_expert = (
-        getattr(moe_block, "shared_expert", None)
-        or getattr(moe_block, "shared_experts", None)
-        or getattr(moe_block, "shared_mlp", None)
-    )
-    if shared_expert is not None:
-        return shared_expert(hidden_states_flat)
-    return None
--- a/src/axolotl/integrations/kernels/sonicmoe/routing.py
+++ b/src/axolotl/integrations/kernels/sonicmoe/routing.py
@@ -1,219 +0,0 @@
-"""
-Routing functions for SonicMoE integration.
-
-Different MoE architectures use different routing strategies:
- qwen3_moe / qwen2_moe / qwen3_5_moe / qwen3_vl_moe / qwen3_omni_moe: softmax -> topk (with optional renormalization)
- gpt_oss: topk -> softmax (uses fused moe_TC_softmax_topk_layer, routing_fn=None)
- glm_moe_dsa: sigmoid -> topk (with group-based expert selection)
-
-Each model type maps to a (routing_fn, activation_type, router_attr) triple.
-When routing_fn is None, the fused moe_TC_softmax_topk_layer path is used.
-"""
-
-import torch
-import torch.nn.functional as F
-
-
-def get_model_moe_config(model_type: str):
-    """Returns (routing_fn, activation, router_attr) for a given model type.
-
-    Args:
-        model_type: HuggingFace model type string.
-
-    Returns:
-        routing_fn: Callable or None. None signals the fused
-            moe_TC_softmax_topk_layer path (topk -> softmax models).
-        activation: SonicMoE ActivationType enum value.
-        router_attr: Name of the router module attribute on the MoE block
-            (e.g. "gate" or "router").
-
-    The activation type cannot be derived from config.hidden_act because
-    e.g. qwen3_moe reports "silu" but architecturally uses SwiGLU
-    (act_fn(gate) * up pattern). So we specify it per model type.
-    """
-    from sonicmoe.enums import ActivationType
-
-    if model_type in (
-        "qwen2_moe",
-        "qwen3_moe",
-        "qwen3_5_moe",
-        "qwen3_next",
-        "qwen3_vl_moe",
-        "qwen3_omni_moe",
-        "olmoe",
-        "mixtral",
-        "minimax",
-    ):
-        return softmax_topk_routing, ActivationType.SWIGLU, "gate"
-    elif model_type in (
-        "glm_moe_dsa",
-        "deepseek_v3",
-        "glm4_moe",
-        "glm4_moe_lite",
-        "glm4v_moe",
-        "minimax_m2",
-    ):
-        return sigmoid_topk_routing, ActivationType.SWIGLU, "gate"
-    # elif model_type in ("ernie4_5_moe",):
-    #     # Softmax→topk with e_score_correction_bias applied between softmax and topk.
-    #     return ..., ActivationType.SWIGLU, "gate"
-    # elif model_type in ("deepseek_v2",):
-    #     # Softmax→topk with group_limited_greedy. Different attr names: num_group
-    #     # (not n_group), gate is nn.Linear (not a router class).
-    #     return ..., ActivationType.SWIGLU, "gate"
-    # elif model_type in ("hunyuan_v1_moe",):
-    #     # Softmax→topk but gate structure differs: gate.wg (not gate.weight),
-    #     # top_k on block not gate, creates scatter routing matrix.
-    #     return ..., ActivationType.SWIGLU, "gate"
-    # Fused topk -> softmax path (routing_fn=None):
-    # elif model_type in ("gpt_oss",):
-    #     # NOTE: gpt_oss has a router bias which moe_TC_softmax_topk_layer
-    #     # ignores (it only takes router_w, not bias). Also has transposed
-    #     # weight layout [E, H, 2*I] and custom GLU activation.
-    #     return None, ActivationType.SWIGLU, "router"
-    else:
-        raise ValueError(f"SonicMoE: unsupported model type '{model_type}'")
-
-
-def softmax_topk_routing(
-    hidden_states: torch.Tensor, moe_block
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Qwen3/Qwen2-style routing: softmax -> topk -> optional renorm.
-
-    Args:
-        hidden_states: [T, H] flattened token representations
-        moe_block: MoE block module (accesses moe_block.gate.*)
-
-    Returns:
-        router_scores: [T*K] flattened scores (float32)
-        token_indices: [T*K] which token each entry belongs to (int32), sorted ascending
-        expert_indices: [T*K] which expert (int32)
-        router_logits: [T, E] original logits for aux loss
-    """
-    gate = moe_block.gate
-    T, H = hidden_states.shape
-    K = gate.top_k
-
-    # Compute router logits and softmax over all experts
-    router_logits = F.linear(hidden_states, gate.weight)  # [T, E]
-    router_probs = F.softmax(router_logits, dim=-1, dtype=torch.float32)  # [T, E]
-
-    # Select top-k experts per token
-    top_values, top_indices = torch.topk(router_probs, K, dim=-1)  # [T, K] each
-
-    # Renormalize if configured (default True for models without the attribute,
-    # e.g. Mixtral/MiniMax which always normalize)
-    if getattr(gate, "norm_topk_prob", True):
-        top_values = top_values / top_values.sum(dim=-1, keepdim=True)
-
-    # no-op: matches transformers which casts to softmax output dtype (float32).
-    # top_values = top_values.to(router_probs.dtype)
-
-    # Flatten for moe_general_routing_inputs.
-    # Token indices are naturally sorted ascending from the [T, K] layout:
-    # [0, 0, ..., 1, 1, ..., T-1, T-1, ...] — this is required by SonicMoE.
-    # Expert sorting is handled internally by general_routing_router_metadata.
-    token_indices = (
-        torch.arange(T, device=hidden_states.device, dtype=torch.int32)
-        .unsqueeze(1)
-        .expand(T, K)
-    )
-
-    flat_scores = top_values.reshape(-1)  # [T*K]
-    flat_token_idx = token_indices.reshape(-1)  # [T*K]
-    flat_expert_idx = top_indices.to(torch.int32).reshape(-1)  # [T*K]
-
-    return flat_scores, flat_token_idx, flat_expert_idx, router_logits
-
-
-def sigmoid_topk_routing(
-    hidden_states: torch.Tensor, moe_block
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Sigmoid-based routing: sigmoid -> optional group selection -> topk.
-
-    Supports two variants:
-    - **Group selection** (glm_moe_dsa, deepseek_v3, etc.): n_group > 1,
-      bias on gate, group-based masking before topk.
-    - **No group selection** (minimax_m2): n_group == 1 (or absent),
-      bias on moe_block, straight topk from all experts.
-
-    Final routing weights come from the original sigmoid scores (not
-    bias-corrected), with optional renormalization and scaling.
-
-    Args:
-        hidden_states: [T, H] flattened token representations
-        moe_block: MoE block module (accesses moe_block.gate.* and
-            optional moe_block.n_group, .topk_group, .top_k, .norm_topk_prob,
-            .routed_scaling_factor, .n_routed_experts)
-
-    Returns:
-        router_scores: [T*K] flattened scores (float32)
-        token_indices: [T*K] which token each entry belongs to (int32), sorted ascending
-        expert_indices: [T*K] which expert (int32)
-        router_logits: [T, E] original logits for aux loss
-    """
-    gate = moe_block.gate
-    T, H = hidden_states.shape
-    K = moe_block.top_k
-    E = getattr(moe_block, "n_routed_experts", gate.weight.shape[0])
-    n_group = getattr(moe_block, "n_group", 1)
-
-    # Compute router logits and sigmoid probabilities
-    router_logits = F.linear(hidden_states.float(), gate.weight.float())  # [T, E]
-    router_probs = router_logits.sigmoid()  # [T, E]
-
-    # Bias-corrected scores for expert selection (not used for final weights).
-    # glm_moe_dsa/deepseek_v3 store the bias on gate; minimax_m2 stores it on the block.
-    e_score_correction_bias = getattr(gate, "e_score_correction_bias", None)
-    if e_score_correction_bias is None:
-        e_score_correction_bias = getattr(moe_block, "e_score_correction_bias", None)
-    if e_score_correction_bias is None:
-        raise AttributeError(
-            f"sigmoid_topk_routing requires e_score_correction_bias on "
-            f"gate ({type(gate)}) or moe_block ({type(moe_block)}), but neither has it"
-        )
-    scores_for_choice = router_probs + e_score_correction_bias
-
-    # Group-based selection: pick top groups, mask the rest (skip when n_group == 1)
-    if n_group > 1:
-        group_scores = (
-            scores_for_choice.view(-1, n_group, E // n_group)
-            .topk(2, dim=-1)[0]
-            .sum(dim=-1)
-        )  # [T, n_group]
-        group_idx = torch.topk(
-            group_scores, k=moe_block.topk_group, dim=-1, sorted=False
-        )[1]
-        group_mask = torch.zeros_like(group_scores)
-        group_mask.scatter_(1, group_idx, 1)
-        score_mask = (
-            group_mask.unsqueeze(-1).expand(-1, n_group, E // n_group).reshape(-1, E)
-        )
-        scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)
-
-    # Final topk from (possibly masked) scores
-    topk_indices = torch.topk(scores_for_choice, k=K, dim=-1, sorted=False)[1]
-
-    # Gather weights from original sigmoid scores (not bias-corrected)
-    topk_weights = router_probs.gather(1, topk_indices)
-
-    # Optional renormalization + scaling
-    norm_topk_prob = getattr(moe_block, "norm_topk_prob", True)
-    if norm_topk_prob:
-        topk_weights = topk_weights / (topk_weights.sum(dim=-1, keepdim=True) + 1e-20)
-    routed_scaling_factor = getattr(moe_block, "routed_scaling_factor", 1.0)
-    topk_weights = topk_weights * routed_scaling_factor
-
-    # Flatten for moe_general_routing_inputs.
-    # Token indices are naturally sorted ascending from the [T, K] layout.
-    token_indices = (
-        torch.arange(T, device=hidden_states.device, dtype=torch.int32)
-        .unsqueeze(1)
-        .expand(T, K)
-    )
-
-    flat_scores = topk_weights.to(torch.float32).reshape(-1)  # [T*K]
-    flat_token_idx = token_indices.reshape(-1)  # [T*K]
-    flat_expert_idx = topk_indices.to(torch.int32).reshape(-1)  # [T*K]
-
-    return flat_scores, flat_token_idx, flat_expert_idx, router_logits
--- a/src/axolotl/integrations/kernels/sonicmoe/weight_converter.py
+++ b/src/axolotl/integrations/kernels/sonicmoe/weight_converter.py
@@ -1,181 +0,0 @@
-"""
-Custom WeightConverter operations for SonicMoE weight format conversion.
-
-SonicMoE requires gate_up_proj weights in interleaved format:
- Standard (concatenated): [E, 2*I, H] where first I rows are gate, last I rows are up
- SonicMoE (interleaved): [E, 2*I, H] where rows alternate [g0, u0, g1, u1, ...]
-
-These ConversionOps integrate with transformers' WeightConverter system so that
-weights are transparently converted during loading and reverted during saving.
-"""
-
-from typing import Any
-
-import torch
-from einops import rearrange
-from transformers.core_model_loading import ConversionOps
-
-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
-
-
-def interleave_gate_up(tensor: torch.Tensor) -> torch.Tensor:
-    """[gate..., up...] -> [g0, u0, g1, u1, ...] along the 2*I dimension."""
-    return rearrange(tensor, "... (two out) h -> ... (out two) h", two=2)
-
-
-def deinterleave_gate_up(tensor: torch.Tensor) -> torch.Tensor:
-    """[g0, u0, g1, u1, ...] -> [gate..., up...] along the 2*I dimension."""
-    return rearrange(tensor, "... (out two) h -> ... (two out) h", two=2)
-
-
-class ConcatenatedToInterleaved(ConversionOps):
-    """Convert concatenated gate/up projections to interleaved format.
-
-    Input:  [E, 2*I, H] with gate=[E, :I, H] and up=[E, I:, H]
-    Output: [E, 2*I, H] with rows alternating [g0, u0, g1, u1, ...]
-
-    This operation is applied along ``dim`` (default 1, the 2*I dimension).
-    """
-
-    def __init__(self, dim: int = 1):
-        self.dim = dim
-
-    @torch.no_grad()
-    def convert(
-        self,
-        input_dict: dict[str, Any],
-        source_patterns: list[str],
-        target_patterns: list[str],
-        **kwargs,
-    ) -> dict[str, torch.Tensor]:
-        target_pattern = self._get_target_pattern(
-            input_dict, source_patterns, target_patterns
-        )
-        tensors = next(iter(input_dict.values()))
-        tensor = tensors[0] if isinstance(tensors, list) else tensors
-
-        interleaved = interleave_gate_up(tensor)
-
-        return {target_pattern: interleaved}
-
-    def _get_target_pattern(
-        self,
-        input_dict: dict[str, Any],
-        source_patterns: list[str],
-        target_patterns: list[str],
-    ) -> str:
-        # Follow the same logic as Transpose.get_target_pattern
-        if len(input_dict) != 1:
-            raise ValueError("Undefined Operation encountered!")
-        if len(target_patterns) > 1:
-            if len(source_patterns) == 1:
-                return source_patterns[0]
-            raise ValueError("Undefined Operation encountered!")
-        return target_patterns[0]
-
-    @property
-    def reverse_op(self) -> ConversionOps:
-        return InterleavedToConcatenated(self.dim)
-
-
-class InterleavedToConcatenated(ConversionOps):
-    """Convert interleaved gate/up projections back to concatenated format.
-
-    Input:  [E, 2*I, H] with rows alternating [g0, u0, g1, u1, ...]
-    Output: [E, 2*I, H] with gate=[E, :I, H] and up=[E, I:, H]
-
-    This is the reverse of ``ConcatenatedToInterleaved``.
-    """
-
-    def __init__(self, dim: int = 1):
-        self.dim = dim
-
-    @torch.no_grad()
-    def convert(
-        self,
-        input_dict: dict[str, Any],
-        source_patterns: list[str],
-        target_patterns: list[str],
-        **kwargs,
-    ) -> dict[str, torch.Tensor]:
-        target_pattern = self._get_target_pattern(
-            input_dict, source_patterns, target_patterns
-        )
-        tensors = next(iter(input_dict.values()))
-        tensor = tensors[0] if isinstance(tensors, list) else tensors
-
-        concatenated = deinterleave_gate_up(tensor)
-
-        return {target_pattern: concatenated}
-
-    def _get_target_pattern(
-        self,
-        input_dict: dict[str, Any],
-        source_patterns: list[str],
-        target_patterns: list[str],
-    ) -> str:
-        if len(input_dict) != 1:
-            raise ValueError("Undefined Operation encountered!")
-        if len(target_patterns) > 1:
-            if len(source_patterns) == 1:
-                return source_patterns[0]
-            raise ValueError("Undefined Operation encountered!")
-        return target_patterns[0]
-
-    @property
-    def reverse_op(self) -> ConversionOps:
-        return ConcatenatedToInterleaved(self.dim)
-
-
-def register_sonicmoe_weight_converter(model_type: str):
-    """Override the conversion mapping to add interleave step for gate_up_proj.
-
-    Appends a ConcatenatedToInterleaved operation to the existing gate_up_proj
-    converter chain. For example, qwen3_moe's chain becomes:
-        MergeModulelist(dim=0) -> Concatenate(dim=1) -> ConcatenatedToInterleaved(dim=1)
-
-    The reverse is auto-generated for saving:
-        InterleavedToConcatenated(dim=1) -> Chunk(dim=1) -> SplitModulelist(dim=0)
-    """
-    from transformers.conversion_mapping import (
-        get_checkpoint_conversion_mapping,
-        register_checkpoint_conversion_mapping,
-    )
-
-    existing = get_checkpoint_conversion_mapping(model_type)
-    if existing is None:
-        LOG.warning(
-            f"No conversion mapping found for model type '{model_type}'. "
-            "SonicMoE weight interleaving will not be applied during checkpoint loading."
-        )
-        return
-
-    # Find the gate_up_proj converter and append ConcatenatedToInterleaved
-    patched = False
-    for converter in existing:
-        if hasattr(converter, "operations") and any(
-            "gate_up_proj" in pat for pat in converter.target_patterns
-        ):
-            # Guard against double registration (e.g. plugin reloaded)
-            if any(
-                isinstance(op, ConcatenatedToInterleaved) for op in converter.operations
-            ):
-                LOG.info(
-                    f"SonicMoE weight converter already registered for '{model_type}'"
-                )
-                return
-            converter.operations.append(ConcatenatedToInterleaved(dim=1))
-            patched = True
-            break
-
-    if not patched:
-        LOG.warning(
-            f"Could not find gate_up_proj converter for model type '{model_type}'. "
-            "SonicMoE weight interleaving will not be applied during checkpoint loading."
-        )
-        return
-
-    register_checkpoint_conversion_mapping(model_type, existing, overwrite=True)
-    LOG.info(f"Registered SonicMoE weight converter for model type '{model_type}'")
--- a/src/axolotl/integrations/liger/plugin.py
+++ b/src/axolotl/integrations/liger/plugin.py
@@ -8,6 +8,9 @@ import sys
 from axolotl.integrations.base import BasePlugin
 from axolotl.utils.logging import get_logger

+from .models.base import patch_lce_forward
+from .utils import patch_with_compile_disable
+
 LOG = get_logger(__name__)


@@ -20,18 +23,10 @@ class LigerPlugin(BasePlugin):
        return "axolotl.integrations.liger.LigerArgs"

    def pre_model_load(self, cfg):
-        # shim: liger-kernel 0.7.0 imports ORPOTrainer from old trl path
-        import trl.trainer
-        from trl.experimental.orpo import ORPOTrainer
-
-        trl.trainer.ORPOTrainer = ORPOTrainer
-
        if cfg.torch_compile:
            # torch compile will unnecessarily attempt to optimize the triton kernel unless explicitly disabled
            import liger_kernel.ops.fused_linear_cross_entropy

-            from .utils import patch_with_compile_disable
-
            patch_with_compile_disable(
                liger_kernel.ops.fused_linear_cross_entropy,
                "fused_linear_cross_entropy_forward",
@@ -40,7 +35,6 @@ class LigerPlugin(BasePlugin):
                liger_kernel.ops.fused_linear_cross_entropy,
                "fused_linear_cross_entropy_backward",
            )
-
        from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
        from liger_kernel.transformers.functional import liger_cross_entropy
        from liger_kernel.transformers.layer_norm import LigerLayerNorm
@@ -198,8 +192,6 @@ class LigerPlugin(BasePlugin):
            )
        elif cfg.liger_fused_linear_cross_entropy:
            try:
-                from .models.base import patch_lce_forward
-
                patch_lce_forward(cfg.model_config_type)
                LOG.warning_once(
                    f"Applied ONLY liger_fused_linear_cross_entropy genericpatches for model type: {cfg.model_config_type}"
--- a/src/axolotl/loaders/adapter.py
+++ b/src/axolotl/loaders/adapter.py
@@ -34,7 +34,7 @@ def setup_quantized_meta_for_peft(model: torch.nn.Module):
        return self

    for param in model.parameters():
-        if isinstance(param, Params4bit) and param.quant_state is not None:
+        if isinstance(param, Params4bit):
            param.quant_state._orig_to = param.quant_state.to
            param.quant_state.to = types.MethodType(temp_to_method, param.quant_state)

--- a/src/axolotl/loaders/model.py
+++ b/src/axolotl/loaders/model.py
@@ -172,10 +172,7 @@ class ModelLoader:
        # Build the model
        PLUGIN_MANAGER.pre_model_load(self.cfg)
        self.patch_manager.apply_post_plugin_pre_model_load_patches()
-
        skip_move_to_device = self._build_model()
-        self.patch_manager.apply_post_model_build_patches(self.model)
-
        PLUGIN_MANAGER.post_model_build(self.cfg, self.model)

        # Post-build model configuration
@@ -226,7 +223,7 @@ class ModelLoader:
            isinstance(self.model, (peft.PeftModel, peft.PeftModelForCausalLM))
            and not self.is_qlora_and_fsdp_enabled
        ):
-            self.model = self.model.merge_and_unload(safe_merge=True)
+            self.model = self.model.merge_and_unload()

        self._configure_experts_implementation()
        self._apply_activation_checkpointing()
@@ -674,8 +671,8 @@ class ModelLoader:
                del self.model_kwargs["device_map"]

            transformers.modeling_utils.is_deepspeed_zero3_enabled = lambda: True
-            transformers.integrations.deepspeed.is_deepspeed_zero3_enabled = lambda: (
-                True
+            transformers.integrations.deepspeed.is_deepspeed_zero3_enabled = (
+                lambda: True
            )

        return hf_ds_cfg
@@ -863,10 +860,6 @@ class ModelLoader:
            # Make sure everything is in the same dtype
            skip_prepare_model_for_kbit_training = True

-        if getattr(self.model, "_moe_experts_quantized", False):
-            # Parametrized expert tensors dequantize on access — would OOM.
-            skip_prepare_model_for_kbit_training = True
-
        if (
            not skip_prepare_model_for_kbit_training
            and self.cfg.adapter in ["lora", "qlora"]
--- a/src/axolotl/loaders/patch_manager.py
+++ b/src/axolotl/loaders/patch_manager.py
@@ -118,7 +118,6 @@ class PatchManager:
    def apply_post_plugin_pre_model_load_patches(self):
        """Apply post plugin-pre_model_load load patches based on config."""
        self._apply_tiled_mlp(self.cfg.model_config_type)
-        self._apply_moe_expert_quantization_patch()

    def _apply_transformers_patches(self):
        from axolotl.monkeypatch.transformers.trainer_loss_calc import (
@@ -136,10 +135,6 @@ class PatchManager:

            patch_prepare_context_parallel_inputs()

-    def apply_post_model_build_patches(self, model: PreTrainedModel):
-        """Apply patches right after model build, before post-load setup."""
-        self._finalize_moe_expert_quantization(model)
-
    def apply_post_model_load_patches(self, model: PreTrainedModel):
        """Apply patches that require the model instance."""
        self._apply_llama_flash_attn_patches(model)
@@ -166,13 +161,6 @@ class PatchManager:

    def _apply_fsdp_patches(self):
        """Apply patches for FSDP configurations."""
-        if self.cfg.fsdp_config:
-            from axolotl.monkeypatch.accelerate.fsdp2 import (
-                patch_initialize_missing_keys_for_fsdp,
-            )
-
-            patch_initialize_missing_keys_for_fsdp()
-
        if self.cfg.context_parallel_size > 1 or (
            self.cfg.fsdp_config and str(self.cfg.fsdp_version) == "2"
        ):
@@ -182,14 +170,9 @@ class PatchManager:

            patch_parallelism_config()
        if self.cfg.fsdp_config and str(self.cfg.fsdp_version) == "2":
-            from axolotl.monkeypatch.accelerate.fsdp2 import (
-                patch_accelerate_fsdp2,
-                patch_tied_keys_for_meta_device,
-            )
+            from axolotl.monkeypatch.accelerate.fsdp2 import patch_accelerate_fsdp2

            patch_accelerate_fsdp2()
-            if self.cfg.fsdp_config.cpu_ram_efficient_loading:
-                patch_tied_keys_for_meta_device()
            if self.cfg.rl:
                from axolotl.monkeypatch.trainer.trl import patch_trl_prepare_fsdp2

@@ -246,31 +229,6 @@ class PatchManager:

            patch_qwen3_next_modeling_packing()

-        if self.cfg.model_config_type == "qwen3_5" and self.cfg.sample_packing:
-            from axolotl.monkeypatch.models.qwen3_5.modeling import (
-                patch_qwen3_5_modeling_packing,
-            )
-
-            patch_qwen3_5_modeling_packing()
-
-        if self.cfg.model_config_type == "qwen3_5_moe" and self.cfg.sample_packing:
-            from axolotl.monkeypatch.models.qwen3_5.modeling import (
-                patch_qwen3_5_moe_modeling_packing,
-            )
-
-            patch_qwen3_5_moe_modeling_packing()
-
-        if (
-            self.cfg.model_config_type in ["qwen3_5", "qwen3_5_moe"]
-            and self.cfg.is_multimodal
-            and self.cfg.flash_attention
-        ):
-            from axolotl.monkeypatch.models.qwen3_5.modeling import (
-                patch_qwen3_5_vlm_flash_attention,
-            )
-
-            patch_qwen3_5_vlm_flash_attention()
-
        if self.cfg.model_config_type == "kimi_linear":
            from axolotl.monkeypatch.models.kimi_linear.patch_kimi_linear import (
                patch_kimi_model,
@@ -371,7 +329,7 @@ class PatchManager:
            else:
                has_remote_code = False

-            if has_remote_code and self.cfg.trust_remote_code is not None:
+            if has_remote_code and self.cfg.trust_remote_code is False:
                # If explicitly set in YAML, prefer that
                has_remote_code = self.cfg.trust_remote_code

@@ -394,54 +352,15 @@ class PatchManager:
        if (
            self.cfg.fsdp_config
            and str(self.cfg.fsdp_version) == "2"
-            and (self.cfg.load_in_4bit or self.cfg.load_in_8bit)
+            and self.cfg.adapter == "qlora"
        ):
            from axolotl.monkeypatch.fsdp2_qlora import (
-                apply_init_dtype_attrs_patch,
                apply_init_sharded_param_patch,
                apply_init_unsharded_param_patch,
-                apply_linear8bitlt_save_patch,
            )

            apply_init_sharded_param_patch()
            apply_init_unsharded_param_patch()
-            apply_init_dtype_attrs_patch()
-            if self.cfg.load_in_8bit:
-                apply_linear8bitlt_save_patch()
-
-    def _apply_moe_expert_quantization_patch(self):
-        """Patch transformers weight loading to quantize MoE expert params on-the-fly."""
-        if not self.cfg.quantize_moe_experts:
-            return
-
-        from axolotl.monkeypatch.moe_quant import (
-            patch_moe_quantization_on_load,
-            patch_peft_target_parameters_matching,
-        )
-
-        patch_moe_quantization_on_load(self.cfg)
-        patch_peft_target_parameters_matching()
-
-    def _finalize_moe_expert_quantization(self, model: PreTrainedModel):
-        """Log quantization results and set model flag for downstream use."""
-        import torch
-
-        model._moe_experts_quantized = False
-        if self.cfg.quantize_moe_experts:
-            from axolotl.monkeypatch.moe_quant import get_moe_quantized_count
-
-            count = get_moe_quantized_count()
-            if count > 0:
-                import gc
-
-                model._moe_experts_quantized = True
-                LOG.info(
-                    "Quantized %d MoE expert parameter(s) to %s during model loading",
-                    count,
-                    "4-bit" if self.cfg.load_in_4bit else "8-bit",
-                )
-                gc.collect()
-                torch.cuda.empty_cache()

    def _apply_tiled_mlp(self, model_type: str):
        if self.cfg.tiled_mlp:
--- a/src/axolotl/loaders/processor.py
+++ b/src/axolotl/loaders/processor.py
@@ -19,11 +19,6 @@ def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase):
    if cfg.processor_type:
        processor_cls = getattr(transformers, cfg.processor_type)

-    # Build common kwargs for processor loading
-    processor_kwargs = {}
-    if cfg.revision_of_model:
-        processor_kwargs["revision"] = cfg.revision_of_model
-
    if cfg.tokenizer_use_mistral_common:

        def _patch_mistralcommontokenizer():
@@ -45,7 +40,6 @@ def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase):
        if processor_cls == VoxtralProcessor:
            return VoxtralProcessor.from_pretrained(
                cfg.processor_config,
-                **processor_kwargs,
            )

        from axolotl.utils.mistral import Mistral3Processor
@@ -54,12 +48,10 @@ def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase):
            tokenizer=tokenizer,
        )

-    processor_kwargs["trust_remote_code"] = cfg.trust_remote_code or False
-    processor_kwargs["tokenizer"] = tokenizer
-
    processor = processor_cls.from_pretrained(
        cfg.processor_config,
-        **processor_kwargs,
+        trust_remote_code=cfg.trust_remote_code or False,
+        tokenizer=tokenizer,
    )

    # Attempt to load image size from processor if available
--- a/src/axolotl/loaders/tokenizer.py
+++ b/src/axolotl/loaders/tokenizer.py
@@ -28,10 +28,7 @@ PLUGIN_MANAGER = PluginManager.get_instance()


 def modify_tokenizer_files(
-    tokenizer_path: str,
-    token_mappings: dict[int, str],
-    output_dir: str,
-    revision: str = "main",
+    tokenizer_path: str, token_mappings: dict[int, str], output_dir: str
 ) -> str:
    """
    Modify tokenizer files to replace added_tokens strings, save to output directory,
@@ -44,7 +41,6 @@ def modify_tokenizer_files(
        tokenizer_path: Path or name of the original tokenizer
        token_mappings: Dict mapping {token_id (int): new_token_string}
        output_dir: Directory to save the modified tokenizer
-        revision: Model revision/branch/tag/commit to load from (HF Hub)

    Returns:
        Path to the modified tokenizer directory
@@ -57,9 +53,7 @@ def modify_tokenizer_files(

    if is_local_main_process():
        # Load the tokenizer
-        temp_tokenizer = AutoTokenizer.from_pretrained(
-            tokenizer_path, use_fast=True, revision=revision
-        )
+        temp_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)

        # Save the tokenizer to the output directory
        temp_tokenizer.save_pretrained(tokenizer_dir)
@@ -140,10 +134,7 @@ def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
        from axolotl.utils.mistral import HFMistralTokenizer

        # Load the HF-compatible wrapper around MistralTokenizer
-        kwargs = {}
-        if cfg.revision_of_model:
-            kwargs["revision"] = cfg.revision_of_model
-        tokenizer = HFMistralTokenizer.from_pretrained(cfg.tokenizer_config, **kwargs)
+        tokenizer = HFMistralTokenizer.from_pretrained(cfg.tokenizer_config)

        return tokenizer

@@ -159,8 +150,6 @@ def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
    if cfg.tokenizer_legacy is not None:
        # True is the default w/ https://github.com/huggingface/transformers/pull/25224
        tokenizer_kwargs["legacy"] = cfg.tokenizer_legacy
-    if cfg.revision_of_model:
-        tokenizer_kwargs["revision"] = cfg.revision_of_model

    tokenizer_cls = AutoTokenizer
    if cfg.tokenizer_type:
@@ -172,11 +161,8 @@ def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
    # Apply token string overrides if specified
    if cfg.added_tokens_overrides:
        # Modify tokenizer files and get path to modified tokenizer
-        modify_kwargs = {"output_dir": cfg.output_dir}
-        if cfg.revision_of_model:
-            modify_kwargs["revision"] = cfg.revision_of_model
        tokenizer_path = modify_tokenizer_files(
-            tokenizer_path, cfg.added_tokens_overrides, **modify_kwargs
+            tokenizer_path, cfg.added_tokens_overrides, output_dir=cfg.output_dir
        )

    tokenizer = tokenizer_cls.from_pretrained(
@@ -201,7 +187,7 @@ def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
        tokenizer.pad_token = LLAMA_DEFAULT_EOS_TOKEN

    if tokenizer.__class__.__name__ == "GPTNeoXTokenizerFast":
-        tokenizer.add_special_tokens({"pad_token": "[PAD]"})  # nosec B105
+        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # Mistral's official FA implementation requires left padding
--- a/src/axolotl/models/mamba/modeling_mamba.py
+++ b/src/axolotl/models/mamba/modeling_mamba.py
@@ -111,7 +111,6 @@ class MambaLMHeadModel(nn.Module, GenerationMixin):
        self,
        save_directory: Union[str, os.PathLike],
        state_dict: Optional[dict] = None,
-        **kwargs,
    ):
        if state_dict is None:
            state_dict = self.state_dict()
--- a/src/axolotl/monkeypatch/accelerate/fsdp2.py
+++ b/src/axolotl/monkeypatch/accelerate/fsdp2.py
@@ -150,17 +150,13 @@ def get_state_dict(self, model, unwrap=True):
            )
    elif self.is_fsdp2:
        # https://github.com/pytorch/torchtune/blob/main/torchtune/training/_distributed.py#L465
-        from torch.distributed.tensor import DTensor
-
        state_dict = {}
        sharded_state_dict = model.state_dict()
        for param_name, param in sharded_state_dict.items():
            if param.is_cpu:
                param = param.to(torch.device("cuda"))

-            if isinstance(param, DTensor):
-                param = param.full_tensor()
-
+            param = param.full_tensor()
            if torch.distributed.get_rank() == 0:
                state_dict[param_name] = param.cpu()
            torch.distributed.barrier()
@@ -186,56 +182,10 @@ def get_state_dict(self, model, unwrap=True):
    return state_dict


-def patch_peft_param_wrapper_for_fsdp2():
-    """Patch PEFT's _LoraParameterProxy.forward for FSDP2 DTensor compatibility.
-
-    PEFT's ParamWrapper applies LoRA via torch.nn.utils.parametrize, which adds
-    delta_weight to the base weight W inside _LoraParameterProxy.forward().
-    Under FSDP2, W may be a DTensor (from FSDP unshard) while delta_weight is a
-    regular Tensor (or vice versa), causing a RuntimeError on mixed types.
-
-    This patch promotes the non-DTensor operand to match the DTensor's spec
-    using DTensor.from_local(), which is free for Replicate placement (just
-    metadata wrapping, no communication).
-    """
-    from peft.tuners.lora.layer import _LoraParameterProxy
-
-    if getattr(_LoraParameterProxy, "_axolotl_fsdp2_patched", False):
-        return
-
-    _original_forward = _LoraParameterProxy.forward
-
-    # NOTE: Replaces (not wraps) forward; assumes original is just `W + self.delta_weight`.
-    def _patched_forward(self, W):
-        from torch.distributed.tensor import DTensor
-
-        delta = self.delta_weight
-        w_is_dt = isinstance(W, DTensor)
-        d_is_dt = isinstance(delta, DTensor)
-
-        with torch.nn.utils.parametrize.cached():
-            if w_is_dt == d_is_dt:
-                return W + delta
-            if w_is_dt:
-                return W + DTensor.from_local(delta, W.device_mesh, W.placements)
-            return DTensor.from_local(W, delta.device_mesh, delta.placements) + delta
-
-    _LoraParameterProxy.forward = _patched_forward
-    _LoraParameterProxy._axolotl_fsdp2_patched = True
-    LOG.info("Patched PEFT _LoraParameterProxy.forward for FSDP2 DTensor compatibility")
-
-
 def _process_lora_module_for_fsdp(module, fsdp2_kwargs):
    """Helper function to process LoRA modules for FSDP2."""
-    from peft.tuners.lora.layer import ParamWrapper
    from torch.distributed.fsdp import fully_shard

-    # Skip ParamWrapper — its lora_A/B must not be independently sharded.
-    # The parent decoder layer's FSDP wrapper handles unsharding them.
-    # TODO: review if we even need to shard them separately in first place.
-    if isinstance(module, ParamWrapper):
-        return False
-
    log_bias_dtype_mismatch = False

    # Linear4Bit will keep it's bias term in fp32. If the weight dtype is in bf16 we are not able to
@@ -252,20 +202,12 @@ def _process_lora_module_for_fsdp(module, fsdp2_kwargs):
            fully_shard(module.lora_A[active_adapter], **fsdp2_kwargs)
        if module.lora_B:
            fully_shard(module.lora_B[active_adapter], **fsdp2_kwargs)
+        if module.lora_embedding_A:
+            fully_shard(module.lora_embedding_A[active_adapter], **fsdp2_kwargs)
+        if module.lora_embedding_B:
+            fully_shard(module.lora_embedding_B[active_adapter], **fsdp2_kwargs)
        if module.lora_magnitude_vector:
            fully_shard(module.lora_magnitude_vector[active_adapter], **fsdp2_kwargs)
-
-    # lora_embedding_A/B are ParameterDicts containing nn.Parameter (Tensors),
-    # not nn.Module. fully_shard() only accepts nn.Module, so we cannot shard
-    # individual embedding Parameters. Instead, shard the entire LoraLayer module. fully_shard() can be used hierarchically because it does not
-    # override groups already assigned by fully_shard(), so modules
-    # where fully_shard() was already called are not affected [see https://docs.pytorch.org/docs/stable/distributed.fsdp.fully_shard.html]
-    if module.lora_embedding_A or module.lora_embedding_B:
-        from torch.distributed.fsdp import FSDPModule
-
-        if not isinstance(module, FSDPModule):
-            fully_shard(module, **fsdp2_kwargs)
-
    return log_bias_dtype_mismatch


@@ -385,14 +327,6 @@ def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module:

    is_peft_model = isinstance(model, PeftModel)

-    # Patch PEFT's _LoraParameterProxy for DTensor compatibility if any
-    # ParamWrapper modules exist (used for target_parameters / 3D expert params).
-    if is_peft_model:
-        from peft.tuners.lora.layer import ParamWrapper
-
-        if any(isinstance(m, ParamWrapper) for m in model.modules()):
-            patch_peft_param_wrapper_for_fsdp2()
-
    auto_wrap_policy = fsdp2_prepare_auto_wrap_policy(fsdp2_plugin, model)
    log_bias_dtype_mismatch = False
    if auto_wrap_policy is not None:
@@ -442,83 +376,6 @@ def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module:
    return model


-def patch_tied_keys_for_meta_device():
-    """Patch _adjust_tied_keys_with_tied_pointers to skip meta tensors.
-
-    Meta tensors all share data_ptr()==0, causing every parameter to be incorrectly
-    grouped as "tied". Skipping them is safe since they have no real storage.
-    """
-    from collections import defaultdict
-
-    from transformers import PreTrainedModel
-
-    def _patched_adjust_tied_keys_with_tied_pointers(self, missing_keys):
-        param_pointers = defaultdict(list)
-        for param_name, param_value in self.state_dict().items():
-            if param_value.is_meta:
-                continue
-            param_pointers[param_value.data_ptr()].append(param_name)
-
-        tied_param_names = [
-            names
-            for names in param_pointers.values()
-            if len(names) > 1
-            and not any(name in self.all_tied_weights_keys.keys() for name in names)
-            and not all(name in missing_keys for name in names)
-        ]
-
-        tied_weights_keys_by_pointers = {
-            param_name: group[0]
-            for group in tied_param_names
-            for param_name in group[1:]
-        }
-        self.all_tied_weights_keys.update(tied_weights_keys_by_pointers)
-
-    PreTrainedModel._adjust_tied_keys_with_tied_pointers = (
-        _patched_adjust_tied_keys_with_tied_pointers
-    )
-
-
-def patch_initialize_missing_keys_for_fsdp():
-    """Patch _initialize_missing_keys to skip re-initialization on FSDP non-rank-0.
-
-    When using cpu_ram_efficient_loading, non-rank-0 processes load weights on
-    meta device and move them to CPU as empty tensors. Without this patch,
-    initialize_weights() re-initializes ALL parameters (via guarded init
-    functions), which is slow and uses extra RAM per process.
-
-    The fix marks all params/buffers with _is_hf_initialized=True before calling
-    the original method, so guarded init functions (init.normal_, init.zeros_,
-    etc.) become no-ops on non-rank-0 processes. The real weights arrive later
-    via FSDP broadcast from rank 0.
-
-    Upstream fix: https://github.com/huggingface/transformers/pull/44473
-    Remove this patch once transformers includes the fix in a stable release.
-    """
-    from transformers import PreTrainedModel
-    from transformers.modeling_utils import is_fsdp_enabled, is_local_dist_rank_0
-
-    if getattr(PreTrainedModel._initialize_missing_keys, "_axolotl_patched", False):
-        return
-
-    _original_initialize_missing_keys = PreTrainedModel._initialize_missing_keys
-
-    def _patched_initialize_missing_keys(self, is_quantized: bool) -> None:
-        if is_fsdp_enabled() and not is_local_dist_rank_0():
-            for key in self.state_dict():
-                try:
-                    param_or_buffer = self.get_parameter_or_buffer(key)
-                    param_or_buffer._is_hf_initialized = True
-                except AttributeError:
-                    pass  # may happen when handling pre-quantized weights
-            self._is_hf_initialized = True
-
-        _original_initialize_missing_keys(self, is_quantized)
-
-    PreTrainedModel._initialize_missing_keys = _patched_initialize_missing_keys
-    PreTrainedModel._initialize_missing_keys._axolotl_patched = True
-
-
 def patch_accelerate_fsdp2():
    import accelerate

--- a/src/axolotl/monkeypatch/fsdp2_qlora.py
+++ b/src/axolotl/monkeypatch/fsdp2_qlora.py
@@ -1,10 +1,9 @@
 """
-Monkeypatch to add Params4bit and Int8Params support to FSDP2. This enables QLoRA + FSDP2
-and 8-bit LoRA + FSDP2, as well as our LoRA / QLoRA Triton kernels to work with FSDP2.
+Monkeypatch to add Params4bit support to FSDP2. This enables QLoRA + FSDP2, as well as
+our LoRA / QLoRA Triton kernels to work with FSDP2.

-This patch modifies the _init_sharded_param and init_unsharded_param methods in FSDPParam
-to handle bitsandbytes Params4bit and Int8Params parameters, preserving their quantization
-metadata through the FSDP2 shard/unshard cycle.
+This patch modifies the _init_sharded_param method in FSDPParam to handle bitsandbytes
+Params4bit parameters.
 """

 import importlib
@@ -18,8 +17,6 @@ LOG = get_logger(__name__)

 def apply_init_sharded_param_patch():
    """Apply patch to FSDPParam._init_sharded_param to support Params4bit."""
-    if getattr(apply_init_sharded_param_patch, "_axolotl_patched", False):
-        return
    from torch.distributed.fsdp._fully_shard._fsdp_param import FSDPParam

    # Get original source
@@ -44,20 +41,9 @@ def apply_init_sharded_param_patch():
            bnb_quantized=param.bnb_quantized,
        )
        self.sharded_param = self.to_sharded_dtensor(self.sharded_param)
-    elif isinstance(param, bnb.nn.modules.Int8Params):
-        self.sharded_param = bnb.nn.modules.Int8Params(
-            data=sharded_param,
-            requires_grad=param.requires_grad,
-            has_fp16_weights=param.has_fp16_weights,
-            CB=None,
-            SCB=param.SCB,
-        )
-        self.sharded_param = self.to_sharded_dtensor(self.sharded_param)
    else:
-        self.sharded_param = nn.Parameter(
-            self.to_sharded_dtensor(sharded_param),
-            requires_grad=param.requires_grad,
-        )"""
+        self.sharded_param = nn.Parameter(self.to_sharded_dtensor(sharded_param))
+        self.sharded_param.requires_grad_(param.requires_grad)"""

    # Apply the replacement
    if original_param_creation in original_source:
@@ -87,7 +73,6 @@ def apply_init_sharded_param_patch():

        # Replace the method
        FSDPParam._init_sharded_param = patched_init_sharded_param
-        apply_init_sharded_param_patch._axolotl_patched = True
        LOG.info("Successfully applied FSDP _init_sharded_param patch")
    else:
        LOG.warning("Could not find target code for _init_sharded_param patching")
@@ -95,8 +80,6 @@ def apply_init_sharded_param_patch():

 def apply_init_unsharded_param_patch():
    """Apply patch to FSDPParam.init_unsharded_param to support Params4bit."""
-    if getattr(apply_init_unsharded_param_patch, "_axolotl_patched", False):
-        return
    from torch.distributed.fsdp._fully_shard._fsdp_param import FSDPParam

    # Get original source
@@ -122,14 +105,6 @@ def apply_init_unsharded_param_patch():
                module=local_tensor.module,
                bnb_quantized=local_tensor.bnb_quantized,
            )
-        elif isinstance(local_tensor, bnb.nn.modules.Int8Params):
-            self._unsharded_param = bnb.nn.modules.Int8Params(
-                data=unsharded_param,
-                requires_grad=self.sharded_param.requires_grad,
-                has_fp16_weights=local_tensor.has_fp16_weights,
-                CB=unsharded_param,
-                SCB=local_tensor.SCB,
-            )
        else:
            self._unsharded_param = nn.Parameter(
                unsharded_param, requires_grad=self.sharded_param.requires_grad
@@ -163,74 +138,6 @@ def apply_init_unsharded_param_patch():

        # Replace the method
        FSDPParam.init_unsharded_param = patched_init_unsharded_param
-        apply_init_unsharded_param_patch._axolotl_patched = True
        LOG.info("Successfully applied FSDP init_unsharded_param patch")
    else:
        LOG.warning("Could not find target code for patching")
-
-
-def apply_linear8bitlt_save_patch():
-    """Patch Linear8bitLt._save_to_state_dict to handle DTensor-wrapped Int8Params.
-
-    After FSDP2 sharding, Linear8bitLt.weight is a DTensor wrapping Int8Params.
-    BnB's _save_to_state_dict accesses self.weight.SCB directly, but DTensor
-    doesn't proxy custom attribute access to its _local_tensor. This patch
-    temporarily unwraps the DTensor during saving so BnB can find the SCB attribute.
-    """
-    if getattr(apply_linear8bitlt_save_patch, "_axolotl_patched", False):
-        return
-    import bitsandbytes as bnb
-    from torch.distributed.tensor import DTensor
-
-    original_save = bnb.nn.Linear8bitLt._save_to_state_dict
-
-    def _patched_save_to_state_dict(self, destination, prefix, keep_vars):
-        # Use _parameters dict directly to bypass nn.Module.__setattr__ type check.
-        weight = self._parameters["weight"]
-        unwrapped = False
-        if isinstance(weight, DTensor) and hasattr(weight, "_local_tensor"):
-            self._parameters["weight"] = weight._local_tensor
-            unwrapped = True
-        try:
-            original_save(self, destination, prefix, keep_vars)
-        finally:
-            if unwrapped:
-                self._parameters["weight"] = weight
-
-    bnb.nn.Linear8bitLt._save_to_state_dict = _patched_save_to_state_dict
-    apply_linear8bitlt_save_patch._axolotl_patched = True
-    LOG.info("Patched Linear8bitLt._save_to_state_dict for DTensor compatibility")
-
-
-def apply_init_dtype_attrs_patch():
-    """Prevent FSDP2 mixed precision from casting non-float quantized params.
-
-    When mixed precision is enabled (e.g., bf16), FSDP2's init_dtype_attrs sets
-    param_dtype=bf16 for ALL params. During all-gather, _to_dtype_if_needed casts
-    the sharded param to param_dtype. For non-float params (uint8 packed 4-bit,
-    int8 quantized) without FSDP2 extensions, this destroys the quantized data.
-
-    Params4bit handles this via fsdp_pre/post_all_gather extensions, but our
-    parametrize-based expert quantization uses plain nn.Parameter(uint8/int8)
-    without extensions.
-    """
-    if getattr(apply_init_dtype_attrs_patch, "_axolotl_patched", False):
-        return
-    from torch.distributed.fsdp._fully_shard._fsdp_param import FSDPParam
-
-    original_init_dtype_attrs = FSDPParam.init_dtype_attrs
-
-    def patched_init_dtype_attrs(self, mp_policy):
-        original_init_dtype_attrs(self, mp_policy)
-        # Skip casting non-float quantized params (uint8/int8) without FSDP2
-        # extensions — the parametrization chain handles dequantization.
-        if self.param_dtype is not None and not self.sharded_param.is_floating_point():
-            local = self.sharded_param
-            if hasattr(local, "_local_tensor"):
-                local = local._local_tensor
-            if not hasattr(local, "fsdp_pre_all_gather"):
-                self.param_dtype = None
-
-    FSDPParam.init_dtype_attrs = patched_init_dtype_attrs
-    apply_init_dtype_attrs_patch._axolotl_patched = True
-    LOG.info("Patched FSDPParam.init_dtype_attrs for non-float quantized params")
--- a/src/axolotl/monkeypatch/models/qwen3_5/init.py
+++ b/src/axolotl/monkeypatch/models/qwen3_5/init.py
--- a/src/axolotl/monkeypatch/models/qwen3_5/modeling.py
+++ b/src/axolotl/monkeypatch/models/qwen3_5/modeling.py
@@ -1,291 +0,0 @@
-"""Monkeypatch for Qwen3_5 and Qwen3_5Moe models to pass position_ids to linear attention."""
-
-import importlib
-from typing import Optional, Tuple
-
-import torch
-import torch.nn.functional as F
-
-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
-
-try:
-    from fla.modules.convolution import (
-        causal_conv1d as fla_causal_conv1d,  # FLA >= 0.4.1
-    )
-except ImportError:
-    try:
-        from fla.modules.conv import causal_conv1d as fla_causal_conv1d  # FLA < 0.4.1
-    except ImportError:
-        fla_causal_conv1d = None
-
-
-def get_cu_seqlens(position_ids):
-    """
-    Compute cumulative sequence lengths from position_ids for FLA varlen kernels.
-
-    Adapted from transformers.modeling_flash_attention_utils.prepare_fa_kwargs_from_position_ids.
-    https://github.com/huggingface/transformers/blob/0f1b128d3359a26bd18be99c26d7f04fb3cba914/src/transformers/modeling_flash_attention_utils.py#L316
-
-    Qwen3.5 uses MRoPE: position_ids arrive as [axes, B, T]. All axes carry the
-    same temporal positions, so axis 0 is used to recover the [B, T] layout.
-    See: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen3_5/modeling_qwen3_5.py
-    """
-    if position_ids.ndim == 3:
-        position_ids = position_ids[0]
-
-    tensor_kwargs = {"dtype": torch.int32, "device": position_ids.device}
-    position_ids = position_ids.view(-1)
-    indices_q = (position_ids == 0).nonzero().view(-1)
-    return torch.cat(
-        (
-            indices_q.to(**tensor_kwargs),
-            torch.tensor(position_ids.size(), **tensor_kwargs),
-        )
-    )
-
-
-def _inject_fla_kernels(module) -> None:
-    """Inject FLA kernels into a modeling module, bypassing is_flash_linear_attention_available."""
-    try:
-        from fla.modules import FusedRMSNormGated
-        from fla.ops.gated_delta_rule import (
-            chunk_gated_delta_rule,
-            fused_recurrent_gated_delta_rule,
-        )
-
-        module.FusedRMSNormGated = FusedRMSNormGated
-        module.chunk_gated_delta_rule = chunk_gated_delta_rule
-        module.fused_recurrent_gated_delta_rule = fused_recurrent_gated_delta_rule
-        module.is_fast_path_available = True
-    except ImportError:
-        module.chunk_gated_delta_rule = None
-        module.fused_recurrent_gated_delta_rule = None
-        module.FusedRMSNormGated = None
-
-
-def _patched_decoder_forward(
-    self,
-    hidden_states: torch.Tensor,
-    position_embeddings: Tuple[torch.Tensor, torch.Tensor],
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values=None,
-    cache_position: Optional[torch.LongTensor] = None,
-    **kwargs,
-) -> torch.FloatTensor:
-    """Decoder layer forward that passes position_ids through to linear attention."""
-    residual = hidden_states
-    hidden_states = self.input_layernorm(hidden_states)
-
-    if self.layer_type == "linear_attention":
-        hidden_states = self.linear_attn(
-            hidden_states=hidden_states,
-            cache_params=past_key_values,
-            cache_position=cache_position,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-        )
-    elif self.layer_type == "full_attention":
-        hidden_states, _ = self.self_attn(
-            hidden_states=hidden_states,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            cache_position=cache_position,
-            position_embeddings=position_embeddings,
-            **kwargs,
-        )
-
-    hidden_states = residual + hidden_states
-
-    residual = hidden_states
-    hidden_states = self.post_attention_layernorm(hidden_states)
-    hidden_states = self.mlp(hidden_states)
-    if isinstance(hidden_states, tuple):  # MoE returns (hidden_states, router_logits)
-        hidden_states, _ = hidden_states
-    hidden_states = residual + hidden_states
-
-    return hidden_states
-
-
-def _make_qwen3_5_gated_delta_forward(apply_mask_fn):
-    """Factory for patched Qwen3_5/Qwen3_5Moe GatedDeltaNet forward with packing support."""
-
-    def patched_forward(
-        self,
-        hidden_states: torch.Tensor,
-        cache_params=None,
-        cache_position: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-    ):
-        hidden_states = apply_mask_fn(hidden_states, attention_mask)
-
-        batch_size, seq_len, _ = hidden_states.shape
-
-        use_precomputed_states = (
-            cache_params is not None
-            and cache_params.has_previous_state
-            and seq_len == 1
-            and cache_position is not None
-        )
-
-        cu_seqlens = None
-        if not use_precomputed_states and position_ids is not None:
-            cu_seqlens = get_cu_seqlens(position_ids=position_ids)
-
-        if cache_params is not None:
-            conv_state = cache_params.conv_states[self.layer_idx]
-            recurrent_state = cache_params.recurrent_states[self.layer_idx]
-
-        # mixed_qkv stays [B, T, D]; only transposed inside paths that require [B, D, T]
-        mixed_qkv = self.in_proj_qkv(hidden_states)  # [B, T, D]
-
-        z = self.in_proj_z(hidden_states)
-        z = z.reshape(batch_size, seq_len, -1, self.head_v_dim)
-
-        b = self.in_proj_b(hidden_states)
-        a = self.in_proj_a(hidden_states)
-
-        if use_precomputed_states:
-            mixed_qkv = self.causal_conv1d_update(
-                mixed_qkv.transpose(1, 2),
-                conv_state,
-                self.conv1d.weight.squeeze(1),
-                self.conv1d.bias,
-                self.activation,
-            ).transpose(1, 2)
-        else:
-            if cache_params is not None:
-                mixed_qkv_t = mixed_qkv.transpose(1, 2)
-                cache_params.conv_states[self.layer_idx] = F.pad(
-                    mixed_qkv_t,
-                    (self.conv_kernel_size - mixed_qkv_t.shape[-1], 0),
-                )
-
-            if fla_causal_conv1d is not None and cu_seqlens is not None:
-                # FLA varlen kernel for packed sequences; input must be contiguous [B, T, D]
-                mixed_qkv, _ = fla_causal_conv1d(
-                    x=mixed_qkv,
-                    weight=self.conv1d.weight.squeeze(1),
-                    bias=self.conv1d.bias,
-                    activation=self.activation,
-                    cu_seqlens=cu_seqlens,
-                )
-            else:
-                if cu_seqlens is not None and fla_causal_conv1d is None:
-                    raise RuntimeError(
-                        "Packed sequences require fla.modules.convolution.causal_conv1d "
-                        "(cu_seqlens support). Install flash-linear-attention or disable packing."
-                    )
-                mixed_qkv = F.silu(
-                    self.conv1d(mixed_qkv.transpose(1, 2))[:, :, :seq_len]
-                ).transpose(1, 2)
-
-        query, key, value = torch.split(
-            mixed_qkv,
-            [self.key_dim, self.key_dim, self.value_dim],
-            dim=-1,
-        )
-        query = query.reshape(batch_size, seq_len, -1, self.head_k_dim)
-        key = key.reshape(batch_size, seq_len, -1, self.head_k_dim)
-        value = value.reshape(batch_size, seq_len, -1, self.head_v_dim)
-
-        beta = b.sigmoid()
-        g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
-        if self.num_v_heads // self.num_k_heads > 1:
-            query = query.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)
-            key = key.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)
-
-        if not use_precomputed_states:
-            core_attn_out, last_recurrent_state = self.chunk_gated_delta_rule(
-                query,
-                key,
-                value,
-                g=g.to(dtype=query.dtype),
-                beta=beta,
-                initial_state=None,
-                output_final_state=cache_params is not None,
-                use_qk_l2norm_in_kernel=True,
-                # torch_chunk_gated_delta_rule fallback does not accept cu_seqlens
-                **({"cu_seqlens": cu_seqlens} if cu_seqlens is not None else {}),
-            )
-        else:
-            core_attn_out, last_recurrent_state = self.recurrent_gated_delta_rule(
-                query,
-                key,
-                value,
-                g=g.to(dtype=query.dtype),
-                beta=beta,
-                initial_state=recurrent_state,
-                output_final_state=cache_params is not None,
-                use_qk_l2norm_in_kernel=True,
-            )
-
-        if cache_params is not None:
-            cache_params.recurrent_states[self.layer_idx] = last_recurrent_state
-
-        core_attn_out = core_attn_out.reshape(-1, self.head_v_dim)
-        z = z.reshape(-1, self.head_v_dim)
-        core_attn_out = self.norm(core_attn_out, z)
-        core_attn_out = core_attn_out.reshape(batch_size, seq_len, -1)
-
-        return self.out_proj(core_attn_out)
-
-    return patched_forward
-
-
-def _apply_packing_patches(model_type: str, cls_prefix: str, forward_factory) -> None:
-    module_name = f"transformers.models.{model_type}.modeling_{model_type}"
-
-    try:
-        module = importlib.import_module(module_name)
-    except ImportError:
-        LOG.warning(f"{model_type} not found in transformers, skipping packing patches")
-        return
-
-    _inject_fla_kernels(module)
-    getattr(module, f"{cls_prefix}DecoderLayer").forward = _patched_decoder_forward
-    gated_cls = getattr(module, f"{cls_prefix}GatedDeltaNet")
-    gated_cls.forward = forward_factory(module.apply_mask_to_padding_states)
-
-    LOG.info(
-        f"Applied {cls_prefix} packing patch "
-        f"(fla_causal_conv1d={'available' if fla_causal_conv1d else 'unavailable'})"
-    )
-
-
-def patch_qwen3_5_modeling_packing():
-    _apply_packing_patches("qwen3_5", "Qwen3_5", _make_qwen3_5_gated_delta_forward)
-
-
-def patch_qwen3_5_moe_modeling_packing():
-    _apply_packing_patches(
-        "qwen3_5_moe", "Qwen3_5Moe", _make_qwen3_5_gated_delta_forward
-    )
-
-
-def patch_qwen3_5_vlm_flash_attention():
-    """
-    Patch _is_packed_sequence to handle Qwen3.5's 3-D MRoPE position_ids.
-
-    transformers passes position_ids as [axes, B, T] to decoder layers, but
-    _is_packed_sequence only handles 2-D tensors and mis-classifies the 3-D
-    shape as a packed-sequence indicator, causing CUDA errors in the varlen path.
-    """
-    try:
-        import transformers.modeling_flash_attention_utils as fa_utils
-
-        _original = fa_utils._is_packed_sequence
-
-        def _patched(position_ids, batch_size):
-            if position_ids is not None and position_ids.ndim != 2:
-                return False
-            return _original(position_ids, batch_size)
-
-        fa_utils._is_packed_sequence = _patched
-        LOG.info("Applied Qwen3.5 VLM flash-attention patch (3-D MRoPE position_ids)")
-    except Exception as exc:  # pragma: no cover
-        LOG.warning(f"Failed to apply Qwen3.5 VLM flash-attention patch: {exc}")
--- a/src/axolotl/monkeypatch/models/qwen3_next/modeling.py
+++ b/src/axolotl/monkeypatch/models/qwen3_next/modeling.py
@@ -9,11 +9,6 @@ from axolotl.utils.logging import get_logger

 LOG = get_logger(__name__)

-try:
-    from fla.modules.convolution import causal_conv1d as fla_causal_conv1d
-except ImportError:
-    fla_causal_conv1d = None
-

 def get_cu_seqlens(position_ids):
    """
@@ -142,11 +137,6 @@ def patch_qwen3_next_gateddelta_layer():
            and cache_position is not None
        )

-        # Compute cu_seqlens early for use by both causal_conv1d and chunk_gated_delta_rule
-        cu_seqlens = None
-        if not use_precomputed_states and position_ids is not None:
-            cu_seqlens = get_cu_seqlens(position_ids=position_ids)
-
        # getting projected states from cache if it exists
        if cache_params is not None:
            conv_state = cache_params.conv_states[self.layer_idx]
@@ -161,11 +151,12 @@ def patch_qwen3_next_gateddelta_layer():
            x.reshape(x.shape[0], x.shape[1], -1) for x in (query, key, value)
        )

-        mixed_qkv = torch.cat((query, key, value), dim=-1)  # [B, T, D]
+        mixed_qkv = torch.cat((query, key, value), dim=-1)
+        mixed_qkv = mixed_qkv.transpose(1, 2)

        if use_precomputed_states:
-            # Inference single-token path: causal_conv1d_update expects [B, D, T]
-            mixed_qkv = mixed_qkv.transpose(1, 2)
+            # 2. Convolution sequence transformation
+            # NOTE: the conv state is updated in `causal_conv1d_update`
            mixed_qkv = self.causal_conv1d_update(
                mixed_qkv,
                conv_state,
@@ -173,41 +164,24 @@ def patch_qwen3_next_gateddelta_layer():
                self.conv1d.bias,
                self.activation,
            )
-            mixed_qkv = mixed_qkv.transpose(1, 2)
        else:
            if cache_params is not None:
-                # Cache state expects [B, D, T] for the inference update path
-                mixed_qkv_t = mixed_qkv.transpose(1, 2)
                conv_state = F.pad(
-                    mixed_qkv_t,
-                    (self.conv_kernel_size - mixed_qkv_t.shape[-1], 0),
+                    mixed_qkv, (self.conv_kernel_size - mixed_qkv.shape[-1], 0)
                )
                cache_params.conv_states[self.layer_idx] = conv_state
-
-            if fla_causal_conv1d is not None:
-                # FLA Triton causal_conv1d: [B, T, D] in/out, with cu_seqlens support
-                mixed_qkv, _ = fla_causal_conv1d(
+            if self.causal_conv1d_fn is not None:
+                mixed_qkv = self.causal_conv1d_fn(
                    x=mixed_qkv,
                    weight=self.conv1d.weight.squeeze(1),
                    bias=self.conv1d.bias,
                    activation=self.activation,
-                    cu_seqlens=cu_seqlens,
+                    seq_idx=None,
                )
            else:
-                # PyTorch fallback (no cu_seqlens support)
-                if cu_seqlens is not None and cu_seqlens.shape[0] > batch_size + 1:
-                    raise RuntimeError(
-                        "Packed sequences require fla.modules.convolution.causal_conv1d "
-                        "(cu_seqlens support). Install flash-linear-attention or disable packing."
-                    )
-                LOG.warning_once(
-                    "FLA causal_conv1d not available. Falling back to PyTorch conv1d."
-                )
-                mixed_qkv = mixed_qkv.transpose(1, 2)
                mixed_qkv = F.silu(self.conv1d(mixed_qkv)[:, :, :seq_len])
-                mixed_qkv = mixed_qkv.transpose(1, 2)

-        # mixed_qkv is [B, T, D] in all paths
+        mixed_qkv = mixed_qkv.transpose(1, 2)
        query, key, value = torch.split(
            mixed_qkv,
            [
@@ -229,6 +203,7 @@ def patch_qwen3_next_gateddelta_layer():
            key = key.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)

        if not use_precomputed_states:
+            cu_seqlens = get_cu_seqlens(position_ids=position_ids)
            core_attn_out, last_recurrent_state = self.chunk_gated_delta_rule(
                query,
                key,
--- a/src/axolotl/monkeypatch/moe_quant.py
+++ b/src/axolotl/monkeypatch/moe_quant.py
@@ -1,198 +0,0 @@
-"""
-Loading-time quantization for MoE expert weights stored as 3D nn.Parameter tensors.
-
-In transformers v5, MoE models store expert weights as fused 3D tensors that BnB
-skips (only targets nn.Linear). This module patches weight loading to quantize them
-on-the-fly (4-bit via bitsandbytes parametrize, 8-bit via custom int8 parametrization),
-reducing peak VRAM from "all experts in bf16" to "one expert at a time."
-"""
-
-import os
-
-import bitsandbytes as bnb
-import torch
-import torch.nn.utils.parametrize as P
-
-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
-
-# Module-level state for the loading-time quantization patch.
-_moe_load_state = {
-    "count": 0,
-    "mode": "4bit",
-    "quant_type": "nf4",
-    "compress_statistics": True,
-    "patched": False,
-}
-
-
-class Bnb8bitParametrization(torch.nn.Module):
-    """Parametrization that dequantizes int8 row-wise quantized data on access."""
-
-    def __init__(self, row_stats: torch.Tensor):
-        super().__init__()
-        self.register_buffer("row_stats", row_stats)
-
-    @torch.no_grad()
-    def forward(self, quantized_param: torch.Tensor) -> torch.Tensor:
-        # Flatten 3D+ to 2D for BnB's dequant, then reshape back.
-        orig_shape = quantized_param.shape
-        if quantized_param.ndim > 2:
-            quantized_param = quantized_param.reshape(-1, orig_shape[-1])
-        result = bnb.functional.int8_vectorwise_dequant(quantized_param, self.row_stats)
-        return result.reshape(orig_shape)
-
-
-def _enable_parametrization_cache(module, inputs):
-    P._cache_enabled += 1
-
-
-def _disable_parametrization_cache(module, inputs, output):
-    P._cache_enabled -= 1
-    if not P._cache_enabled:
-        P._cache = {}
-
-
-def replace_parameter_8bit(module, param_name):
-    """Replace a module parameter with an 8-bit quantized version using parametrization."""
-    original_param = getattr(module, param_name)
-    int8_data, row_stats, _ = bnb.functional.int8_vectorwise_quant(
-        original_param.data.to(torch.float16)
-    )
-
-    setattr(module, param_name, torch.nn.Parameter(int8_data, requires_grad=False))
-    del original_param
-
-    P.register_parametrization(
-        module, param_name, Bnb8bitParametrization(row_stats), unsafe=True
-    )
-
-    # Cache dequantized values during forward to avoid redundant dequantization.
-    if not getattr(module, "_axolotl_8bit_hooks_registered", False):
-        module.register_forward_pre_hook(_enable_parametrization_cache)
-        module.register_forward_hook(_disable_parametrization_cache)
-        module._axolotl_8bit_hooks_registered = True
-
-
-def patch_moe_quantization_on_load(cfg):
-    """Patch transformers' weight loading to quantize MoE expert params on-the-fly.
-
-    Wraps ``set_param_for_module`` so that 3D+ CUDA tensors with "expert" in their
-    name are quantized (4-bit or 8-bit) as they're loaded, keeping peak VRAM low.
-    """
-    mode = "8bit" if getattr(cfg, "load_in_8bit", False) else "4bit"
-    _moe_load_state["mode"] = mode
-    _moe_load_state["count"] = 0
-
-    if _moe_load_state["patched"]:
-        LOG.debug("MoE loading-time quantization patch already active")
-        return
-
-    import transformers.core_model_loading
-    import transformers.modeling_utils
-
-    if mode == "4bit":
-        from bitsandbytes.nn.parametrize import replace_parameter_4bit
-
-        quant_type = getattr(cfg, "bnb_4bit_quant_type", None) or "nf4"
-        compress_statistics = getattr(cfg, "bnb_4bit_use_double_quant", None)
-        if compress_statistics is None:
-            compress_statistics = True
-
-        _moe_load_state["quant_type"] = quant_type
-        _moe_load_state["compress_statistics"] = compress_statistics
-
-    # Disable async tensor loading.  Transformers' convert_and_load_state_dict_in_model
-    # uses a ThreadPoolExecutor to materialise tensors (move from safetensors → CUDA)
-    # ahead of time.  With MoE models this pre-fetches many large bf16 expert tensors
-    # onto the GPU simultaneously — long before our set_param_for_module patch can
-    # quantise and free them one-by-one — causing OOM even at <5 % of weights loaded.
-    # Sequential loading ensures only ONE bf16 expert tensor is on-GPU at a time.
-    os.environ["HF_DEACTIVATE_ASYNC_LOAD"] = "1"
-
-    # Disable caching_allocator_warmup — it pre-allocates a huge tensor at bf16
-    # size for all params, defeating our on-load quantization VRAM savings.
-    def _noop_warmup(*args, **kwargs):
-        pass
-
-    transformers.modeling_utils.caching_allocator_warmup = _noop_warmup
-
-    original_set_param = transformers.core_model_loading.set_param_for_module
-
-    def _patched_set_param_for_module(model, target_name, param_value, *args, **kwargs):
-        original_set_param(model, target_name, param_value, *args, **kwargs)
-
-        # Quantize 3D+ expert params that BnB skipped (only on CUDA).
-        if param_value.ndim >= 3 and param_value.is_cuda:
-            mod_path, _, pname = target_name.rpartition(".")
-            mod = model.get_submodule(mod_path) if mod_path else model
-            if not isinstance(mod, (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt)):
-                if "expert" not in target_name.lower():
-                    LOG.debug(
-                        "Skipping non-expert 3D param: %s (shape=%s)",
-                        target_name,
-                        list(param_value.shape),
-                    )
-                    return
-
-                if _moe_load_state["mode"] == "4bit":
-                    replace_parameter_4bit(
-                        mod,
-                        pname,
-                        compress_statistics=_moe_load_state["compress_statistics"],
-                        quant_type=_moe_load_state["quant_type"],
-                    )
-                else:
-                    replace_parameter_8bit(mod, pname)
-                _moe_load_state["count"] += 1
-
-                # Release the bf16 tensor so CUDA memory is freed immediately.
-                param_value.data = torch.empty(0, device="cpu")
-                torch.cuda.empty_cache()
-
-    transformers.core_model_loading.set_param_for_module = _patched_set_param_for_module
-    _moe_load_state["patched"] = True
-
-
-def get_moe_quantized_count():
-    """Return the number of expert parameters quantized during loading."""
-    return _moe_load_state["count"]
-
-
-def patch_peft_target_parameters_matching():
-    """Fix PEFT's _inject_parameters to use suffix matching for parametrized modules."""
-    if getattr(patch_peft_target_parameters_matching, "_axolotl_patched", False):
-        return
-    from peft.tuners.tuners_utils import BaseTuner
-
-    original_inject = BaseTuner._inject_parameters
-
-    def _patched_inject_parameters(
-        self, peft_config, model, adapter_name, low_cpu_mem_usage
-    ):
-        # Patch target_parameters to use full paths for parametrized modules
-        original_targets = list(peft_config.target_parameters)
-        expanded = set(original_targets)
-
-        for module_name, module in model.named_modules():
-            if not hasattr(module, "parametrizations"):
-                continue
-            for target in original_targets:
-                mod_path, _, param_name = target.rpartition(".")
-                if (
-                    module_name == mod_path or module_name.endswith("." + mod_path)
-                ) and hasattr(module, param_name):
-                    expanded.add(f"{module_name}.{param_name}")
-
-        peft_config.target_parameters = sorted(expanded)
-        try:
-            return original_inject(
-                self, peft_config, model, adapter_name, low_cpu_mem_usage
-            )
-        finally:
-            peft_config.target_parameters = original_targets
-
-    BaseTuner._inject_parameters = _patched_inject_parameters
-    patch_peft_target_parameters_matching._axolotl_patched = True
-    LOG.info("Patched PEFT _inject_parameters for parametrized module suffix matching")
--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -22,8 +22,6 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
    "qwen3",
    "qwen3_moe",
    "qwen3_next",
-    "qwen3_5",
-    "qwen3_5_moe",
    "falcon",
    "phi",
    "phi3",
@@ -39,7 +37,6 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
    "deepseek_v3",
    "glm",
    "glm4",
-    "glm4_moe",
    "smollm3",
    "granite",
    "granitemoe",
--- a/src/axolotl/processing_strategies.py
+++ b/src/axolotl/processing_strategies.py
@@ -258,32 +258,6 @@ class Qwen2VLProcessingStrategy(ProcessingStrategy):
        )


-class Qwen3_5ProcessingStrategy(ProcessingStrategy):
-    """Processing Strategy class for Qwen3.5 (early-fusion VLM)"""
-
-    def __init__(
-        self,
-        processor: ProcessorMixin,
-        chat_template: Optional[str] = None,
-        image_size: int | tuple[int, int] | None = None,
-        image_resize_algorithm: Resampling | None = None,
-    ):
-        super().__init__(processor, chat_template, image_size, image_resize_algorithm)
-        self.image_token = "<|image_pad|>"  # nosec
-        self.image_token_id = processor.tokenizer.convert_tokens_to_ids(
-            self.image_token
-        )
-        self.video_token = "<|video_pad|>"  # nosec
-        self.video_token_id = processor.tokenizer.convert_tokens_to_ids(
-            self.video_token
-        )
-
-    def process_labels(self, input_ids):
-        labels = super().process_labels(input_ids)
-        labels[labels == self.video_token_id] = -100
-        return labels
-
-
 class Gemma3ProcessingStrategy(ProcessingStrategy):
    """Processing Strategy class for Gemma3"""

@@ -588,10 +562,6 @@ def get_processing_strategy(
        return Qwen2VLProcessingStrategy(
            **processing_kwargs,
        )
-    if chat_template_type in ["qwen3_5", "qwen3_5_moe"]:
-        return Qwen3_5ProcessingStrategy(
-            **processing_kwargs,
-        )
    if chat_template_type == "gemma3":
        return Gemma3ProcessingStrategy(
            **processing_kwargs,
--- a/src/axolotl/telemetry/manager.py
+++ b/src/axolotl/telemetry/manager.py
@@ -156,10 +156,6 @@ class TelemetryManager:
        Returns:
            Boolean denoting whether telemetry is enabled or not.
        """
-        # Only rank 0 will send telemetry
-        if not is_main_process():
-            return False
-
        # Parse relevant env vars
        axolotl_do_not_track = os.getenv("AXOLOTL_DO_NOT_TRACK")
        do_not_track = os.getenv("DO_NOT_TRACK")
@@ -173,6 +169,10 @@ class TelemetryManager:
        ):
            return True

+        # Only rank 0 will send telemetry
+        if not is_main_process():
+            return False
+
        if do_not_track is None:
            do_not_track = "0"

--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -86,21 +86,9 @@ def setup_model_and_tokenizer(
    if model.generation_config is not None:
        model.generation_config.do_sample = True

-    model_properties = model.config.to_dict()
-    try:
-        model_properties["num_parameters"] = model.num_parameters()
-    except Exception:  # pylint: disable=broad-exception-caught
-        model_properties["num_parameters"] = sum(p.numel() for p in model.parameters())
-    # if the num_parameters is less than 2B, let's round to nearest 100M, else round to nearest 1B
-    if model_properties["num_parameters"] < 2e9:
-        model_properties["num_parameters_est"] = (
-            f"{round(model_properties['num_parameters'] / 1e8) * 100}M"
-        )
-    else:
-        model_properties["num_parameters_est"] = (
-            f"{round(model_properties['num_parameters'] / 1e9)}B"
-        )
-    TELEMETRY_MANAGER.send_event(event_type="model-load", properties=model_properties)
+    TELEMETRY_MANAGER.send_event(
+        event_type="model-load", properties=model.config.to_dict()
+    )
    if peft_config:
        TELEMETRY_MANAGER.send_event(
            event_type="peft-config-load", properties=peft_config.to_dict()
@@ -257,7 +245,7 @@ def save_trained_model(
    # Handle ReLoRA early return case
    if cfg.relora:
        if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit):
-            model = model.merge_and_unload(safe_merge=True)
+            model = model.merge_and_unload()
        else:
            # final model weights have already been saved by `ReLoRACallback.on_train_end`
            return
--- a/src/axolotl/utils/callbacks/generation.py
+++ b/src/axolotl/utils/callbacks/generation.py
@@ -1,84 +0,0 @@
-"""Callback for generating samples during SFT/Pretrain training."""
-
-from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState
-from transformers.training_args import TrainingArguments
-
-from axolotl.utils.generation.sft import generate_samples
-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
-
-
-class SFTGenerationCallback(TrainerCallback):
-    """Callback for generating samples during SFT/Pretrain training."""
-
-    def __init__(self, trainer):
-        self.trainer = trainer
-
-    def on_evaluate(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        """Generate samples at specified intervals."""
-        cfg = self.trainer.axolotl_cfg
-
-        if not getattr(cfg, "generate_samples", False):
-            return
-
-            dataloader = None
-            try:
-                if getattr(self.trainer, "eval_dataset", None) is not None:
-                    dataloader = self.trainer.get_eval_dataloader()
-                    LOG.info(
-                        f"Using eval dataloader for generation at step {state.global_step}"
-                    )
-            except Exception as e:
-                LOG.warning(f"Could not get eval dataloader: {e}")
-                dataloader = None
-
-            if dataloader is None:
-                dataloader = self.trainer.get_train_dataloader()
-                LOG.info(
-                    f"Using train dataloader for generation at step {state.global_step}"
-                )
-
-            samples = generate_samples(
-                model=self.trainer.model,
-                tokenizer=self.trainer.processing_class,
-                dataloader=dataloader,
-                num_generation_samples=getattr(cfg, "num_generation_samples", 3),
-                max_new_tokens=getattr(cfg, "generation_max_new_tokens", 50),
-                temperature=getattr(cfg, "generation_temperature", 0.7),
-                top_p=getattr(cfg, "generation_top_p", None),
-                top_k=getattr(cfg, "generation_top_k", None),
-                do_sample=getattr(cfg, "generation_do_sample", True),
-                prompt_ratio=getattr(cfg, "generation_prompt_ratio", 0.5),
-            )
-            self._log_samples(samples, state.global_step)
-
-    def _log_samples(self, samples: list, step: int):
-        """Log generated samples to console and W&B."""
-        from axolotl.utils.generation.sft import format_generation_for_logging
-
-        for i, sample in enumerate(samples):
-            console_text, wandb_text = format_generation_for_logging(sample, i, step)
-
-            LOG.info(console_text)
-
-            try:
-                import wandb
-
-                if wandb.run is not None:
-                    wandb.log(
-                        {
-                            f"samples/sample_{i + 1}": wandb.Html(
-                                f"<pre>{wandb_text}</pre>"
-                            )
-                        },
-                        step=step,
-                    )
-            except (ImportError, Exception):
-                pass
--- a/src/axolotl/utils/chat_templates/templates/qwen3_5.jinja
+++ b/src/axolotl/utils/chat_templates/templates/qwen3_5.jinja
@@ -1,123 +0,0 @@
-{%- if tools %}
-    {{- '<|im_start|>system\n' }}
-    {%- if messages[0].role == 'system' %}
-        {{- messages[0].content + '\n\n' }}
-    {%- endif %}
-    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
-    {%- for tool in tools %}
-        {{- "\n" }}
-        {{- tool | tojson }}
-    {%- endfor %}
-    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
-{%- else %}
-    {%- if messages[0].role == 'system' %}
-        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
-    {%- endif %}
-{%- endif %}
-{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
-{#- Determine the real last index: use provided value or default to messages length - 1 #}
-{%- if real_last_index is defined and real_last_index is not none %}
-    {%- set ns.real_last_index = real_last_index %}
-{%- else %}
-    {%- set ns.real_last_index = messages|length - 1 %}
-{%- endif %}
-{%- for message in messages[::-1] %}
-    {%- set index = (messages|length - 1) - loop.index0 %}
-    {%- if message['content'] is string %}
-        {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
-            {%- set ns.multi_step_tool = false %}
-            {%- set ns.last_query_index = index %}
-        {%- endif %}
-    {%- else %}
-        {%- if ns.multi_step_tool and message.role == "user" %}
-            {%- set ns.multi_step_tool = false %}
-            {%- set ns.last_query_index = index %}
-        {%- endif %}
-    {%- endif %}
-{%- endfor %}
-{%- for message in messages %}
-    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
-        {{- '<|im_start|>' + message.role + '\n' }}
-        {%- if message['content'] is string %}
-            {{- message.content }}
-        {%- else %}
-            {%- for content in message['content'] %}
-                {%- if content['type'] == 'image' or 'image' in content or 'image_url' in content %}
-                    {{- '<|vision_start|><|image_pad|><|vision_end|>' }}
-                {%- elif content['type'] == 'video' or 'video' in content %}
-                    {{- '<|vision_start|><|video_pad|><|vision_end|>' }}
-                {%- elif 'text' in content %}
-                    {{- content['text'] }}
-                {%- endif %}
-            {%- endfor %}
-        {%- endif %}
-        {{- '<|im_end|>\n' }}
-    {%- elif message.role == "assistant" %}
-        {%- if message['content'] is string %}
-            {%- set content = message.content %}
-        {%- else %}
-            {%- set content = '' %}
-            {%- for item in message['content'] %}
-                {%- if 'text' in item %}
-                    {%- set content = content + item['text'] %}
-                {%- endif %}
-            {%- endfor %}
-        {%- endif %}
-        {%- set reasoning_content = '' %}
-        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
-            {%- set reasoning_content = message.reasoning_content %}
-        {%- else %}
-            {%- if '</think>' in content %}
-                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
-                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
-            {%- endif %}
-        {%- endif %}
-        {%- if loop.index0 > ns.last_query_index %}
-            {%- if loop.index0 == ns.real_last_index or (loop.index0 != ns.real_last_index and reasoning_content) %}
-                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
-            {%- else %}
-                {{- '<|im_start|>' + message.role + '\n' + content }}
-            {%- endif %}
-        {%- else %}
-            {{- '<|im_start|>' + message.role + '\n' + content }}
-        {%- endif %}
-        {%- if message.tool_calls %}
-            {%- for tool_call in message.tool_calls %}
-                {%- if (loop.first and content) or (not loop.first) %}
-                    {{- '\n' }}
-                {%- endif %}
-                {%- if tool_call.function %}
-                    {%- set tool_call = tool_call.function %}
-                {%- endif %}
-                {{- '<tool_call>\n{"name": "' }}
-                {{- tool_call.name }}
-                {{- '", "arguments": ' }}
-                {%- if tool_call.arguments is string %}
-                    {{- tool_call.arguments }}
-                {%- else %}
-                    {{- tool_call.arguments | tojson }}
-                {%- endif %}
-                {{- '}\n</tool_call>' }}
-            {%- endfor %}
-        {%- endif %}
-        {{- '<|im_end|>\n' }}
-    {%- elif message.role == "tool" %}
-        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
-            {{- '<|im_start|>user' }}
-        {%- endif %}
-        {{- '\n<tool_response>\n' }}
-        {{- message.content }}
-        {{- '\n</tool_response>' }}
-        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
-            {{- '<|im_end|>\n' }}
-        {%- endif %}
-    {%- endif %}
-{%- endfor %}
-{%- if add_generation_prompt %}
-    {{- '<|im_start|>assistant\n' }}
-    {%- if enable_thinking is defined and enable_thinking is false %}
-        {{- '<think>\n\n</think>\n\n' }}
-    {%- else %}
-        {{- '<think>\n\n' }}
-    {%- endif %}
-{%- endif %}
--- a/src/axolotl/utils/config/init.py
+++ b/src/axolotl/utils/config/init.py
@@ -6,10 +6,7 @@ from typing import Optional

 import torch
 from transformers.utils import is_torch_bf16_gpu_available
-from transformers.utils.import_utils import (
-    is_torch_greater_or_equal,
-    is_torch_npu_available,
-)
+from transformers.utils.import_utils import is_torch_npu_available

 from axolotl.integrations.base import PluginManager
 from axolotl.integrations.config import merge_input_args
@@ -84,15 +81,8 @@ def resolve_dtype(cfg):
            cfg.fp16 = True
        cfg.bf16 = False
    else:
-        if cfg.tf32:
-            torch.set_float32_matmul_precision("high")
-            if is_torch_greater_or_equal("2.9.0"):
-                torch.backends.fp32_precision = "tf32"
-                torch.backends.cuda.matmul.fp32_precision = "tf32"
-                torch.backends.cudnn.fp32_precision = "tf32"
-            else:
-                torch.backends.cuda.matmul.allow_tf32 = True
-                torch.backends.cudnn.allow_tf32 = True
+        torch.backends.cuda.matmul.allow_tf32 = cfg.tf32 or False
+        torch.backends.cudnn.allow_tf32 = cfg.tf32 or False
        if cfg.bf16:
            cfg.fp16 = False

@@ -129,12 +119,7 @@ def normalize_config(cfg):
    if cfg.world_size != 1:
        cfg.device_map = {"": int(os.environ.get("LOCAL_RANK", 0))}
        if cfg.fsdp or cfg.fsdp_config or cfg.ddp:
-            effective_world_size = (
-                cfg.world_size
-                // (cfg.context_parallel_size or 1)
-                // (cfg.tensor_parallel_size or 1)
-            )
-            cfg.batch_size = cfg.batch_size * effective_world_size
+            cfg.batch_size = cfg.batch_size * cfg.world_size

    if not cfg.use_ray:
        # delay resolving dtype until on worker node when launching with ray
--- a/src/axolotl/utils/data/lock.py
+++ b/src/axolotl/utils/data/lock.py
@@ -54,19 +54,15 @@ class FileLockLoader:

    def cleanup(self):
        """Clean up ready flag when last process is done."""
-        try:
-            with FileLock(str(self.lock_file_path)):
-                counter_content = self.counter_path.read_text().strip()
-                count = int(counter_content) if counter_content else 0
-                count -= 1
+        with FileLock(str(self.lock_file_path)):
+            counter_content = self.counter_path.read_text().strip()
+            count = int(counter_content) if counter_content else 0
+            count -= 1

-                if count <= 0:
-                    # Last process cleans everything up
-                    self.ready_flag_path.unlink(missing_ok=True)
-                    self.counter_path.unlink(missing_ok=True)
-                else:
-                    # Still have active processes
-                    self.counter_path.write_text(str(count))
-        except FileNotFoundError:
-            # Lock file might have already been deleted by another process
-            pass
+            if count <= 0:
+                # Last process cleans everything up
+                self.ready_flag_path.unlink(missing_ok=True)
+                self.counter_path.unlink(missing_ok=True)
+            else:
+                # Still have active processes
+                self.counter_path.write_text(str(count))
--- a/src/axolotl/utils/data/rl.py
+++ b/src/axolotl/utils/data/rl.py
@@ -246,10 +246,6 @@ def _load_split(cfg: DictDefault, split: Literal["train", "test"]) -> Dataset:
    dataset = merge_datasets(split_datasets, cfg)

    if not cfg.skip_prepare_dataset:
-        # Deduplicate before saving so the saved dataset is already de-duplicated
-        if cfg.dataset_exact_deduplication:
-            dataset, _ = deduplicate_and_log_datasets(dataset=dataset)
-
        # Save preprocessed dataset
        dataset_hash = generate_dataset_hash_from_config(
            cfg, datasets_configs, tokenizer.name_or_path
--- a/src/axolotl/utils/data/sft.py
+++ b/src/axolotl/utils/data/sft.py
@@ -351,10 +351,6 @@ def _load_raw_datasets(
        if cfg.sample_packing:
            dataset, _ = process_datasets_for_packing(cfg, dataset, None)

-        # Deduplicate before saving so the saved dataset is already de-duplicated
-        if cfg.dataset_exact_deduplication:
-            dataset, _ = deduplicate_and_log_datasets(dataset=dataset)
-
        # Save the prepared dataset
        dataset_hash = generate_dataset_hash_from_config(
            cfg, datasets_configs, tokenizer.name_or_path
@@ -442,8 +438,25 @@ def _handle_train_dataset_split(
        )
        return train_dataset, eval_dataset

-    # No validation split - deduplication already applied during preprocessing
-    return dataset, None
+    # No validation split - apply deduplication if needed and return as train dataset
+    if cfg.dataset_exact_deduplication:
+        train_dataset, _ = deduplicate_and_log_datasets(dataset=dataset)
+    else:
+        train_dataset = dataset
+
+    return train_dataset, None
+
+
+def _handle_test_dataset_split(
+    dataset: Dataset, cfg: DictDefault
+) -> tuple[None, Dataset | None]:
+    """Handle processing for test split."""
+    if cfg.dataset_exact_deduplication:
+        eval_dataset, _ = deduplicate_and_log_datasets(dataset=dataset)
+    else:
+        eval_dataset = dataset
+
+    return None, eval_dataset


 def _apply_dataset_sharding(dataset: Dataset, cfg: DictDefault) -> Dataset:
@@ -502,7 +515,6 @@ def _load_and_prepare_datasets(
    if split == "train":
        train_dataset, eval_dataset = _handle_train_dataset_split(dataset, cfg)
    else:
-        # Deduplication already applied during preprocessing
-        train_dataset, eval_dataset = None, dataset
+        train_dataset, eval_dataset = _handle_test_dataset_split(dataset, cfg)

    return train_dataset, eval_dataset, prompters
--- a/src/axolotl/utils/data/shared.py
+++ b/src/axolotl/utils/data/shared.py
@@ -189,7 +189,7 @@ def _get_remote_filesystem(
        try:
            import gcsfs

-            storage_options = {"token": None}  # type: ignore  # nosec B105
+            storage_options = {"token": None}  # type: ignore
            return gcsfs.GCSFileSystem(**storage_options), storage_options
        except ImportError as exc:
            raise ImportError(
@@ -520,8 +520,7 @@ def generate_dataset_hash_from_config(
    """
    config_str = (
        f"{cfg.sequence_len}@{cfg.sample_packing}@{cfg.eval_sample_packing}@"
-        f"{cfg.group_by_length}@{cfg.kd_temperature or 1.0}@"
-        f"{cfg.dataset_exact_deduplication or False}|"
+        f"{cfg.group_by_length}@{cfg.kd_temperature or 1.0}|"
        f"{'|'.join(sorted([f'{d.path}:{d.type}:{d.shards}:{d.conversation}:{d.split}:{d.temperature or 1.0}' for d in cfg_datasets]))}"
        f"|{tokenizer_name}"
    )
--- a/src/axolotl/utils/data/utils.py
+++ b/src/axolotl/utils/data/utils.py
@@ -15,7 +15,7 @@ from datasets import Dataset, IterableDataset
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.logging import get_logger
 from axolotl.utils.samplers.utils import get_dataset_lengths
-from axolotl.utils.trainer import filter_sequences_by_length
+from axolotl.utils.trainer import drop_long_seq

 LOG = get_logger(__name__)

@@ -148,33 +148,22 @@ def deduplicate_and_log_datasets(
    return dataset, other_dataset


-def keep_min_len(sample, min_sequence_len=2):
+def truncate_long_seq(sample, sequence_len=2048, min_sequence_len=2):
    """
-    Batched filter function that keeps only samples with sequence length >= min_sequence_len.
-    Returns a list of booleans indicating which samples to keep.
+    Truncate samples whose sequence length is too long (> sequence_len)
+    or drop those too short (< min_sequence_len).
    """
    min_sequence_len = min_sequence_len or 2

    input_ids = sample["input_ids"]
-
-    # Batched (input_ids is a list of lists)
    results = []
-    for seq in input_ids:
-        results.append(len(seq) >= min_sequence_len)
-    return results
-
-
-def truncate_long_seq(sample, sequence_len=2048):
-    """
-    Truncate samples whose sequence length is too long (> sequence_len).
-    Modifies the sample in-place and returns the modified sample.
-    """
-    input_ids = sample["input_ids"]

    # Batched (input_ids is a list of lists)
    for i, seq in enumerate(input_ids):
        length = len(seq)
-        if length > sequence_len:
+        if length < min_sequence_len:
+            results.append(False)
+        elif length > sequence_len:
            sample["input_ids"][i] = seq[:sequence_len]
            if "attention_mask" in sample:
                sample["attention_mask"][i] = sample["attention_mask"][i][:sequence_len]
@@ -182,133 +171,10 @@ def truncate_long_seq(sample, sequence_len=2048):
                sample["labels"][i] = sample["labels"][i][:sequence_len]
            if "position_ids" in sample:
                sample["position_ids"][i] = sample["position_ids"][i][:sequence_len]
-    return sample
-
-
-def _should_skip_processing(dataset: Dataset) -> bool:
-    """Check if dataset should skip long sequence handling."""
-    if (
-        hasattr(dataset, "column_names")
-        and dataset.column_names
-        and "input_ids" not in dataset.column_names
-    ):
-        LOG.warning(
-            "Dataset does not contain 'input_ids' column. Skip drop long seq. This is "
-            "expected for reward modeling."
-        )
-        return True
-    elif not hasattr(dataset, "column_names") or dataset.column_names is None:
-        LOG.info(
-            "Dataset is streaming (IterableDataset), skipping long sequence handling"
-        )
-        return True
-    return False
-
-
-def _log_dataset_stats(dataset: Dataset) -> None:
-    """Log min/max sequence lengths for debugging."""
-    with contextlib.suppress(AttributeError, ValueError):
-        ds_lengths = get_dataset_lengths(dataset, from_arrow=True)
-        LOG.info(f"min_input_len: {np.min(ds_lengths)}")
-        LOG.info(f"max_input_len: {np.max(ds_lengths)}")
-
-
-def _build_filter_kwargs(dataset: Dataset, cfg: DictDefault) -> dict:
-    """Build kwargs for dataset filter/map operations."""
-    kwargs = {}
-    if not isinstance(dataset, IterableDataset):
-        kwargs["num_proc"] = cfg.dataset_num_proc
-        kwargs["load_from_cache_file"] = not cfg.is_preprocess
-    return kwargs
-
-
-def _filter_short_sequences(
-    dataset: Dataset, min_len: int, filter_kwargs: dict
-) -> tuple[Dataset, int]:
-    """Filter out sequences shorter than min_len. Returns (dataset, num_dropped)."""
-    prior_len = len(dataset) if hasattr(dataset, "__len__") else None
-
-    desc_kwargs = {}
-    if filter_kwargs:
-        desc_kwargs["desc"] = f"Filtering Short Sequences (<{min_len})"
-
-    dataset = dataset.filter(
-        functools.partial(keep_min_len, min_sequence_len=min_len),
-        batched=True,
-        **filter_kwargs,
-        **desc_kwargs,
-    )
-
-    dropped = 0
-    if prior_len:
-        dropped = prior_len - len(dataset)
-        if dropped > 0:
-            LOG.info(f"Dropped {dropped} short sequences (<{min_len} tokens)")
-
-    return dataset, dropped
-
-
-def _truncate_long_sequences(
-    dataset: Dataset, max_len: int, map_kwargs: dict
-) -> Dataset:
-    """Truncate sequences longer than max_len."""
-    desc_kwargs = {}
-    if map_kwargs:
-        desc_kwargs["desc"] = f"Truncating Sequences (target_len={max_len})"
-
-    dataset = dataset.map(
-        functools.partial(truncate_long_seq, sequence_len=max_len),
-        batched=True,
-        **map_kwargs,
-        **desc_kwargs,
-    )
-    LOG.info(f"Truncated long sequences to max length {max_len}")
-    return dataset
-
-
-def _drop_outside_range(
-    dataset: Dataset,
-    max_len: int,
-    min_len: int,
-    raise_on_long: bool,
-    filter_kwargs: dict,
-) -> tuple[Dataset, int]:
-    """Drop sequences outside valid length range [min_len, max_len].
-
-    Returns (dataset, num_dropped)."""
-    prior_len = len(dataset) if hasattr(dataset, "__len__") else None
-
-    desc_kwargs = {}
-    if filter_kwargs:
-        action = (
-            "Checking Sequence Lengths"
-            if raise_on_long
-            else "Dropping Invalid Sequences"
-        )
-        desc_kwargs["desc"] = f"{action} (<{min_len} or >{max_len})"
-
-    dataset = dataset.filter(
-        functools.partial(
-            filter_sequences_by_length,
-            sequence_len=max_len,
-            min_sequence_len=min_len,
-            raise_on_drop=raise_on_long,
-        ),
-        batched=True,
-        **filter_kwargs,
-        **desc_kwargs,
-    )
-
-    dropped = 0
-    if not raise_on_long and prior_len:
-        dropped = prior_len - len(dataset)
-        if dropped > 0:
-            LOG.info(
-                f"Dropped {dropped} sequences outside valid range "
-                f"([{min_len}, {max_len}])"
-            )
-
-    return dataset, dropped
+            results.append(True)
+        else:
+            results.append(True)
+    return results


 def handle_long_seq_in_dataset(
@@ -327,25 +193,80 @@ def handle_long_seq_in_dataset(
            'truncate'          truncates them down to sequence_len
            'raise'             raises a ValueError if any sequence was found that was longer than sequence_len
    """
-    # Early returns for special cases
-    if _should_skip_processing(dataset):
+    if (
+        hasattr(dataset, "column_names")
+        and dataset.column_names
+        and "input_ids" not in dataset.column_names
+    ):
+        LOG.warning(
+            "Dataset does not contain 'input_ids' column. Skip drop long seq. This is "
+            "expected for reward modeling."
+        )
+        return dataset
+    elif not hasattr(dataset, "column_names") or dataset.column_names is None:
+        LOG.info(
+            "Dataset is streaming (IterableDataset), skipping long sequence handling"
+        )
        return dataset

    excess_length_strategy = (cfg.excess_length_strategy or "drop").lower()

-    _log_dataset_stats(dataset)
+    drop_long = functools.partial(
+        drop_long_seq,
+        sequence_len=sequence_len,
+        min_sequence_len=cfg.min_sample_len,
+        raise_on_drop=excess_length_strategy == "raise",
+    )

-    # Setup kwargs
-    filter_kwargs = _build_filter_kwargs(dataset, cfg)
+    with contextlib.suppress(AttributeError):
+        ds_lengths = get_dataset_lengths(dataset, from_arrow=True)
+        min_input_len = np.min(ds_lengths)
+        LOG.info(f"min_input_len: {min_input_len}")
+        max_input_len = np.max(ds_lengths)
+        LOG.info(f"max_input_len: {max_input_len}")

-    # Handle sequences based on strategy
-    if excess_length_strategy == "truncate":
-        dataset, _ = _filter_short_sequences(dataset, cfg.min_sample_len, filter_kwargs)
-        dataset = _truncate_long_sequences(dataset, sequence_len, filter_kwargs)
-    else:
-        raise_on_long = excess_length_strategy == "raise"
-        dataset, _ = _drop_outside_range(
-            dataset, sequence_len, cfg.min_sample_len, raise_on_long, filter_kwargs
+    prior_len = len(dataset) if hasattr(dataset, "__len__") else None
+
+    filter_map_kwargs = {}
+    if not isinstance(dataset, IterableDataset):
+        filter_map_kwargs["num_proc"] = cfg.dataset_num_proc
+        filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess
+
+    drop_long_kwargs = {}
+    if filter_map_kwargs:
+        action = (
+            "Checking Sequence Lengths"
+            if excess_length_strategy == "raise"
+            else "Dropping Long Sequences"
        )
+        drop_long_kwargs["desc"] = f"{action} (>{sequence_len})"
+
+    if excess_length_strategy == "truncate":
+        process_fn = functools.partial(
+            truncate_long_seq,
+            sequence_len=sequence_len,
+            min_sequence_len=cfg.min_sample_len,
+        )
+        drop_long_kwargs["desc"] = (
+            f"Truncating/Filtering Sequences (target_len={sequence_len})"
+        )
+    else:
+        process_fn = drop_long
+
+    dataset = dataset.filter(
+        process_fn,
+        batched=True,
+        **filter_map_kwargs,
+        **drop_long_kwargs,
+    )
+    if prior_len:
+        dropped = prior_len - len(dataset)
+        if dropped:
+            action = (
+                "truncated/filtered"
+                if excess_length_strategy == "truncate"
+                else "dropped"
+            )
+            LOG.warning(f"{action.title()} {dropped} samples from dataset")

    return dataset
--- a/src/axolotl/utils/generation/init.py
+++ b/src/axolotl/utils/generation/init.py
@@ -1,5 +0,0 @@
-"""Generation utilities for monitoring during training."""
-
-from .sft import format_generation_for_logging, generate_samples
-
-__all__ = ["generate_samples", "format_generation_for_logging"]
--- a/src/axolotl/utils/generation/sft.py
+++ b/src/axolotl/utils/generation/sft.py
@@ -1,174 +0,0 @@
-"""Sample generation utilities for SFT/Pretrain training."""
-
-from typing import Any, List, Optional
-
-import torch
-from accelerate.utils import extract_model_from_parallel
-from colorama import Fore, Style
-
-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
-
-
-def generate_samples(
-    model: torch.nn.Module,
-    tokenizer: Any,
-    dataloader: Any,
-    num_generation_samples: int = 3,
-    max_new_tokens: int = 50,
-    temperature: float = 0.7,
-    top_p: Optional[float] = None,
-    top_k: Optional[int] = None,
-    do_sample: bool = True,
-    prompt_ratio: float = 0.5,
-) -> List[dict]:
-    """
-    Generate samples from the model during training for monitoring.
-
-    Args:
-        model: The model to generate from
-        tokenizer: The tokenizer to use for encoding/decoding
-        dataloader: Dataloader to sample prompts from
-        num_generation_samples: Number of samples to generate
-        max_new_tokens: Maximum new tokens to generate
-        temperature: Sampling temperature (0.0 = greedy)
-        top_p: Nucleus sampling parameter
-        top_k: Top-k sampling parameter
-        do_sample: Whether to use sampling vs greedy decoding
-        prompt_ratio: Ratio of sequence to use as prompt (0.0-1.0)
-
-    Returns:
-        List of dicts with 'prompt', 'generated', and 'full_text' keys
-    """
-    unwrapped_model = extract_model_from_parallel(model)
-
-    training = unwrapped_model.training
-    unwrapped_model.eval()
-
-    device = next(unwrapped_model.parameters()).device
-
-    generations = []
-
-    try:
-        with torch.no_grad():
-            samples_collected = 0
-
-            for batch in dataloader:
-                if samples_collected >= num_generation_samples:
-                    break
-
-                input_ids = batch["input_ids"].to(device)
-                attention_mask = batch.get("attention_mask")
-                if attention_mask is not None:
-                    attention_mask = attention_mask.to(device)
-                batch_size = input_ids.shape[0]
-
-                indices = torch.randperm(batch_size)[
-                    : num_generation_samples - samples_collected
-                ]
-
-                for idx in indices:
-                    if samples_collected >= num_generation_samples:
-                        break
-
-                    sequence = input_ids[idx]
-
-                    if attention_mask is not None:
-                        seq_len = attention_mask[idx].sum().item()
-                    else:
-                        seq_len = sequence.shape[0]
-
-                    if seq_len < 5:
-                        continue
-
-                    prompt_len = max(1, int(seq_len * prompt_ratio))
-                    prompt_ids = sequence[:prompt_len].unsqueeze(0)
-
-                    try:
-                        generation_config = {
-                            "max_new_tokens": max_new_tokens,
-                            "do_sample": do_sample,
-                            "pad_token_id": tokenizer.pad_token_id
-                            if tokenizer.pad_token_id is not None
-                            else tokenizer.eos_token_id,
-                        }
-
-                        if do_sample:
-                            generation_config["temperature"] = temperature
-                            if top_p is not None:
-                                generation_config["top_p"] = top_p
-                            if top_k is not None:
-                                generation_config["top_k"] = top_k
-
-                        generated_ids = unwrapped_model.generate(
-                            prompt_ids, **generation_config
-                        )
-
-                        prompt_text = tokenizer.decode(
-                            prompt_ids[0], skip_special_tokens=True
-                        )
-                        generated_text = tokenizer.decode(
-                            generated_ids[0][prompt_len:], skip_special_tokens=True
-                        )
-                        full_text = tokenizer.decode(
-                            generated_ids[0], skip_special_tokens=True
-                        )
-
-                        generations.append(
-                            {
-                                "prompt": prompt_text,
-                                "generated": generated_text,
-                                "full_text": full_text,
-                            }
-                        )
-
-                        samples_collected += 1
-
-                    except Exception as e:
-                        LOG.warning(f"Failed to generate sample: {e}", exc_info=True)
-                        continue
-
-    except Exception as e:
-        LOG.warning(f"Error during sample generation: {e}", exc_info=True)
-
-    if training:
-        unwrapped_model.train()
-    else:
-        unwrapped_model.eval()
-
-    return generations
-
-
-def format_generation_for_logging(
-    sample: dict, sample_idx: int, step: int
-) -> tuple[str, str]:
-    """
-    Format a generation sample for pretty logging.
-
-    Args:
-        sample: Dict with 'prompt', 'generated', and 'full_text' keys
-        sample_idx: Index of the sample
-        step: Current training step
-
-    Returns:
-        Tuple of (console_text, wandb_text)
-    """
-    console_text = (
-        f"\n{Style.BRIGHT}{Fore.CYAN}{'=' * 80}{Style.RESET_ALL}\n"
-        f"{Style.BRIGHT}{Fore.GREEN}Sample {sample_idx + 1} (Step {step}){Style.RESET_ALL}\n"
-        f"{Style.BRIGHT}{Fore.CYAN}{'=' * 80}{Style.RESET_ALL}\n"
-        f"{Style.BRIGHT}{Fore.YELLOW}[PROMPT]{Style.RESET_ALL}\n{sample['prompt']}\n\n"
-        f"{Style.BRIGHT}{Fore.MAGENTA}[GENERATED]{Style.RESET_ALL}\n{sample['generated']}\n"
-        f"{Style.BRIGHT}{Fore.CYAN}{'=' * 80}{Style.RESET_ALL}\n"
-    )
-    wandb_text = (
-        f"\n{'=' * 80}\n"
-        f"Sample {sample_idx + 1} (Step {step})\n"
-        f"{'=' * 80}\n"
-        f"[PROMPT]\n{sample['prompt']}\n\n"
-        f"[GENERATED]\n{sample['generated']}\n"
-        f"{'=' * 80}\n"
-    )
-
-    return console_text, wandb_text
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Wing Lian	3b5a9d1d88	update create_optimizer for updated api	2026-02-19 23:49:32 -05:00
Wing Lian	eb59070040	fix labels	2026-02-19 23:44:46 -05:00
Wing Lian	9722aaf7d8	fix for tokenizers change	2026-02-19 21:52:44 -05:00
Wing Lian	c5d20bbd79	integration branch for transformers#44041	2026-02-19 18:34:13 -05:00