Compare commits — 64 commits (feat/glm45...transforme)

| Author | SHA1 | Date |
|---|---|---|
|  | 3b5a9d1d88 |  |
|  | eb59070040 |  |
|  | 9722aaf7d8 |  |
|  | c5d20bbd79 |  |
|  | 7fbedbd300 |  |
|  | 145ffc9be1 |  |
|  | 4f1b5ad29f |  |
|  | d6a2532dd7 |  |
|  | 5eb265513c |  |
|  | 06ac407b92 |  |
|  | 4e22cf0651 |  |
|  | a4ee56c315 |  |
|  | c67cbcb0f5 |  |
|  | a2da852576 |  |
|  | 37e9da7a53 |  |
|  | ed7105dba7 |  |
|  | b6d3653f74 |  |
|  | fcc4cfdb63 |  |
|  | 97a4f28511 |  |
|  | 86a5803212 |  |
|  | 530a0c0bf0 |  |
|  | 0343a72cc9 |  |
|  | 236dad3bb7 |  |
|  | be00978bc2 |  |
|  | 3738978394 |  |
|  | 6132a30cda |  |
|  | 3dd86d35b8 |  |
|  | dd9ebaeba1 |  |
|  | fc4e37920b |  |
|  | a531e9d946 |  |
|  | 04328aeb97 |  |
|  | d0d26d5064 |  |
|  | 8623dd8a72 |  |
|  | 8cd75cff9f |  |
|  | 8ab9d9ea88 |  |
|  | 6e42def14b |  |
|  | c413480b35 |  |
|  | 8f25124269 |  |
|  | 790df757cb |  |
|  | d282f32481 |  |
|  | 6331e4a130 |  |
|  | 1410e4474e |  |
|  | dc77b5bf42 |  |
|  | 359b7ad85e |  |
|  | 258ce8d4fa |  |
|  | 3e0bbd33ec |  |
|  | 4ae6f766ad |  |
|  | e7f0d4ba5b |  |
|  | 7bf6f70e96 |  |
|  | 8aab807e67 |  |
|  | ee59e4de97 |  |
|  | 4e61b8aa23 |  |
|  | b26ba3a5cb |  |
|  | afe18ace35 |  |
|  | 2b199f9915 |  |
|  | e73dab6df9 |  |
|  | f45a97a9ff |  |
|  | 11c0b5b256 |  |
|  | 66a3de3629 |  |
|  | a6080df73c |  |
|  | 4f5e8a328a |  |
|  | 418933f0d1 |  |
|  | 372f664c63 |  |
|  | 97f1b1758d |  |
**.github/PULL_REQUEST_TEMPLATE.md** (5 changes, vendored)

```diff
@@ -15,6 +15,11 @@
 <!--- Include details of your testing environment, tests ran to see how -->
 <!--- your change affects other areas of the code, etc. -->
 
+## AI Usage Disclaimer
+
+<!--- Was AI (e.g., ChatGPT, Claude, Copilot) used to generate or assist with this PR? -->
+<!--- Please indicate: No / Yes (specify which tool and to what extent) -->
+
 ## Screenshots (if appropriate)
 
 ## Types of changes
```
**.github/workflows/base.yml** (97 changes, vendored)

```diff
@@ -21,31 +21,12 @@ jobs:
     timeout-minutes: 480
     # this job needs to be run on self-hosted GPU runners...
     runs-on: ubuntu-latest-m
+    env:
+      HAS_DOCKERHUB_CREDS: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_TOKEN != '' }}
     strategy:
       fail-fast: false
       matrix:
         include:
-          - cuda: "126"
-            cuda_version: 12.6.3
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.7.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
-          - cuda: "126"
-            cuda_version: 12.6.3
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.7.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.7.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
           - cuda: "128"
             cuda_version: 12.8.1
             cudnn_version: ""
@@ -53,6 +34,15 @@ jobs:
             pytorch: 2.8.0
             torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
             dockerfile: "Dockerfile-base"
+            platforms: "linux/amd64"
+          - cuda: "128"
+            cuda_version: 12.8.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.9.0
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base"
+            platforms: "linux/amd64,linux/arm64"
           - cuda: "128"
             cuda_version: 12.8.1
             cudnn_version: ""
@@ -60,6 +50,15 @@ jobs:
             pytorch: 2.9.1
             torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
             dockerfile: "Dockerfile-base"
+            platforms: "linux/amd64,linux/arm64"
+          - cuda: "129"
+            cuda_version: 12.9.1
+            cudnn_version: ""
+            python_version: "3.12"
+            pytorch: 2.9.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base"
+            platforms: "linux/amd64,linux/arm64"
           - cuda: "130"
             cuda_version: 13.0.0
             cudnn_version: ""
@@ -67,6 +66,15 @@ jobs:
             pytorch: 2.9.1
             torch_cuda_arch_list: "9.0+PTX"
             dockerfile: "Dockerfile-base"
+            platforms: "linux/amd64,linux/arm64"
+          - cuda: "130"
+            cuda_version: 13.0.0
+            cudnn_version: ""
+            python_version: "3.12"
+            pytorch: 2.9.1
+            torch_cuda_arch_list: "9.0+PTX"
+            dockerfile: "Dockerfile-base"
+            platforms: "linux/amd64,linux/arm64"
           # - cuda: "128"
           #   cuda_version: 12.8.1
           #   cudnn_version: ""
@@ -93,6 +101,7 @@ jobs:
             axolotlai/axolotl-base
       - name: Login to Docker Hub
         uses: docker/login-action@v2
+        if: ${{ github.event_name != 'pull_request' && env.HAS_DOCKERHUB_CREDS == 'true' }}
         with:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
@@ -103,6 +112,7 @@ jobs:
         with:
           context: .
           file: ./docker/${{ matrix.dockerfile }}
+          platforms: ${{ matrix.platforms }}
           push: ${{ github.event_name != 'pull_request' }}
           tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
           labels: ${{ steps.metadata.outputs.labels }}
@@ -117,24 +127,12 @@ jobs:
     if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
     timeout-minutes: 480
     runs-on: ubuntu-latest-m
+    env:
+      HAS_DOCKERHUB_CREDS: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_TOKEN != '' }}
     strategy:
       fail-fast: false
       matrix:
         include:
-          - cuda: "126"
-            cuda_version: 12.6.3
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.7.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.7.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
           - cuda: "128"
             cuda_version: 12.8.1
             cudnn_version: ""
@@ -142,6 +140,7 @@ jobs:
             pytorch: 2.8.0
             torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
             dockerfile: "Dockerfile-uv-base"
+            platforms: "linux/amd64"
           - cuda: "128"
             cuda_version: 12.8.1
             cudnn_version: ""
@@ -149,6 +148,23 @@ jobs:
             pytorch: 2.9.1
             torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
             dockerfile: "Dockerfile-uv-base"
+            platforms: "linux/amd64,linux/arm64"
+          - cuda: "128"
+            cuda_version: 12.8.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.9.0
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-uv-base"
+            platforms: "linux/amd64,linux/arm64"
+          - cuda: "129"
+            cuda_version: 12.9.1
+            cudnn_version: ""
+            python_version: "3.12"
+            pytorch: 2.9.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-uv-base"
+            platforms: "linux/amd64,linux/arm64"
           - cuda: "130"
             cuda_version: 13.0.0
             cudnn_version: ""
@@ -156,6 +172,15 @@ jobs:
             pytorch: 2.9.1
             torch_cuda_arch_list: "9.0+PTX"
             dockerfile: "Dockerfile-uv-base"
+            platforms: "linux/amd64,linux/arm64"
+          - cuda: "130"
+            cuda_version: 13.0.0
+            cudnn_version: ""
+            python_version: "3.12"
+            pytorch: 2.9.1
+            torch_cuda_arch_list: "9.0+PTX"
+            dockerfile: "Dockerfile-uv-base"
+            platforms: "linux/amd64,linux/arm64"
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -167,6 +192,7 @@ jobs:
             axolotlai/axolotl-base-uv
       - name: Login to Docker Hub
         uses: docker/login-action@v2
+        if: ${{ github.event_name != 'pull_request' && env.HAS_DOCKERHUB_CREDS == 'true' }}
         with:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
@@ -177,6 +203,7 @@ jobs:
         with:
           context: .
           file: ./docker/${{ matrix.dockerfile }}
+          platforms: ${{ matrix.platforms }}
           push: ${{ github.event_name != 'pull_request' }}
           tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
           labels: ${{ steps.metadata.outputs.labels }}
```
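The recurring `HAS_DOCKERHUB_CREDS` addition above gates the registry login on whether the Docker Hub secrets are actually populated. A minimal sketch of the pattern as used in these workflows, assuming a setup where forked PRs run without the `DOCKERHUB_USERNAME`/`DOCKERHUB_TOKEN` secrets:

```yaml
jobs:
  build:
    runs-on: ubuntu-latest
    env:
      # Secrets resolve to empty strings on forked pull requests, so this
      # flag evaluates to 'false' there and 'true' on the main repository.
      HAS_DOCKERHUB_CREDS: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_TOKEN != '' }}
    steps:
      - name: Login to Docker Hub
        uses: docker/login-action@v2
        # Skip the login entirely when credentials are absent, instead of
        # failing the job with an authentication error.
        if: ${{ github.event_name != 'pull_request' && env.HAS_DOCKERHUB_CREDS == 'true' }}
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
```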
**.github/workflows/main.yml** (93 changes, vendored)

```diff
@@ -15,37 +15,37 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.0
-            axolotl_extras:
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras: vllm
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras:
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
             pytorch: 2.8.0
             axolotl_extras:
-            is_latest: true
+            platforms: "linux/amd64"
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
             pytorch: 2.9.0
             axolotl_extras:
+            platforms: "linux/amd64,linux/arm64"
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
             pytorch: 2.9.1
             axolotl_extras:
+            platforms: "linux/amd64,linux/arm64"
+            is_latest: true
+          - cuda: 129
+            cuda_version: 12.9.1
+            python_version: "3.12"
+            pytorch: 2.9.1
+            axolotl_extras:
+            platforms: "linux/amd64,linux/arm64"
+          - cuda: 130
+            cuda_version: 13.0.0
+            python_version: "3.11"
+            pytorch: 2.9.1
+            axolotl_extras:
+            platforms: "linux/amd64,linux/arm64"
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
@@ -71,6 +71,7 @@ jobs:
         uses: docker/build-push-action@v5
         with:
           context: .
+          platforms: ${{ matrix.platforms }}
           build-args: |
             BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
             CUDA=${{ matrix.cuda }}
@@ -92,43 +93,37 @@ jobs:
     strategy:
       matrix:
         include:
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.0
-            axolotl_extras:
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras:
-            is_latest:
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras: vllm
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras:
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
             pytorch: 2.8.0
             axolotl_extras:
-            is_latest: true
+            platforms: "linux/amd64"
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
             pytorch: 2.9.0
             axolotl_extras:
+            platforms: "linux/amd64,linux/arm64"
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
             pytorch: 2.9.1
             axolotl_extras:
+            is_latest: true
+            platforms: "linux/amd64,linux/arm64"
+          - cuda: 129
+            cuda_version: 12.9.1
+            python_version: "3.12"
+            pytorch: 2.9.1
+            axolotl_extras:
+            platforms: "linux/amd64,linux/arm64"
+          - cuda: 130
+            cuda_version: 13.0.0
+            python_version: "3.11"
+            pytorch: 2.9.1
+            axolotl_extras:
+            platforms: "linux/amd64,linux/arm64"
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
@@ -153,6 +148,7 @@ jobs:
         uses: docker/build-push-action@v5
         with:
           context: .
+          platforms: ${{ matrix.platforms }}
           build-args: |
             BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
             CUDA=${{ matrix.cuda }}
@@ -170,22 +166,16 @@ jobs:
     strategy:
       matrix:
         include:
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras:
-            is_latest:
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras: vllm
-            is_latest: true
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
-            pytorch: 2.8.0
+            pytorch: 2.9.1
+            axolotl_extras:
+            is_latest: true
+          - cuda: 130
+            cuda_version: 13.0.0
+            python_version: "3.11"
+            pytorch: 2.9.1
             axolotl_extras:
             is_latest:
     runs-on: axolotl-gpu-runner
@@ -212,6 +202,7 @@ jobs:
         uses: docker/build-push-action@v5
         with:
           context: .
+          platforms: linux/amd64,linux/arm64
           build-args: |
             BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
             CUDA=${{ matrix.cuda }}
```
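The per-entry `platforms` key introduced here is what lets older image builds stay amd64-only while the newer PyTorch builds go multi-arch. A minimal sketch of how a matrix value flows into buildx (versions taken from the diff above):

```yaml
strategy:
  matrix:
    include:
      - pytorch: 2.8.0
        platforms: "linux/amd64"               # amd64-only image
      - pytorch: 2.9.1
        platforms: "linux/amd64,linux/arm64"   # multi-arch image
steps:
  - uses: docker/build-push-action@v5
    with:
      context: .
      # buildx produces one manifest covering every platform listed here
      platforms: ${{ matrix.platforms }}
```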
**.github/workflows/multi-gpu-e2e.yml** (36 changes, vendored)

```diff
@@ -19,6 +19,9 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 
+env:
+  MODAL_IMAGE_BUILDER_VERSION: "2025.06"
+
 jobs:
   test-axolotl-multigpu:
     if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
@@ -26,27 +29,32 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras: vllm
-            num_gpus: 2
-            nightly_build: "true"
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
             pytorch: 2.8.0
             axolotl_extras: fbgemm-gpu
             num_gpus: 2
-            nightly_build: "true"
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
-            pytorch: 2.9.0
-            axolotl_extras: fbgemm-gpu
+            pytorch: 2.9.1
+            axolotl_extras: "fbgemm-gpu"
+            num_gpus: 2
+          - cuda: 129
+            cuda_version: 12.9.1
+            python_version: "3.12"
+            pytorch: 2.9.1
+            axolotl_extras: "fbgemm-gpu"
+            num_gpus: 2
+            dockerfile: "Dockerfile-uv.jinja"
+          - cuda: 130
+            cuda_version: 13.0.0
+            python_version: "3.11"
+            pytorch: 2.9.1
+            axolotl_extras:
+            # axolotl_extras: fbgemm-gpu
             num_gpus: 2
-            nightly_build: "true"
     runs-on: [self-hosted, modal]
     timeout-minutes: 120
     steps:
@@ -59,7 +67,7 @@ jobs:
       - name: Install Modal
         run: |
           python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==1.3.0.post1 jinja2
       - name: Update env vars
         run: |
           echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
@@ -68,8 +76,8 @@ jobs:
           echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
           echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
           echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
           echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
       - name: Run tests job on Modal
         run: |
-          modal run cicd.multigpu
+          modal run -m cicd.multigpu
```
**.github/workflows/nightlies.yml** (20 changes, vendored)

```diff
@@ -12,16 +12,16 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras:
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
             pytorch: 2.8.0
             axolotl_extras:
+          - cuda: 128
+            cuda_version: 12.8.1
+            python_version: "3.11"
+            pytorch: 2.9.1
+            axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
@@ -64,16 +64,16 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras:
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
             pytorch: 2.8.0
             axolotl_extras:
+          - cuda: 128
+            cuda_version: 12.8.1
+            python_version: "3.11"
+            pytorch: 2.9.1
+            axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
```
**.github/workflows/pypi.yml** (6 changes, vendored)

```diff
@@ -40,7 +40,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          pip3 install wheel packaging==23.2
+          pip3 install wheel packaging==26.0
           pip3 install --no-build-isolation -e .
           pip3 install -r requirements-dev.txt -r requirements-tests.txt
 
@@ -48,9 +48,9 @@ jobs:
         id: tag
         run: echo ::set-output name=TAG_NAME::$(echo $GITHUB_REF | cut -d / -f 3)
 
-      - name: Update version in setup.py
+      - name: Update version in VERSION file
         run: |
-          sed -i -E 's/version="([0-9.]+)",/version="${{ steps.tag.outputs.TAG_NAME }}",/g' setup.py
+          echo "${{ steps.tag.outputs.TAG_NAME }}" | sed 's/^v//' > VERSION
 
       - name: Build a source dist
         run: |
```
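The release workflow previously rewrote `version="…"` in `setup.py`; it now writes a plain `VERSION` file instead. A sketch of what the new step produces, using a hypothetical tag `v0.13.0` (any actual tag name is set at release time):

```yaml
- name: Update version in VERSION file
  run: |
    # A tag name like "v0.13.0" (hypothetical example) has its leading
    # "v" stripped by sed, so VERSION ends up containing "0.13.0".
    echo "v0.13.0" | sed 's/^v//' > VERSION
    cat VERSION   # prints: 0.13.0
```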
**.github/workflows/tests-nightly.yml** (22 changes, vendored)

```diff
@@ -26,7 +26,7 @@ jobs:
       max-parallel: 2
       matrix:
         python_version: ["3.11"]
-        pytorch_version: ["2.7.1", "2.8.0"]
+        pytorch_version: ["2.8.0", "2.9.0", "2.9.1"]
     timeout-minutes: 20
 
     steps:
@@ -48,7 +48,7 @@ jobs:
       - name: upgrade pip
        run: |
           pip3 install --upgrade pip
-          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
+          pip3 install --upgrade packaging==26.0 setuptools==75.8.0 wheel
 
       - name: Install PyTorch
         run: |
@@ -99,17 +99,17 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 126
-            cuda_version: 12.6.3
+          - cuda: 128
+            cuda_version: 12.8.1
             python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.8.0
             num_gpus: 1
             axolotl_extras:
             nightly_build: "true"
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
-            pytorch: 2.8.0
+            pytorch: 2.9.1
             num_gpus: 1
             axolotl_extras:
             nightly_build: "true"
@@ -123,7 +123,7 @@ jobs:
       - name: Install Modal
         run: |
           python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==1.3.0.post1 jinja2
       - name: Update env vars
         run: |
           echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
@@ -148,10 +148,10 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 126
-            cuda_version: 12.6.3
+          - cuda: 128
+            cuda_version: 12.8.1
             python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.9.1
             num_gpus: 2
             axolotl_extras:
             nightly_build: "true"
@@ -165,7 +165,7 @@ jobs:
       - name: Install Modal
         run: |
           python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==1.3.0.post1 jinja2
       - name: Update env vars
         run: |
           echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
```
**.github/workflows/tests.yml** (107 changes, vendored)

```diff
@@ -54,8 +54,13 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python_version: ["3.11"]
-        pytorch_version: ["2.7.1", "2.8.0", "2.9.0"]
+        python_version: ["3.11", "3.12"]
+        pytorch_version: ["2.8.0", "2.9.0", "2.9.1"]
+        exclude:
+          - python_version: "3.12"
+            pytorch_version: "2.8.0"
+          - python_version: "3.12"
+            pytorch_version: "2.9.0"
     timeout-minutes: 20
 
     steps:
@@ -66,12 +71,13 @@ jobs:
       - name: Check out repository code
         uses: actions/checkout@v4
 
-      # - name: Restore Cache from S3
-      #   id: hf-cache-restore-s3
-      #   run: |
-      #     mkdir -p ~/.cache/huggingface/hub
-      #     curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd
-      #
+      - name: Restore Cache from S3
+        id: hf-cache-restore-s3
+        run: |
+          mkdir -p ~/.cache/huggingface/hub
+          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd --strip-components=1
+          ls -ltr ~/.cache/huggingface/hub/
+
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
@@ -81,7 +87,7 @@ jobs:
       - name: upgrade pip
         run: |
           pip3 install --upgrade pip
-          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
+          pip3 install --upgrade packaging==26.0 setuptools==75.8.0 wheel
 
       - name: Install PyTorch
         run: |
@@ -109,7 +115,10 @@ jobs:
 
       - name: Pre-Download dataset fixture
         run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
+          hf download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
+
+      - name: Show HF cache
+        run: hf cache ls
 
       - name: Run tests
         run: |
@@ -122,6 +131,9 @@ jobs:
           df -h
           pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml
 
+      - name: Show HF cache
+        run: hf cache ls
+
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v5
         with:
@@ -137,8 +149,13 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python_version: ["3.11"]
-        pytorch_version: ["2.7.1", "2.8.0", "2.9.0"]
+        python_version: ["3.11", "3.12"]
+        pytorch_version: ["2.8.0", "2.9.0", "2.9.1"]
+        exclude:
+          - python_version: "3.12"
+            pytorch_version: "2.8.0"
+          - python_version: "3.12"
+            pytorch_version: "2.9.0"
     timeout-minutes: 20
 
     steps:
@@ -149,12 +166,13 @@ jobs:
       - name: Check out repository code
         uses: actions/checkout@v4
 
-      # - name: Restore Cache from S3
-      #   id: hf-cache-restore-s3
-      #   run: |
-      #     mkdir -p ~/.cache/huggingface/hub
-      #     curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd
-      #
+      - name: Restore Cache from S3
+        id: hf-cache-restore-s3
+        run: |
+          mkdir -p ~/.cache/huggingface/hub
+          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/ --use-compress-program unzstd --strip-components=1
+          ls -ltr ~/.cache/huggingface/hub/
+
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
@@ -164,7 +182,7 @@ jobs:
       - name: upgrade pip
         run: |
           pip3 install --upgrade pip
-          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 setuptools_scm build wheel psutil
+          pip3 install --upgrade packaging==26.0 setuptools==75.8.0 setuptools_scm build wheel psutil
 
       - name: Install PyTorch
         run: |
@@ -192,7 +210,7 @@ jobs:
           axolotl --help
 
       - name: Show HF cache
-        run: hf cache scan
+        run: hf cache ls
 
       - name: Run tests
         run: |
@@ -200,8 +218,11 @@ jobs:
           pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
           pytest -v --durations=10 tests/cli/
 
+      - name: Show HF cache
+        run: hf cache ls
+
   gate-skip-e2e:
-    needs: [pre-commit, pytest, pytest-sdist]
+    needs: [pre-commit]
     runs-on: ubuntu-latest
     outputs:
       skip: ${{ steps.compute.outputs.skip }}
@@ -237,16 +258,16 @@ jobs:
     # this job needs to be run on self-hosted GPU runners...
     runs-on: [self-hosted, modal]
     timeout-minutes: 120
-    needs: [pre-commit, pytest, pytest-sdist, gate-skip-e2e]
+    needs: [pre-commit, pytest]
 
     strategy:
       fail-fast: false
       matrix:
         include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.8.0
+          - cuda: 129
+            cuda_version: 12.9.1
+            python_version: "3.12"
+            pytorch: 2.9.1
             num_gpus: 1
             axolotl_extras:
             dockerfile: "Dockerfile-uv.jinja"
@@ -260,7 +281,7 @@ jobs:
       - name: Install Modal
         run: |
           python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==1.3.0.post1 jinja2
       - name: Update env vars
         run: |
           echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
@@ -292,18 +313,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.1
-            num_gpus: 1
-            axolotl_extras:
-          # - cuda: 128
-          #   cuda_version: 12.8.1
-          #   python_version: "3.11"
-          #   pytorch: 2.7.1
-          #   num_gpus: 1
-          #   axolotl_extras:
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
@@ -314,7 +323,13 @@ jobs:
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
-            pytorch: 2.9.0
+            pytorch: 2.9.1
+            num_gpus: 1
+            axolotl_extras:
+          - cuda: 130
+            cuda_version: 13.0.0
+            python_version: "3.11"
+            pytorch: 2.9.1
             num_gpus: 1
             axolotl_extras:
     steps:
@@ -327,7 +342,7 @@ jobs:
       - name: Install Modal
         run: |
           python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==1.3.0.post1 jinja2
       - name: Update env vars
         run: |
           echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
@@ -354,10 +369,10 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.1
+          - cuda: 129
+            cuda_version: 12.9.1
+            python_version: "3.12"
+            pytorch: 2.9.1
             num_gpus: 1
             axolotl_extras:
     steps:
@@ -370,7 +385,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==1.3.0.post1 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
```
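The new `exclude` entries in `tests.yml` prune the matrix rather than enumerate it: 2 Python versions × 3 PyTorch versions would give 6 jobs, and excluding 3.12 with 2.8.0 and 3.12 with 2.9.0 leaves 4 (3.11 against all three, 3.12 only against 2.9.1). As a standalone sketch of the same mechanism:

```yaml
strategy:
  matrix:
    python_version: ["3.11", "3.12"]
    pytorch_version: ["2.8.0", "2.9.0", "2.9.1"]
    # 2 x 3 = 6 combinations; the two excludes below drop the 3.12 pairs
    # this PR doesn't target, leaving 4 jobs.
    exclude:
      - python_version: "3.12"
        pytorch_version: "2.8.0"
      - python_version: "3.12"
        pytorch_version: "2.9.0"
```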
**.pre-commit-config.yaml**

```diff
@@ -11,13 +11,13 @@ repos:
       - id: no-commit-to-branch
         args: ['--branch', 'main']
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.14.7
+    rev: v0.14.10
     hooks:
       - id: ruff
         args: [--fix]
       - id: ruff-format
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.19.0
+    rev: v1.19.1
     hooks:
       - id: mypy
         additional_dependencies:
```
```diff
@@ -123,7 +123,7 @@ datasets:
 | --------------------------------- | -------------------------- | ----------------------------------- |
 | `dataset_prepared_path`           | `"data/last_run_prepared"` | Path for prepared dataset           |
 | `push_dataset_to_hub`             | `""`                       | Push dataset to HF hub              |
-| `dataset_processes`               | `4`                        | Number of preprocessing processes   |
+| `dataset_num_proc`                | `4`                        | Number of preprocessing processes   |
 | `dataset_keep_in_memory`          | `false`                    | Keep dataset in memory              |
 | `shuffle_merged_datasets`         | `true`                     | Shuffle merged datasets             |
 | `shuffle_before_merging_datasets` | `false`                    | Shuffle each dataset before merging |
```
```diff
@@ -39,7 +39,6 @@
 #   type: # linear | dynamic
 #   factor: # float
 
-
 # # Whether you are training a 4-bit GPTQ quantized model
 # gptq: true
 # gptq_groupsize: 128 # group size
@@ -107,7 +106,7 @@
 # push_dataset_to_hub: # repo path
 # # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
 # # if not set.
-# dataset_processes: # defaults to os.cpu_count() if not set
+# dataset_num_proc: # defaults to os.cpu_count() if not set
 # # push checkpoints to hub
 # hub_model_id: # repo path to push finetuned model
 # # how to push checkpoints to hub
@@ -224,9 +223,6 @@
 # eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
 # eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
 
-# # Save model as safetensors (require safetensors package)
-# save_safetensors:
-
 # # Whether to mask out or include the human's prompt from the training labels
 # train_on_inputs: false
 # # Group similarly sized data to minimize padding.
@@ -352,8 +348,6 @@
 # # Allow overwrite yml config using from cli
 # strict:
 
-
-
 base_model: ${BASE_MODEL}
 base_model_ignore_patterns: ${BASE_MODEL_IGNORE_PATTERNS}
 base_model_config: ${BASE_MODEL_CONFIG}
@@ -412,7 +406,7 @@ chat_template_jinja: ${CHAT_TEMPLATE_JINJA}
 default_system_message: ${DEFAULT_SYSTEM_MESSAGE}
 dataset_prepared_path: ${DATASET_PREPARED_PATH}
 push_dataset_to_hub: ${PUSH_DATASET_TO_HUB}
-dataset_processes: ${DATASET_PROCESSES}
+dataset_num_proc: ${DATASET_NUM_PROC}
 dataset_keep_in_memory: ${DATASET_KEEP_IN_MEMORY}
 hub_model_id: ${HUB_MODEL_ID}
 hub_strategy: ${HUB_STRATEGY}
@@ -512,7 +506,6 @@ profiler_steps: ${PROFILER_STEPS}
 loss_watchdog_threshold: ${LOSS_WATCHDOG_THRESHOLD}
 loss_watchdog_patience: ${LOSS_WATCHDOG_PATIENCE}
 
-save_safetensors: ${SAVE_SAFETENSORS}
 train_on_inputs: ${TRAIN_ON_INPUTS}
 group_by_length: ${GROUP_BY_LENGTH}
 gradient_checkpointing: ${GRADIENT_CHECKPOINTING}
```
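The option rename threaded through the docs and templates above (`dataset_processes` → `dataset_num_proc`) would appear in a user config like this sketch (the value 8 is an arbitrary example):

```yaml
# New spelling: number of processes used while preprocessing the dataset;
# per the docs above, defaults to os.cpu_count() when unset.
dataset_num_proc: 8
# Old spelling, replaced by this PR:
# dataset_processes: 8
```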
**README.md** (16 changes)

````diff
@@ -29,15 +29,15 @@
 
 ## 🎉 Latest Updates
 
-- 2025/12: Axolotl now includes support for [Olmo3](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/olmo3), [Trinity](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/trinity), and [Ministral3](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/ministral3).
+- 2025/12: Axolotl now includes support for [Kimi-Linear](https://docs.axolotl.ai/docs/models/kimi-linear.html), [Plano-Orchestrator](https://docs.axolotl.ai/docs/models/plano.html), [MiMo](https://docs.axolotl.ai/docs/models/mimo.html), [InternVL 3.5](https://docs.axolotl.ai/docs/models/internvl3_5.html), [Olmo3](https://docs.axolotl.ai/docs/models/olmo3.html), [Trinity](https://docs.axolotl.ai/docs/models/trinity.html), and [Ministral3](https://docs.axolotl.ai/docs/models/ministral3.html).
-- 2025/10: New model support has been added in Axolotl for: [Qwen3 Next](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/qwen3-next), [Qwen2.5-vl, Qwen3-vl](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen2_5-vl), [Qwen3, Qwen3MoE](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen3), [Granite 4](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/granite4), [HunYuan](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/hunyuan), [Magistral 2509](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral#vision), [Apertus](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/apertus), and [Seed-OSS](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/seed-oss).
+- 2025/10: New model support has been added in Axolotl for: [Qwen3 Next](https://docs.axolotl.ai/docs/models/qwen3-next.html), [Qwen2.5-vl, Qwen3-vl](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen2_5-vl), [Qwen3, Qwen3MoE](https://docs.axolotl.ai/docs/models/qwen3.html), [Granite 4](https://docs.axolotl.ai/docs/models/granite4.html), [HunYuan](https://docs.axolotl.ai/docs/models/hunyuan.html), [Magistral 2509](https://docs.axolotl.ai/docs/models/magistral/vision.html), [Apertus](https://docs.axolotl.ai/docs/models/apertus.html), and [Seed-OSS](https://docs.axolotl.ai/docs/models/seed-oss.html).
 - 2025/09: Axolotl now has text diffusion training. Read more [here](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations/diffusion).
 - 2025/08: QAT has been updated to include NVFP4 support. See [PR](https://github.com/axolotl-ai-cloud/axolotl/pull/3107).
 - 2025/07:
   - ND Parallelism support has been added into Axolotl. Compose Context Parallelism (CP), Tensor Parallelism (TP), and Fully Sharded Data Parallelism (FSDP) within a single node and across multiple nodes. Check out the [blog post](https://huggingface.co/blog/accelerate-nd-parallel) for more info.
-  - Axolotl adds more models: [GPT-OSS](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/gpt-oss), [Gemma 3n](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/gemma3n), [Liquid Foundation Model 2 (LFM2)](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/lfm2), and [Arcee Foundation Models (AFM)](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/afm).
+  - Axolotl adds more models: [GPT-OSS](https://docs.axolotl.ai/docs/models/gpt-oss.html), [Gemma 3n](https://docs.axolotl.ai/docs/models/gemma3n.html), [Liquid Foundation Model 2 (LFM2)](https://docs.axolotl.ai/docs/models/LiquidAI.html), and [Arcee Foundation Models (AFM)](https://docs.axolotl.ai/docs/models/arcee.html).
   - FP8 finetuning with fp8 gather op is now possible in Axolotl via `torchao`. Get started [here](https://docs.axolotl.ai/docs/mixed_precision.html#sec-fp8)!
-  - [Voxtral](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/voxtral), [Magistral 1.1](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral), and [Devstral](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/devstral) with mistral-common tokenizer support has been integrated in Axolotl!
+  - [Voxtral](https://docs.axolotl.ai/docs/models/voxtral.html), [Magistral 1.1](https://docs.axolotl.ai/docs/models/magistral.html), and [Devstral](https://docs.axolotl.ai/docs/models/devstral.html) with mistral-common tokenizer support has been integrated in Axolotl!
   - TiledMLP support for single-GPU to multi-GPU training with DDP, DeepSpeed and FSDP support has been added to support Arctic Long Sequence Training. (ALST). See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst) for using ALST with Axolotl!
 - 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
 
@@ -46,8 +46,8 @@
 <summary>Expand older updates</summary>
 
 - 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning.
-- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral) to start training your own Magistral models with Axolotl!
+- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [docs](https://docs.axolotl.ai/docs/models/magistral.html) to start training your own Magistral models with Axolotl!
-- 2025/04: Llama 4 support has been added in Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-4) to start training your own Llama 4 models with Axolotl's linearized version!
+- 2025/04: Llama 4 support has been added in Axolotl. See [docs](https://docs.axolotl.ai/docs/models/llama-4.html) to start training your own Llama 4 models with Axolotl's linearized version!
 - 2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the [docs](https://docs.axolotl.ai/docs/multimodal.html) to fine-tune your own!
 - 2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the [docs](https://docs.axolotl.ai/docs/lora_optims.html) to give it a try.
 - 2025/02: Axolotl has added GRPO support. Dive into our [blog](https://huggingface.co/blog/axolotl-ai-co/training-llms-w-interpreter-feedback-wasm) and [GRPO example](https://github.com/axolotl-ai-cloud/grpo_code) and have some fun!
@@ -77,7 +77,7 @@ Features:
 
 - NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
 - Python 3.11
-- PyTorch ≥2.7.1
+- PyTorch ≥2.8.0
 
 ### Google Colab
 
@@ -88,7 +88,7 @@ Features:
 #### Using pip
 
 ```bash
-pip3 install -U packaging==23.2 setuptools==75.8.0 wheel ninja
+pip3 install -U packaging==26.0 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
 
 # Download example axolotl configs, deepspeed configs
````
45
_quarto.yml
45
_quarto.yml
@@ -1,6 +1,8 @@
|
|||||||
project:
|
project:
|
||||||
type: website
|
type: website
|
||||||
pre-render: docs/scripts/generate_config_docs.py
|
pre-render:
|
||||||
|
- docs/scripts/generate_config_docs.py
|
||||||
|
- docs/scripts/generate_examples_docs.py
|
||||||
|
|
||||||
quartodoc:
|
quartodoc:
|
||||||
dir: docs/api
|
dir: docs/api
|
||||||
@@ -240,6 +242,46 @@ website:
             - docs/getting-started.qmd
             - docs/installation.qmd
             - docs/inference.qmd
+            - section: "Model Guides"
+              contents:
+                - docs/models/kimi-linear.qmd
+                - docs/models/plano.qmd
+                - docs/models/mimo.qmd
+                - docs/models/internvl3_5.qmd
+                - docs/models/olmo3.qmd
+                - docs/models/trinity.qmd
+                - docs/models/arcee.qmd
+                - section: "Ministral3"
+                  contents:
+                    - docs/models/ministral3.qmd
+                    - docs/models/ministral3/think.qmd
+                    - docs/models/ministral3/vision.qmd
+                - section: "Magistral"
+                  contents:
+                    - docs/models/magistral.qmd
+                    - docs/models/magistral/think.qmd
+                    - docs/models/magistral/vision.qmd
+                - docs/models/ministral.qmd
+                - docs/models/mistral-small.qmd
+                - docs/models/voxtral.qmd
+                - docs/models/devstral.qmd
+                - docs/models/mistral.qmd
+                - docs/models/llama-4.qmd
+                - docs/models/llama-2.qmd
+                - docs/models/qwen3-next.qmd
+                - docs/models/qwen3.qmd
+                - docs/models/gemma3n.qmd
+                - docs/models/apertus.qmd
+                - docs/models/gpt-oss.qmd
+                - docs/models/seed-oss.qmd
+                - docs/models/phi.qmd
+                - docs/models/smolvlm2.qmd
+                - docs/models/granite4.qmd
+                - docs/models/LiquidAI.qmd
+                - docs/models/hunyuan.qmd
+                - docs/models/jamba.qmd
+                - docs/models/orpheus.qmd
             - docs/cli.qmd
             - docs/telemetry.qmd
             - docs/config-reference.qmd
@@ -278,6 +320,7 @@ website:
             - docs/multipack.qmd
             - docs/mixed_precision.qmd
             - docs/optimizers.qmd
+            - docs/attention.qmd

             - section: "Advanced Features"
               contents:
@@ -31,7 +31,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
         sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
     fi

-RUN uv pip install packaging==23.2 setuptools==75.8.0
+RUN uv pip install packaging==26.0 setuptools==75.8.0
 RUN uv pip install torchvision
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
         uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
@@ -32,7 +32,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
         sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
     fi

-RUN pip install packaging==23.2 setuptools==75.8.0 psutil
+RUN pip install packaging==26.0 setuptools==75.8.0 psutil
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
         pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
     else \
@@ -17,7 +17,8 @@ template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
 template_env = jinja2.Environment(
     loader=template_loader, autoescape=select_autoescape()
 )
-df_template = template_env.get_template("Dockerfile.jinja")
+dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
+df_template = template_env.get_template(dockerfile)

 df_args = {
     "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
@@ -27,8 +28,11 @@ df_args = {
     "CUDA": os.environ.get("CUDA", "126"),
     "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
     "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
+    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
     "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
     "HF_HOME": "/workspace/data/huggingface-cache/hub",
+    "PYTHONUNBUFFERED": os.environ.get("PYTHONUNBUFFERED", "1"),
+    "DEEPSPEED_LOG_LEVEL": os.environ.get("DEEPSPEED_LOG_LEVEL", "WARNING"),
 }

 dockerfile_contents = df_template.render(**df_args)
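The two hunks above let CI render an alternate Dockerfile template via the new `E2E_DOCKERFILE` environment variable. A hedged usage sketch — the template filename and script path below are illustrative assumptions; only the environment variables themselves come from this diff:

```bash
# Hypothetical invocation: "Dockerfile-uv.jinja" and "cicd/tests.py" are
# illustrative names; E2E_DOCKERFILE, CUDA and NIGHTLY_BUILD are from the diff.
E2E_DOCKERFILE=Dockerfile-uv.jinja CUDA=128 NIGHTLY_BUILD=true \
    python cicd/tests.py
```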
@@ -2,7 +2,7 @@
 set -e

 # Only run two tests at a time to avoid OOM on GPU (with coverage collection)
-pytest -v --durations=10 -n2 \
+pytest -v --durations=10 -n2 --maxfail=3 \
     --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
     --ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
     /workspace/axolotl/tests/e2e/multigpu/ \
@@ -6,6 +6,7 @@ ARG AXOLOTL_EXTRAS=""
 ARG AXOLOTL_ARGS=""
 ARG CUDA="118"
 ARG PYTORCH_VERSION="2.1.2"
+ARG TARGETARCH

 ENV PYTORCH_VERSION=$PYTORCH_VERSION

@@ -20,13 +21,17 @@ RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git

 WORKDIR /workspace/axolotl

-# If AXOLOTL_EXTRAS is set, append it in brackets
-RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+# If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
+RUN if [ "$TARGETARCH" = "arm64" ]; then \
+        BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \
     else \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
+        BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \
     fi && \
-    python scripts/unsloth_install.py | sh && \
+    if [ "$AXOLOTL_EXTRAS" != "" ]; then \
+        pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+    else \
+        pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
+    fi && \
+    python scripts/unsloth_install.py | sh && \
     python scripts/cutcrossentropy_install.py | sh && \
     pip install pytest && \
     pip cache purge
@@ -2,14 +2,16 @@ ARG CUDA_VERSION="11.8.0"
 ARG CUDNN_VERSION="8"
 ARG UBUNTU_VERSION="22.04"
 ARG MAX_JOBS=4
+ARG TARGETARCH

 FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder

 ENV PATH="/root/miniconda3/bin:${PATH}"

-ARG PYTHON_VERSION="3.10"
+ARG TARGETARCH
+ARG PYTHON_VERSION="3.11"
 ARG PYTORCH_VERSION="2.1.2"
-ARG CUDA="118"
+ARG CUDA="128"
 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

 ENV PYTHON_VERSION=$PYTHON_VERSION
@@ -22,11 +24,17 @@ RUN apt-get update \
         librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm \
     && rm -rf /var/cache/apt/archives \
     && rm -rf /var/lib/apt/lists/* \
-    && wget \
-        https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
+    && if [ "$TARGETARCH" = "amd64" ]; then \
+        MINICONDA_ARCH="x86_64"; \
+    elif [ "$TARGETARCH" = "arm64" ]; then \
+        MINICONDA_ARCH="aarch64"; \
+    else \
+        echo "Unsupported architecture: $TARGETARCH"; exit 1; \
+    fi \
+    && wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh \
     && mkdir /root/.conda \
-    && bash Miniconda3-latest-Linux-x86_64.sh -b \
-    && rm -f Miniconda3-latest-Linux-x86_64.sh \
+    && bash Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh -b \
+    && rm -f Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh \
     && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \
     && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \
     && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
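`TARGETARCH` is populated automatically by Docker BuildKit for each platform in a multi-arch build, which is what makes the branching above work. A hedged sketch of such a build — the `-f` path and image tag are assumptions, not taken from this diff:

```bash
# BuildKit sets TARGETARCH (amd64/arm64) per platform automatically.
# The Dockerfile path and tag below are illustrative.
docker buildx build \
    --platform linux/amd64,linux/arm64 \
    -f Dockerfile-base \
    -t axolotlai/axolotl-base:multiarch \
    .
```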
@@ -35,7 +43,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

 WORKDIR /workspace

-RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel psutil && \
+RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==26.0 setuptools==75.8.0 wheel psutil && \
     python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
     python3 -m pip cache purge

@@ -51,8 +59,34 @@ RUN git lfs install --skip-repo && \
     pip3 install -U --no-cache-dir pydantic==1.10.10 && \
     pip3 cache purge

-RUN if [ "$PYTORCH_VERSION" = "2.9.1" ] && [ "$CUDA" = "128" ] ; then \
-        wget https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.4.17/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
-        pip3 install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
-        rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
-    fi
+RUN case "$PYTORCH_VERSION" in \
+        2.9.[0-9]*) \
+            if [ "$CUDA" = "128" ]; then \
+                if [ "$TARGETARCH" = "amd64" ]; then \
+                    WHL_FILE="flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl"; \
+                    WHL_VERSION="v0.5.4"; \
+                elif [ "$TARGETARCH" = "arm64" ]; then \
+                    WHL_FILE="flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_aarch64.whl"; \
+                    WHL_VERSION="v0.6.4"; \
+                else \
+                    echo "Unsupported architecture: $TARGETARCH"; exit 1; \
+                fi; \
+                wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}; \
+                pip3 install --no-cache-dir ${WHL_FILE}; \
+                rm ${WHL_FILE}; \
+            elif [ "$CUDA" = "130" ]; then \
+                if [ "$TARGETARCH" = "amd64" ]; then \
+                    WHL_FILE="flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_x86_64.whl"; \
+                    WHL_VERSION="v0.5.4"; \
+                elif [ "$TARGETARCH" = "arm64" ]; then \
+                    WHL_FILE="flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_aarch64.whl"; \
+                    WHL_VERSION="v0.6.4"; \
+                else \
+                    echo "Unsupported architecture: $TARGETARCH"; exit 1; \
+                fi; \
+                wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}; \
+                pip3 install --no-cache-dir ${WHL_FILE}; \
+                rm ${WHL_FILE}; \
+            fi \
+            ;; \
+    esac
@@ -30,7 +30,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

 WORKDIR /workspace

-RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
+RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==26.0 setuptools==75.8.0 wheel && \
     python3 -m pip install --no-cache-dir -U torch --extra-index-url https://download.pytorch.org/whl/nightly/cu$CUDA && \
     python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
     python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \
@@ -2,6 +2,7 @@ ARG CUDA_VERSION="12.6.3"
 ARG CUDNN_VERSION=""
 ARG UBUNTU_VERSION="22.04"
 ARG MAX_JOBS=4
+ARG TARGETARCH

 FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder

@@ -31,12 +32,35 @@ ENV PATH="/workspace/axolotl-venv/bin:${PATH}"

 RUN uv pip install packaging setuptools wheel psutil \
     && uv pip install torch==${PYTORCH_VERSION} torchvision \
-    && uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \
-    && uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \
     && uv pip install awscli pydantic

-RUN if [ "$PYTORCH_VERSION" = "2.9.0" ] && [ "$CUDA" = "128" ] ; then \
-        wget https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.4.17/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
-        uv pip install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
-        rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
+RUN if [ "$TARGETARCH" = "amd64" ]; then \
+        uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main"; \
+        uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"; \
     fi
+
+RUN case "$PYTORCH_VERSION" in \
+        2.9.[0-9]*) \
+            if [ "$TARGETARCH" = "amd64" ]; then \
+                if [ "$CUDA" = "128" ]; then \
+                    wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.5.4/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
+                    uv pip install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
+                    rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
+                elif [ "$CUDA" = "130" ]; then \
+                    wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.5.4/flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_x86_64.whl; \
+                    uv pip install --no-cache-dir flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_x86_64.whl; \
+                    rm flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_x86_64.whl; \
+                fi \
+            elif [ "$TARGETARCH" = "arm64" ]; then \
+                if [ "$CUDA" = "128" ]; then \
+                    wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.6.4/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_aarch64.whl; \
+                    uv pip install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_aarch64.whl; \
+                    rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_aarch64.whl; \
+                elif [ "$CUDA" = "130" ]; then \
+                    wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.6.4/flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_aarch64.whl; \
+                    uv pip install --no-cache-dir flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_aarch64.whl; \
+                    rm flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_aarch64.whl; \
+                fi \
+            fi \
+            ;; \
+    esac
2 docs/.gitignore vendored
@@ -3,3 +3,5 @@ _site/
 /api/*.qmd
 /api/*.html
 config-reference.qmd
+models/**/*.qmd
+models/**/*.html
@@ -86,7 +86,7 @@ export HF_DATASETS_OFFLINE=1
 Download a base model using the Hugging Face CLI:

 ```bash
-huggingface-cli download meta-llama/Meta-Llama-3.1-8B --local-dir ~/hfdata/llama3.1-8B
+hf download meta-llama/Meta-Llama-3.1-8B --local-dir ~/hfdata/llama3.1-8B
 ```

 ### 10. Create Axolotl Configuration
140 docs/attention.qmd Normal file
@@ -0,0 +1,140 @@
+---
+title: Attention
+description: Supported attention modules in Axolotl
+---
+
+## SDP Attention
+
+This is the default built-in attention in PyTorch.
+
+```yaml
+sdp_attention: true
+```
+
+For more details: [PyTorch docs](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
+
+## Flash Attention 2
+
+Uses efficient kernels to compute attention.
+
+```yaml
+flash_attention: true
+```
+
+For more details: [Flash Attention](https://github.com/Dao-AILab/flash-attention/)
+
+### Nvidia
+
+Requirements: Ampere, Ada, or Hopper GPUs
+
+Note: For Turing GPUs or older, please use one of the other attention methods.
+
+```bash
+pip install flash-attn --no-build-isolation
+```
+
+::: {.callout-tip}
+If you get `undefined symbol` while training, ensure you installed PyTorch prior to Axolotl. Alternatively, try reinstalling flash-attn or downgrading to an earlier version.
+:::
+
+#### Flash Attention 3
+
+Requirements: Hopper only and CUDA 12.8 (recommended)
+
+```bash
+git clone https://github.com/Dao-AILab/flash-attention.git
+cd flash-attention/hopper
+
+python setup.py install
+```
+
+### AMD
+
+Requirements: ROCm 6.0 and above.
+
+See [Flash Attention AMD docs](https://github.com/Dao-AILab/flash-attention/tree/main?tab=readme-ov-file#amd-rocm-support).
+
+## Flex Attention
+
+A flexible PyTorch API for attention, used in combination with `torch.compile`.
+
+```yaml
+flex_attention: true
+
+# recommended
+torch_compile: true
+```
+
+::: {.callout-note}
+We recommend using the latest stable version of PyTorch for best performance.
+:::
+
+For more details: [PyTorch docs](https://pytorch.org/blog/flexattention/)
+
+## SageAttention
+
+Attention kernels with QK Int8 and PV FP16 accumulator.
+
+```yaml
+sage_attention: true
+```
+
+Requirements: Ampere, Ada, or Hopper GPUs
+
+```bash
+pip install sageattention==2.2.0 --no-build-isolation
+```
+
+::: {.callout-warning}
+Only LoRA/QLoRA is recommended at the moment. We found the loss drops to 0 with full fine-tuning. See [GitHub Issue](https://github.com/thu-ml/SageAttention/issues/198).
+:::
+
+For more details: [Sage Attention](https://github.com/thu-ml/SageAttention)
+
+::: {.callout-note}
+We do not support SageAttention 3 at the moment. If you are interested in adding this or improving the SageAttention implementation, please open an Issue.
+:::
+
+## xFormers
+
+```yaml
+xformers_attention: true
+```
+
+::: {.callout-tip}
+We recommend using this with Turing GPUs or older (such as on Colab).
+:::
+
+For more details: [xFormers](https://github.com/facebookresearch/xformers)
+
+## Shifted Sparse Attention
+
+::: {.callout-warning}
+We plan to deprecate this! If you use this feature, we recommend switching to one of the methods above.
+:::
+
+Requirements: LLaMA model architecture
+
+```yaml
+flash_attention: true
+s2_attention: true
+```
+
+::: {.callout-tip}
+No sample packing support!
+:::
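For reference, `sdp_attention: true` in the new doc maps onto PyTorch's built-in `torch.nn.functional.scaled_dot_product_attention`. A minimal standalone sketch of that primitive, independent of Axolotl (shapes are illustrative):

```python
# Plain-PyTorch SDPA sketch; shapes are illustrative.
import torch
import torch.nn.functional as F

q = torch.randn(1, 8, 128, 64)  # (batch, heads, seq_len, head_dim)
k = torch.randn(1, 8, 128, 64)
v = torch.randn(1, 8, 128, 64)

# Dispatches to a fused kernel (flash / memory-efficient / math) when available
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
print(out.shape)  # torch.Size([1, 8, 128, 64])
```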
86 docs/checkpoint_saving.qmd Normal file
@@ -0,0 +1,86 @@
+---
+title: "Checkpoint Saving"
+format:
+  html:
+    toc: true
+    toc-depth: 2
+    number-sections: true
+execute:
+  enabled: false
+---
+
+## Overview
+
+Axolotl supports on-demand checkpoint saving during training. You can trigger checkpoints via file-based triggers (for programmatic control) or Control+C (for interactive use).
+
+## File-Based Checkpoint Trigger
+
+### Configuration
+
+Enable in your config:
+
+```yaml
+dynamic_checkpoint:
+  enabled: true
+  check_interval: 100  # Optional: check every N steps (default: 100)
+  trigger_file_path: "axolotl_checkpoint.save"  # Optional: custom filename
+```
+
+**Options:**
+- `enabled`: `true` to enable (required)
+- `check_interval`: Steps between file checks. Default: 100. Lower values mean faster response but higher I/O overhead.
+- `trigger_file_path`: Custom trigger filename. Default: `axolotl_checkpoint.save`
+
+### How It Works
+
+1. Rank 0 checks for the trigger file in `output_dir` every `check_interval` steps
+2. When the file is detected, it is deleted and a checkpoint is saved
+3. In distributed training, rank 0 broadcasts the decision to synchronize all ranks
+
+### Usage
+
+**Command line:**
+```bash
+touch /path/to/output_dir/axolotl_checkpoint.save
+```
+
+**Programmatic:**
+```python
+from pathlib import Path
+Path("/path/to/output_dir/axolotl_checkpoint.save").touch()
+```
+
+The checkpoint saves within the next `check_interval` steps. The trigger file is auto-deleted after detection, so you can create it multiple times.
+
+**Custom filename:**
+```yaml
+dynamic_checkpoint:
+  enabled: true
+  trigger_file_path: "my_trigger.save"
+```
+```bash
+touch /path/to/output_dir/my_trigger.save
+```
+
+## Control+C (SIGINT) Checkpoint
+
+Pressing `Ctrl+C` during training saves the model state and exits gracefully. **Note:** This saves only the model weights, not the optimizer state. For resumable checkpoints, use the file-based trigger.
+
+## Best Practices
+
+- **Check interval**: Use lower values (10-50) for fast training loops, and the default 100 for slower training
+- **Distributed training**: Create the trigger file once; rank 0 handles synchronization
+- **Resume**: Dynamic checkpoints can be resumed like regular checkpoints via `resume_from_checkpoint`
+
+## Example
+
+```yaml
+output_dir: ./outputs/lora-out
+save_steps: 500  # Scheduled checkpoints
+
+dynamic_checkpoint:
+  enabled: true
+  check_interval: 50
+```
+
+This enables scheduled checkpoints every 500 steps plus on-demand saves via the file trigger (checked every 50 steps).
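A hedged sketch of the detect-and-broadcast pattern described in steps 1-3 of the new doc; the function and variable names here are illustrative, not Axolotl's actual internals:

```python
# Illustrative sketch of the file-based trigger; not Axolotl's real code.
import os
import torch.distributed as dist

def should_save_checkpoint(output_dir: str, trigger_file: str) -> bool:
    """Rank 0 checks for the trigger file and broadcasts the decision."""
    found = 0
    rank = dist.get_rank() if dist.is_initialized() else 0
    if rank == 0:
        path = os.path.join(output_dir, trigger_file)
        if os.path.exists(path):
            os.remove(path)  # auto-delete so the trigger can be reused
            found = 1
    if dist.is_initialized():
        # Broadcast rank 0's decision so every rank saves in lockstep
        decision = [found]
        dist.broadcast_object_list(decision, src=0)
        found = decision[0]
    return bool(found)
```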
@@ -210,6 +210,8 @@ axolotl lm-eval config.yml
 Configuration options:

 ```yaml
+lm_eval_model: # model to evaluate (local or hf path)
+
 # List of tasks to evaluate
 lm_eval_tasks:
   - arc_challenge
@@ -218,7 +220,7 @@ lm_eval_batch_size: # Batch size for evaluation
 output_dir: # Directory to save evaluation results
 ```

-See [LM Eval Harness](https://github.com/EleutherAI/lm-evaluation-harness) for more details.
+See [LM Eval Harness integration docs](https://docs.axolotl.ai/docs/custom_integrations.html#language-model-evaluation-harness-lm-eval) for full configuration details.

 ### delinearize-llama4

@@ -32,11 +32,8 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}

 Tags examples:

-- `main-base-py3.11-cu128-2.7.1`
-- `main-base-py3.11-cu126-2.7.1`
-- `main-base-py3.11-cu126-2.7.0`
-- `main-base-py3.11-cu126-2.6.0`
-- `main-base-py3.11-cu124-2.6.0`
+- `main-base-py3.11-cu128-2.8.0`
+- `main-base-py3.11-cu128-2.9.1`

 ## Main

@@ -74,15 +71,12 @@ There may be some extra tags appended to the image, like `-vllm` which installs

 Tags examples:

-- `main-py3.11-cu128-2.7.1`
-- `main-py3.11-cu126-2.7.1`
-- `main-py3.11-cu126-2.7.0`
-- `main-py3.11-cu126-2.6.0`
-- `main-py3.11-cu124-2.6.0`
+- `main-py3.11-cu128-2.8.0`
+- `main-py3.11-cu128-2.9.1`
 - `main-latest`
 - `main-20250303-py3.11-cu124-2.6.0`
 - `main-20250303-py3.11-cu126-2.6.0`
-- `0.10.1`
+- `0.12.0`

 ## Cloud

@@ -26,7 +26,7 @@ Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
 :::

 ::: {.callout-important}
-For Blackwell GPUs, please use Pytorch 2.7.0 and CUDA 12.8.
+For Blackwell GPUs, please use Pytorch 2.9.1 and CUDA 12.8.
 :::

 ### PyPI Installation (Recommended) {#sec-pypi}
@@ -111,7 +111,7 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
 :::

 ::: {.callout-important}
-For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.7.0` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.7.0`.
+For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.9.1` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.9.1`.
 :::

 Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.
@@ -165,7 +165,7 @@ We recommend using WSL2 (Windows Subsystem for Linux) or Docker.
 ```
 4. (Optional) Login to Hugging Face:
 ```{.bash}
-huggingface-cli login
+hf auth login
 ```

 ## Troubleshooting {#sec-troubleshooting}
@@ -89,6 +89,10 @@ lora_o_kernel: true
 Currently, LoRA kernels are not supported for RLHF training, only SFT.
 :::

+::: {.callout-warning}
+LoRA kernels do not support remote modeling code.
+:::
+
 ## Requirements

 - One or more NVIDIA or AMD GPUs (in order to use the Triton kernels)
@@ -19,8 +19,10 @@ format:
 - [Gemma-3n](#sec-gemma-3n)
 - [Qwen2-VL](#sec-qwen2-vl)
 - [Qwen2.5-VL](#sec-qwen25-vl)
+- [GLM-4.6V](#sec-glm-4-6v)
 - [SmolVLM2](#sec-smolvlm2)
 - [LFM2-VL](#sec-lfm2-vl)
+- [Intern-VL](#sec-intern-vl)

 ## Usage

@@ -182,6 +184,18 @@ base_model: Qwen/Qwen3-VL-4B-Instruct
 chat_template: qwen2_vl # same as qwen2-vl
 ```

+### GLM-4.6V {#sec-glm-4-6v}
+
+Both GLM-4.6V (106B MoE) and GLM-4.6V-Flash (9B) are supported.
+
+```yaml
+# GLM-4.6V (106B MoE version)
+base_model: zai-org/GLM-4.6V
+
+# OR GLM-4.6V-Flash (9B version)
+base_model: zai-org/GLM-4.6V-Flash
+```
+
 ### SmolVLM2 {#sec-smolvlm2}

 ::: {.callout-tip}
@@ -202,6 +216,16 @@ Please uninstall `causal-conv1d` via `pip3 uninstall -y causal-conv1d`
 base_model: LiquidAI/LFM2-VL-450M
 ```

+### Intern-VL {#sec-intern-vl}
+
+::: {.callout-tip}
+Please make sure to install `timm` via `pip3 install timm==1.0.19`
+:::
+
+```yaml
+base_model: OpenGVLab/InternVL3_5-8B
+```
+
 ## Dataset Format

 For multi-modal datasets, we adopt an extended `chat_template` format similar to OpenAI's Message format.
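As a hedged illustration of that extended format: the sketch below follows the OpenAI-style content-list convention the sentence refers to; the exact keys Axolotl accepts are defined in the dataset-format section this hunk leads into, so treat the field names as assumptions:

```json
{
  "messages": [
    {
      "role": "user",
      "content": [
        {"type": "image", "path": "/data/images/example.jpg"},
        {"type": "text", "text": "Describe this image."}
      ]
    },
    {
      "role": "assistant",
      "content": [{"type": "text", "text": "A cat sitting on a windowsill."}]
    }
  ]
}
```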
@@ -17,6 +17,7 @@ feedback. Various methods include, but not limited to:
 - [Kahneman-Tversky Optimization (KTO)](#kto)
 - [Odds Ratio Preference Optimization (ORPO)](#orpo)
 - [Group Relative Policy Optimization (GRPO)](#grpo)
+- [Group Reward-Decoupled Policy Optimization (GDPO)](#gdpo)


 ## RLHF using Axolotl
@@ -720,6 +721,102 @@ trl:

 For more information, see [GRPO docs](https://huggingface.co/docs/trl/v0.17.0/en/grpo_trainer#loss-types).

+### GDPO
+
+GDPO (Group Reward-Decoupled Policy Optimization) extends GRPO for multi-reward training. It addresses the **reward advantage collapse** problem by normalizing each reward function independently before combining them.
+
+::: {.callout-tip}
+Use GDPO when training with multiple reward functions. For a single reward, GRPO and GDPO produce equivalent results.
+:::
+
+Paper: [https://arxiv.org/pdf/2501.05242](https://arxiv.org/pdf/2501.05242)
+
+GDPO uses TRL's native `multi_objective_aggregation` parameter under the hood. When you set `rl: gdpo`, Axolotl automatically configures TRL to use `normalize_then_sum` aggregation.
+
+```yaml
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+
+vllm:
+  host: 0.0.0.0
+  port: 8000
+  tensor_parallel_size: 2
+  gpu_memory_utilization: 0.85
+
+rl: gdpo
+
+trl:
+  beta: 0.001
+  max_completion_length: 256
+  use_vllm: true
+  num_generations: 4
+  reward_funcs:
+    - rewards.format_reward
+    - rewards.correctness_reward
+  reward_weights: [1.0, 2.0]
+
+datasets:
+  - path: openai/gsm8k
+    name: main
+    type: rewards.oai_gsm8k_transform
+```
+
+You can also use GRPO with explicit aggregation control:
+
+```yaml
+rl: grpo
+trl:
+  multi_objective_aggregation: normalize_then_sum  # GDPO behavior
+  # or: sum_then_normalize  # Default GRPO behavior
+```
+
+#### GDPO vs GRPO
+
+| Aspect | GRPO | GDPO |
+|--------|------|------|
+| **Aggregation** | `sum_then_normalize` | `normalize_then_sum` |
+| **Multi-reward** | May collapse advantages | Preserves reward signals |
+| **Single reward** | Standard behavior | Equivalent to GRPO |
+
+#### Why GDPO?
+
+When using multiple rewards with GRPO, different reward combinations can produce identical advantages:
+
+```
+# Example: format + correctness rewards
+[format=0, correct=3] → sum=3
+[format=1, correct=2] → sum=3 ← GRPO sees these as equal!
+[format=2, correct=1] → sum=3
+[format=3, correct=0] → sum=3
+```
+
+GDPO normalizes each reward independently, preserving their relative differences.
+
+#### Reward Functions
+
+GDPO uses the same reward function format as GRPO:
+
+```python
+# rewards.py
+def format_reward(completions, **kwargs) -> list[float]:
+    return [1.0 if len(c) > 10 else 0.0 for c in completions]
+
+def correctness_reward(completions, answers, **kwargs) -> list[float]:
+    rewards = []
+    for completion, answer in zip(completions, answers):
+        # Your scoring logic here
+        rewards.append(score)
+    return rewards
+```
+
+#### Sequence Parallelism
+
+GDPO supports sequence parallelism for long-context training:
+
+```yaml
+rl: gdpo
+context_parallel_size: 2
+```
+
 ### SimPO

 SimPO uses [CPOTrainer](https://huggingface.co/docs/trl/main/en/cpo_trainer) but with an alternative loss function.
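A hedged, plain-Python sketch of the aggregation difference the new GDPO section describes; TRL's actual implementation operates on tensors, so this is illustrative only:

```python
# Illustrative comparison of sum_then_normalize (GRPO) vs normalize_then_sum (GDPO).
def normalize(xs):
    mean = sum(xs) / len(xs)
    std = (sum((x - mean) ** 2 for x in xs) / len(xs)) ** 0.5
    return [(x - mean) / (std if std > 0 else 1.0) for x in xs]

# Rewards on very different scales: binary format vs 0-10 correctness.
format_r = [1.0, 0.0, 1.0, 0.0]
correct_r = [0.0, 10.0, 10.0, 0.0]

# sum_then_normalize: the large-scale reward dominates, so the format
# signal is nearly invisible in the resulting advantages.
grpo_adv = normalize([f + c for f, c in zip(format_r, correct_r)])

# normalize_then_sum: each reward is standardized first, so both
# signals contribute on a comparable scale.
gdpo_adv = [f + c for f, c in zip(normalize(format_r), normalize(correct_r))]

print([round(a, 2) for a in grpo_adv])  # [-0.9, 0.9, 1.09, -1.09]
print([round(a, 2) for a in gdpo_adv])  # [0.0, 0.0, 2.0, -2.0]
```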
90 docs/scripts/examples-allowlist.yml Normal file
@@ -0,0 +1,90 @@
+examples:
+  # December 2025
+  - name: kimi-linear
+    title: Kimi Linear
+  - name: plano
+    title: Plano Orchestrator
+  - name: mimo
+    title: MiMo
+  - name: internvl3_5
+    title: InternVL 3.5
+
+  # AllenAI
+  - name: olmo3
+    title: OLMo 3
+
+  # ArceeAI
+  - name: trinity
+    title: Trinity
+  - name: arcee
+    title: Arcee AFM
+
+  # MistralAI
+  - name: ministral3/think
+    title: Ministral 3 Thinking
+  - name: ministral3/vision
+    title: Ministral 3 Vision
+  - name: magistral/think
+    title: Magistral Thinking
+  - name: magistral/vision
+    title: Magistral Vision
+  - name: ministral
+    title: Ministral
+  - name: mistral-small
+    title: Mistral Small 3.1/3.2
+  - name: voxtral
+    title: Voxtral
+  - name: devstral
+    title: Devstral
+  - name: mistral
+    title: Mistral 7B
+
+  # Meta
+  - name: llama-4
+    title: Llama 4
+  - name: llama-2
+    title: Llama 2
+
+  # Alibaba
+  - name: qwen3-next
+    title: Qwen 3 Next
+  - name: qwen3
+    title: Qwen 3
+
+  # Google
+  - name: gemma3n
+    title: Gemma 3n
+
+  # Swiss AI
+  - name: apertus
+    title: Apertus
+
+  # GPT-OSS
+  - name: gpt-oss
+    title: GPT-OSS
+  - name: seed-oss
+    title: Seed-OSS
+
+  # Microsoft
+  - name: phi
+    title: Phi
+
+  # SmolVLM
+  - name: smolvlm2
+    title: SmolVLM 2
+
+  # IBM
+  - name: granite4
+    title: Granite 4
+
+  # LiquidAI
+  - name: LiquidAI
+    title: Liquid Foundation Models 2
+
+  # Other
+  - name: hunyuan
+    title: Hunyuan
+  - name: jamba
+    title: Jamba
+  - name: orpheus
+    title: Orpheus
424 docs/scripts/generate_examples_docs.py Executable file
@@ -0,0 +1,424 @@
+"""
+auto generate example docs from allowlist
+"""
+
+import re
+import shutil
+import sys
+from pathlib import Path
+
+import yaml
+
+# Paths
+THIS = Path(__file__).resolve()
+ROOT = THIS.parents[2]  # repo root (docs/scripts -> docs -> ROOT)
+EXAMPLES_DIR = ROOT / "examples"
+OUTPUT_DIR = ROOT / "docs" / "models"
+ALLOWLIST_YML = THIS.parent / "examples-allowlist.yml"
+
+
+def slugify(name: str) -> str:
+    """Convert a name to a slug (lowercase, hyphens for spaces)."""
+    s = re.sub(r"[^a-zA-Z0-9\s\-]+", "", name.strip())
+    s = re.sub(r"\s+", "-", s).strip("-").lower()
+    return s or "example"
+
+
+def read_allowlist():
+    with open(ALLOWLIST_YML, "r", encoding="utf-8") as f:
+        data = yaml.safe_load(f) or {}
+    items = data.get("examples", [])
+    if not isinstance(items, list):
+        raise ValueError("`examples` must be a list in examples-allowlist.yml")
+    return items
+
+
+def find_readme(folder: Path) -> Path | None:
+    for name in ("README.md", "Readme.md", "readme.md"):
+        p = folder / name
+        if p.exists():
+            return p
+    return None
+
+
+def remove_first_h1(md: str) -> tuple[str, str | None]:
+    """
+    Remove the first H1 from markdown and return (modified_md, h1_title).
+    The H1 is removed since we use the frontmatter title instead.
+    """
+    lines = md.splitlines()
+    result = []
+    h1_title = None
+    skipped_first = False
+
+    for line in lines:
+        if not skipped_first and line.startswith("# "):
+            h1_title = line[2:].strip()
+            skipped_first = True
+            continue
+        result.append(line)
+
+    return "\n".join(result), h1_title
+
+
+IMG_RE = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")
+LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
+
+
+def rewrite_and_copy_assets(md: str, src_dir: Path, dest_assets_root: Path) -> str:
+    """
+    Copy local image assets referenced in markdown to
+    docs/examples/assets/... and rewrite the links.
+    """
+    dest_assets = dest_assets_root / "assets"
+
+    def repl(m):
+        url = m.group(1).strip()
+        if re.match(r"^(https?:)?//", url):
+            return m.group(0)  # leave remote URLs
+        src_path = (src_dir / url).resolve()
+        if not src_path.exists():
+            return m.group(0)  # leave as-is if not found
+        rel = src_path.relative_to(src_dir)
+        # Create a unique asset path based on source directory name
+        asset_name = src_dir.name.replace("/", "-")
+        dest_path = dest_assets / asset_name / rel
+        dest_path.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(src_path, dest_path)
+        new_rel = f"assets/{asset_name}/{rel.as_posix()}"
+        return m.group(0).replace(url, new_rel)
+
+    return IMG_RE.sub(repl, md)
+
+
+def rewrite_readme_links(
+    md: str,
+    src_dir: Path,
+    examples_dir: Path,
+    parent_index_only: set,
+    current_src_path: str,
+    allowlist_entries: set,
+    current_output_path: str,
+) -> str:
+    """
+    Rewrite links between README.md files to point to the correct .qmd files.
+    """
+
+    def repl(m):
+        text = m.group(1)
+        url = m.group(2).strip()
+
+        # Skip remote URLs and anchor links
+        if re.match(r"^(https?:)?//", url) or url.startswith("#"):
+            return m.group(0)
+
+        # Skip non-markdown files
+        if not url.lower().endswith(".md"):
+            return m.group(0)
+
+        # Resolve the target path
+        try:
+            target_path = (src_dir / url).resolve()
+
+            # Check if target is outside examples_dir
+            try:
+                rel_path = target_path.relative_to(examples_dir)
+            except ValueError:
+                # Target is outside examples_dir, leave as-is
+                return m.group(0)
+
+            parts = list(rel_path.parts)
+
+            # Determine the output path for the target
+            if len(parts) > 0 and parts[-1].lower() in ("readme.md", "readme"):
+                # This is a README link
+                if len(parts) == 1:
+                    # Link to root README -> index.qmd
+                    target_output = "index.qmd"
+                elif len(parts) == 2:
+                    if parts[0] == ".":
+                        # Current directory README
+                        target_output = "index.qmd"
+                    else:
+                        # subdir/README.md
+                        parent_dir = parts[0]
+                        if parent_dir in parent_index_only:
+                            target_output = f"{parent_dir}/index.qmd"
+                        else:
+                            target_output = f"{parent_dir}.qmd"
+                else:
+                    # Deeper nesting: parent/subdir/README.md
+                    # Build the full path like "parent/subdir"
+                    full_path = "/".join(parts[:-1])  # Remove README.md
+                    # Check if this exact path is in allowlist
+                    if full_path in allowlist_entries:
+                        # This is a sub-entry with its own entry -> use .qmd
+                        target_output = f"{full_path}.qmd"
+                    elif parts[0] == ".":
+                        # ./subdir/README.md -> check if subdir has own entry
+                        subdir = parts[1]
+                        if subdir in parent_index_only:
+                            target_output = f"{subdir}/index.qmd"
+                        else:
+                            target_output = f"{subdir}.qmd"
+                    else:
+                        # parent/subdir where parent doesn't have own entry
+                        target_output = f"{full_path}/index.qmd"
+            else:
+                # Regular .md file -> convert to .qmd, keep path structure
+                target_output = "/".join(parts)[:-2] + "qmd"
+
+            # Compute relative path from current output file to target
+            current_parts = current_output_path.split("/")
+            target_parts = target_output.split("/")
+
+            # Special case: if current is a subdir file and target is a single-component file at root
+            # Example: current="magistral/vision", target="magistral.qmd"
+            if len(current_parts) > 1 and len(target_parts) == 1:
+                # Current is in subdir, target is at root level
+                # Go up to root: ../ for each level
+                up_count = len(current_parts) - 1
+                rel_parts = [".."] * up_count + [target_parts[0]]
+                new_url = "/".join(rel_parts)
+            else:
+                # Find common prefix
+                i = 0
+                while (
+                    i < min(len(current_parts) - 1, len(target_parts))
+                    and current_parts[i] == target_parts[i]
+                ):
+                    i += 1
+
+                # Build relative path: go up (../) then down to target
+                up_count = len(current_parts) - 1 - i
+                rel_parts = [".."] * up_count + target_parts[i:]
+
+                if not rel_parts or rel_parts == [".."]:
+                    # Points to same directory or parent
+                    new_url = "/".join(rel_parts) if rel_parts else "."
+                else:
+                    new_url = "/".join(rel_parts)
+
+            return f"[{text}]({new_url})"
+        except (ValueError, IndexError):
+            return m.group(0)
+
+    return LINK_RE.sub(repl, md)
+
+
+def write_qmd(out_path: Path, title: str, body_md: str):
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    fm = f"---\ntitle: {title!r}\nexecute:\n  eval: false\nformat:\n  html:\n    toc: true\n---\n\n"
+    out_path.write_text(fm + body_md, encoding="utf-8")
+
+
+def update_quarto_yml(generated: list[tuple[str, str, str]]):
+    """
+    Update _quarto.yml with the generated example files in the correct order.
+    This keeps the sidebar in sync with the allowlist.
+
+    Model Guides is now nested under "Getting Started" section.
+    Creates nested sections for models with sub-entries (e.g., magistral, ministral3).
+    Parent pages are now flat files (e.g., ministral3.qmd) with sub-pages in subdirs.
+    """
+    quarto_yml = ROOT / "_quarto.yml"
+    if not quarto_yml.exists():
+        print(f"[WARN] {quarto_yml} not found, skipping update", file=sys.stderr)
+        return
+
+    content = quarto_yml.read_text(encoding="utf-8")
+
+    # First pass: find all parents that have sub-entries
+    parents_with_subs = set()
+    for path, _name, _title in generated:
+        if "/" in path:
+            parent = path.split("/")[0]
+            parents_with_subs.add(parent)
+
+    # Build the YAML contents while preserving allowlist order
+    lines = []
+    processed_sections = set()
+
+    for path, _name, title in generated:
+        # Check if this is a parent page that has sub-pages
+        if path in parents_with_subs:
+            # This is a parent page with sub-pages - create a nested section
+            if path not in processed_sections:
+                processed_sections.add(path)
+                section_title = (
+                    title or path.replace("-", " ").replace("_", " ").title()
+                )
+                lines.append(f'                - section: "{section_title}"')
+                lines.append("                  contents:")
+                # Add the parent page first
+                lines.append(f"                    - docs/models/{path}.qmd")
+                # Then add all sub-pages
+                for sub_path, _sub_name, _sub_title in generated:
+                    if "/" in sub_path and sub_path.split("/")[0] == path:
+                        lines.append(
+                            f"                    - docs/models/{sub_path}.qmd"
+                        )
+        elif "/" not in path:
+            # This is a flat item with no sub-pages
+            # Skip if it was already included as part of a parent section
+            if path not in processed_sections:
+                lines.append(f"                - docs/models/{path}.qmd")
+
+    yaml_content = "\n".join(lines) + "\n"
+
+    # Pattern to match only the Model Guides contents, stopping at the next item
+    # in Getting Started (lines starting with 12 spaces: same level as the section)
+    pattern = r'(            - section: "Model Guides"\n              contents:)([^\n]*|.*?)(?=\n            - |\n            - section:|\n\nformat:)'
+
+    def replacement(match):
+        prefix = match.group(1)
+        return prefix + "\n" + yaml_content
+
+    new_content = re.sub(pattern, replacement, content, flags=re.DOTALL)
+
+    if new_content != content:
+        quarto_yml.write_text(new_content, encoding="utf-8")
+        print(f"Updated {quarto_yml}")
+    else:
+        print(f"No changes needed for {quarto_yml}")
+
+
+def main():
+    allow = read_allowlist()
+    if not EXAMPLES_DIR.exists():
+        print(f"[WARN] {EXAMPLES_DIR} not found", file=sys.stderr)
+        return
+
+    (OUTPUT_DIR / "assets").mkdir(parents=True, exist_ok=True)
+
+    # First pass: identify which parents have their own entry vs only sub-entries
+    parent_entries = set()  # Parents that have their own entry
+    parent_with_subs = set()  # Parents that have sub-entries
+    allowlist_entries = set()  # All entries in allowlist
+
+    for item in allow:
+        if isinstance(item, str):
+            name = item
+        else:
+            name = item.get("name")
+
+        allowlist_entries.add(name)
+
+        if "/" in name:
+            parent = name.split("/")[0]
+            parent_with_subs.add(parent)
+        else:
+            parent_entries.add(name)
+
+    # Parents with subs that DON'T have their own entry -> use index.qmd
+    parent_index_only = parent_with_subs - parent_entries
+
+    generated = []
+    seen_dirs = set()  # Track which parent directories we've created index for
+
+    for item in allow:
+        if isinstance(item, str):
+            name = item
+            title = None
+        else:
+            name = item.get("name")
+            title = item.get("title")
+
+        if not name:
+            print(f"[WARN] Skipping item without name: {item}", file=sys.stderr)
+            continue
+
+        src_dir = EXAMPLES_DIR / name
+        if not src_dir.exists() or not src_dir.is_dir():
+            print(f"[WARN] Skipping {name} (not a directory)", file=sys.stderr)
+            continue
+
+        readme = find_readme(src_dir)
+        if not readme:
+            print(f"[WARN] Skipping {name} (no README.md)", file=sys.stderr)
+            continue
+
+        md = readme.read_text(encoding="utf-8")
+
+        # Determine output path first (needed for link rewriting)
+        parts = name.split("/")
+        if len(parts) == 1:
+            # Simple case: no subdirectory
+            out_path = OUTPUT_DIR / f"{parts[0]}.qmd"
+            sidebar_path = parts[0]
+        else:
+            # Has subdirectory: e.g., magistral/think
+            parent = parts[0]
+            child = "-".join(parts[1:])  # handle nested subdirs
+            out_path = OUTPUT_DIR / parent / f"{child}.qmd"
+            sidebar_path = f"{parent}/{child}"
+
+        # Remove the first H1 (we use frontmatter title instead)
+        md, _ = remove_first_h1(md)
+        # Rewrite links between README files
+        md = rewrite_readme_links(
+            md,
+            src_dir,
+            EXAMPLES_DIR,
+            parent_index_only,
+            name,
+            allowlist_entries,
+            sidebar_path,
+        )
+        md = rewrite_and_copy_assets(md, src_dir, OUTPUT_DIR)
+
+        # Handle parent page generation for sub-entries
+        if len(parts) > 1:
+            # Has subdirectory: e.g., magistral/think
+            parent = parts[0]
+
+            # Create parent.qmd if not already done and parent doesn't have own entry
+            if parent not in seen_dirs and parent in parent_index_only:
+                parent_readme = find_readme(EXAMPLES_DIR / parent)
+                if parent_readme:
+                    parent_md = parent_readme.read_text(encoding="utf-8")
+                    parent_md, _ = remove_first_h1(parent_md)
+                    parent_md = rewrite_readme_links(
+                        parent_md,
+                        EXAMPLES_DIR / parent,
+                        EXAMPLES_DIR,
+                        parent_index_only,
+                        parent,
+                        allowlist_entries,
+                        parent,
+                    )
+                    parent_md = rewrite_and_copy_assets(
+                        parent_md, EXAMPLES_DIR / parent, OUTPUT_DIR
+                    )
+                    parent_title = parent.replace("-", " ").replace("_", " ").title()
+                    write_qmd(OUTPUT_DIR / f"{parent}.qmd", parent_title, parent_md)
+                    generated.append((parent, parent, parent_title))
+                seen_dirs.add(parent)
+
+        if not title:
+            title = name.replace("/", " ").replace("-", " ").title()
+
+        write_qmd(out_path, title, md)
+        generated.append((sidebar_path, name, title))
+
+    # Index page - preserve allowlist order
+    if generated:
+        listing = "\n".join(
+            [f"- [{title}]({path}.qmd)" for path, name, title in generated]
+        )
+        index_md = (
+            "# Model Guides\n\nBelow are the curated examples for training various model architectures:\n\n"
+            + listing
+            + "\n"
+        )
+        index_fm = (
+            "---\nexecute:\n  eval: false\nformat:\n  html:\n    toc: true\n---\n\n"
+        )
+        (OUTPUT_DIR / "index.qmd").write_text(index_fm + index_md, encoding="utf-8")
+
+    # Auto-update _quarto.yml to keep sidebar in sync
+    update_quarto_yml(generated)
+
+
+if __name__ == "__main__":
+    main()
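The generator above is wired in as a Quarto pre-render step via the `_quarto.yml` hunk earlier in this diff, and it can also be run directly from the repo root:

```bash
python docs/scripts/generate_examples_docs.py
```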
@@ -15,7 +15,7 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
+pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation -e '.[flash-attn]'

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
@@ -17,7 +17,7 @@ Thanks to the team at Arcee.ai for using Axolotl in supervised fine-tuning the A
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
+pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation -e '.[flash-attn]'

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
@@ -40,7 +40,7 @@
 "%%capture\n",
 "# This step can take ~5-10 minutes to install dependencies\n",
 "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
-"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@f643b88\""
+"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0d4ce4b\""
 ]
 },
 {
@@ -16,7 +16,7 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r

 ```bash
 # Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
+pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
 ```

@@ -52,6 +52,7 @@ gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
+scaling_softmax: true

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
examples/eaft/eaft-example.yml (new file, 77 lines)
@@ -0,0 +1,77 @@
base_model: google/gemma-3-1b-it

model_type: Gemma3ForCausalLM
cls_model_config: Gemma3TextConfig

# gemma3 doesn't seem to play nice with ddp
ddp_find_unused_parameters: true

chat_template: gemma3
eot_tokens:
  - <end_of_turn>

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path:
val_set_size: 0
output_dir: ./outputs/eaft-gemma-3-1b

use_eaft: true
eaft_alpha: 1.0
eaft_k: 20

sequence_len: 1024
sample_packing: false

adapter:
lora_model_dir:

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
eval_batch_size: 1
max_steps: 1000
evaluation_strategy: "no"
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 5e-5

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false

early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_ratio: 0.1
weight_decay: 0.0
debug:
deepspeed:
fsdp:
fsdp_config:
special_tokens:
@@ -1,6 +1,7 @@
 base_model: google/gemma-3-1b-it

 model_type: Gemma3ForCausalLM
+cls_model_config: Gemma3TextConfig

 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
@@ -29,7 +30,7 @@ output_dir: ./outputs/out
 adapter: qlora
 lora_r: 32
 lora_alpha: 16
-lora_dropout: 0.05
+lora_dropout: 0
 lora_target_linear: true

 sequence_len: 2048
@@ -1,6 +1,7 @@
 base_model: google/gemma-3-270m-it

 model_type: Gemma3ForCausalLM
+cls_model_config: Gemma3TextConfig

 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
@@ -29,7 +30,7 @@ output_dir: ./outputs/out
 adapter: qlora
 lora_r: 32
 lora_alpha: 16
-lora_dropout: 0.05
+lora_dropout: 0
 lora_target_linear: true

 sequence_len: 2048
@@ -2,6 +2,7 @@ base_model: google/gemma-3-4b-it

 # Need to set else transformers tries to load vision too
 model_type: Gemma3ForCausalLM
+cls_model_config: Gemma3TextConfig

 load_in_4bit: true

@@ -32,8 +33,8 @@ sample_packing: true

 lora_r: 32
 lora_alpha: 16
-lora_dropout: 0.05
+lora_dropout: 0
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_linear: true

 wandb_project:
 wandb_entity:
@@ -31,7 +31,7 @@ pad_to_sequence_len: false

 lora_r: 32
 lora_alpha: 16
-lora_dropout: 0.05
+lora_dropout: 0
 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
@@ -10,7 +10,7 @@ Gemma-3n is a family of multimodal models from Google found on [HuggingFace](htt

 ```bash
 # Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
+pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
 ```

examples/glm46v/README.md (new file, 44 lines)
@@ -0,0 +1,44 @@
# Finetune GLM-4.6V with Axolotl

GLM-4.6V is a family of vision-language models from ZhipuAI found on [HuggingFace](https://huggingface.co/zai-org/GLM-4.6V). This guide shows how to fine-tune it with Axolotl for vision-language tasks.

## Getting started

1. Install Axolotl from source following the [installation guide](https://docs.axolotl.ai/docs/installation.html#sec-edge-build).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

3. Run the fine-tuning for GLM-4.6V-Flash (9B):

```bash
axolotl train examples/glm46v/glm-4-6v-flash-qlora.yaml
```

Let us know how it goes. Happy finetuning! 🚀

## Tips

- Vision datasets should follow the format described in the [multimodal docs](https://docs.axolotl.ai/docs/multimodal.html#dataset-format); a sketch of one such record follows this README.
- You can run a **full finetuning** by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset in the [dataset loading docs](https://docs.axolotl.ai/docs/dataset_loading.html).

## Supported Models

- **GLM-4.6V**: Full vision-language model (`zai-org/GLM-4.6V`)
- **GLM-4.6V-Flash**: Faster variant (`zai-org/GLM-4.6V-Flash`)

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources

- [ZhipuAI GLM-4.6V](https://huggingface.co/zai-org/GLM-4.6V)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
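For orientation, here is a hypothetical single training record in the multimodal chat format the docs describe. Field names follow the `HuggingFaceH4/llava-instruct-mix-vsft` style used in the configs below; treat the multimodal docs as the authoritative schema:

```python
# Hypothetical example record for the vision chat format referenced above.
# Field names follow the llava-instruct-mix-vsft style; see the Axolotl
# multimodal docs for the authoritative schema.
sample = {
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": "path/or/url/to/image.png"},
                {"type": "text", "text": "What is shown in this image?"},
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": "An axolotl resting on a rock."}],
        },
    ]
}
```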
examples/glm46v/glm-4-6v-flash-ddp.yaml (new file, 53 lines)
@@ -0,0 +1,53 @@
base_model: zai-org/GLM-4.6V-Flash
trust_remote_code: true

processor_type: AutoProcessor
load_in_4bit: true

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false
ddp_find_unused_parameters: true

output_dir: ./outputs/glm-4-6v-flash-qlora
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]

adapter: qlora
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

sequence_len: 2048

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
logging_steps: 1
sdp_attention: true

warmup_ratio: 0.1
evals_per_epoch: 0
saves_per_epoch: 1
weight_decay: 0.0
examples/glm46v/glm-4-6v-flash-qlora.yaml (new file, 50 lines)
@@ -0,0 +1,50 @@
base_model: zai-org/GLM-4.6V-Flash
trust_remote_code: true

processor_type: AutoProcessor
load_in_4bit: true

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

output_dir: ./outputs/glm-4-6v-flash-qlora
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]

adapter: qlora
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

sequence_len: 2048

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
logging_steps: 1
sdp_attention: true

warmup_ratio: 0.1
evals_per_epoch: 0
saves_per_epoch: 1
weight_decay: 0.0
@@ -14,7 +14,7 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations

 ```bash
 # Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
+pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
 ```

@@ -15,7 +15,7 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
+pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation -e '.[flash-attn]'

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
@@ -13,7 +13,7 @@ Tencent released a family of opensource models called HunYuan with varying param
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
+pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation -e '.[flash-attn]'

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
examples/internvl3_5/README.md (new file, 43 lines)
@@ -0,0 +1,43 @@
# Finetune OpenGVLab's InternVL with Axolotl

[InternVL 3.5](https://huggingface.co/OpenGVLab/InternVL3_5-8B-HF) is a family of powerful vision-language models from OpenGVLab supporting dynamic resolution and multi-image understanding. It features a ViT-style vision encoder and a strong language model backbone for tasks like visual question answering, OCR, and scene text understanding.

This guide shows how to fine-tune it with Axolotl.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Install `timm` for vision model support:

```bash
pip install timm==1.0.19
```

3. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

4. Run the finetuning example:

```bash
axolotl train examples/internvl3_5/internvl3_5-8b-qlora.yml
```

This config uses about 8.21 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀

### Tips

- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset in the [dataset loading docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the multi-modal format as seen [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources

- [InternVL Paper](https://huggingface.co/papers/2508.18265)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
examples/internvl3_5/internvl3_5-8b-qlora.yml (new file, 61 lines)
@@ -0,0 +1,61 @@
base_model: OpenGVLab/InternVL3_5-8B-HF
processor_type: AutoProcessor

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_4bit: true

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]
    field_messages: messages

dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out

adapter: qlora
lora_model_dir:

sequence_len: 2048

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
logging_steps: 1
flash_attention: true
eager_attention:

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
@@ -19,7 +19,6 @@ datasets:
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.0
 output_dir: jamba-large-fsdp-qlora-ft
-save_safetensors: true
 adapter: qlora
 sequence_len: 2048
 sample_packing: true
examples/kimi-linear/README.md (new file, 47 lines)
@@ -0,0 +1,47 @@
# Finetune MoonshotAI's Kimi Linear with Axolotl

[Kimi Linear](https://huggingface.co/collections/moonshotai/kimi-linear-a3b) is a MoE model (48B total, 3B active) by MoonshotAI using a hybrid linear attention architecture to achieve a 1M token context length. It uses Kimi Delta Attention (KDA), a refined version of Gated DeltaNet that reduces KV cache size by up to 75% and boosts decoding throughput by up to 6x for long contexts.

This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

**Note:** Axolotl uses experimental training code for Kimi Linear, as the original modeling code is inference-only.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Install CCE via the [docs](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy).

3. Run the finetuning example:

```bash
axolotl train examples/kimi-linear/kimi-48b-lora.yaml
```

This config uses about 98.7 GiB VRAM.

Let us know how it goes. Happy finetuning!

### TIPS

- Kimi Linear requires `trust_remote_code: true`.
- You can run a full finetuning by removing the `adapter: lora` and `load_in_8bit: true`.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html)
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template)

## Optimization Guides

See 👉 [docs](https://docs.axolotl.ai/docs/optimizations.html).

## Limitations

This is not yet compatible with MoE kernels from transformers v5.

## Related Resources

- [Kimi Linear Paper](https://huggingface.co/papers/2510.26692)
- [Kimi Linear GitHub](https://github.com/MoonshotAI/Kimi-Linear)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
examples/kimi-linear/kimi-48b-lora.yaml (new file, 81 lines)
@@ -0,0 +1,81 @@
base_model: moonshotai/Kimi-Linear-48B-A3B-Instruct

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: true
load_in_4bit: false
strict: false

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
    split: train

dataset_prepared_path: last_run_prepared
val_set_size: 0.2
output_dir: ./outputs/lora-out

adapter: lora
lora_model_dir:

sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true

lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_fan_in_fan_out:
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_ratio: 0.1
evals_per_epoch: 2
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
examples/llama-3/qlora-1b-gdpo.yaml (new file, 68 lines)
@@ -0,0 +1,68 @@
base_model: meta-llama/Llama-3.2-1B-Instruct

chat_template: llama3

rl: gdpo

trl:
  beta: 0.001
  max_completion_length: 128
  num_generations: 2
  temperature: 0.7
  top_p: 0.95

  use_vllm: false

  multi_objective_aggregation: normalize_then_sum

  reward_funcs:
    - rwd.format_reward
    - rwd.correctness_reward
  reward_weights: [1.0, 2.0]

  log_completions: true
  num_completions_to_print: 3
  scale_rewards: true

datasets:
  - path: openai/gsm8k
    name: main
    split: train[:1000]
    type: rwd.gsm8k_transform

val_set_size: 0.0
output_dir: ./outputs/llama3-gdpo-out

sequence_len: 512
sample_packing: false
pad_to_sequence_len: false

gradient_accumulation_steps: 8
micro_batch_size: 1
num_epochs: 1
max_steps: 100

optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 5e-5
weight_decay: 0.01
warmup_steps: 10

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false

flash_attention: true
logging_steps: 1
save_steps: 50
save_safetensors: true

special_tokens:
  pad_token: "<|end_of_text|>"


seed: 42
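The `rwd.format_reward` / `rwd.correctness_reward` entries above resolve to user-supplied reward functions. Below is a hypothetical sketch of what such an `rwd.py` module could look like for GSM8K, assuming TRL-style reward functions that take plain-text completions plus dataset columns and return one score per completion; the actual module this config expects may differ:

```python
# Hypothetical sketch of an `rwd` module for the config above; the real
# helpers may differ. Assumes TRL-style reward functions: plain-text
# completions in, one float per completion out.
import re

ANSWER_RE = re.compile(r"####\s*(-?\d+(?:\.\d+)?)")


def format_reward(completions, **kwargs):
    """Reward completions that contain a final '#### <number>' answer line."""
    return [1.0 if ANSWER_RE.search(c) else 0.0 for c in completions]


def correctness_reward(completions, answer=None, **kwargs):
    """Compare the extracted numeric answer to the gold `answer` column
    (assumed to be provided by the gsm8k dataset transform)."""
    gold = answer if answer is not None else [None] * len(completions)

    def extract(text):
        match = ANSWER_RE.search(text)
        return match.group(1) if match else None

    return [
        1.0 if g is not None and extract(c) == str(g) else 0.0
        for c, g in zip(completions, gold)
    ]
```

With `reward_weights: [1.0, 2.0]`, the correctness signal counts double before the `normalize_then_sum` aggregation combines the two objectives.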
@@ -12,7 +12,6 @@ datasets:
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.0
 output_dir: ./outputs/out/qlora-llama3_1-405b
-save_safetensors: true

 adapter: qlora

@@ -14,7 +14,7 @@ Thanks to the team at MistralAI for giving us early access to prepare for these

 ```bash
 # Ensure you have Pytorch installed (Pytorch 2.7.0 min)
-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
+pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
 ```

@@ -5,6 +5,7 @@ This guide covers fine-tuning [Magistral Small 2507](https://huggingface.co/mist
 ## Prerequisites

 Before starting, ensure you have:
+
 - Installed Axolotl (see [main README](../README.md))

 ## Getting Started
@@ -5,7 +5,8 @@ This guide covers fine-tuning [Magistral Small 2509](https://huggingface.co/mist
 ## Prerequisites

 Before starting, ensure you have:
-- Installed Axolotl from source (see [main README](../README.md#getting-started))
+
+- Installed Axolotl from source (see [main README](../README.md))

 ## Getting started

@@ -47,6 +47,5 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 tokens:
-save_safetensors: False

 # save_first_step: true # uncomment this to validate checkpoint saving works with your config
examples/mimo/README.md (new file, 39 lines)
@@ -0,0 +1,39 @@
# Finetune Xiaomi's MiMo with Axolotl

[MiMo](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL) is a family of models trained from scratch for reasoning tasks, incorporating **Multiple-Token Prediction (MTP)** as an additional training objective for enhanced performance and faster inference. It is pre-trained on ~25T tokens with a three-stage data mixture strategy and optimized reasoning-pattern density.

This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Run the finetuning example:

```bash
axolotl train examples/mimo/mimo-7b-qlora.yaml
```

This config uses about 17.2 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀

### Tips

- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Limitations

**Cut Cross Entropy (CCE)**: Currently not supported. We plan to include CCE support for MiMo in the near future.

## Related Resources

- [MiMo Paper](https://arxiv.org/abs/2505.07608)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
examples/mimo/mimo-7b-qlora.yaml (new file, 67 lines)
@@ -0,0 +1,67 @@
base_model: XiaomiMiMo/MiMo-7B-RL
trust_remote_code: true
revision_of_model: 6299b5a

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

# CCE - N/A as of now
# plugins:
#   - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
@@ -59,6 +59,7 @@ gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
+scaling_softmax: true

 warmup_ratio: 0.1
 evals_per_epoch: 1
@@ -5,6 +5,7 @@ This guide covers fine-tuning [Ministral3 2512](https://huggingface.co/collectio
 ## Prerequisites

 Before starting, ensure you have:
+
 - Installed Axolotl (see [main README](../README.md))

 ## Getting Started
@@ -5,7 +5,8 @@ This guide covers fine-tuning [Ministral3 2512](https://huggingface.co/collectio
 ## Prerequisites

 Before starting, ensure you have:
-- Installed Axolotl from source (see [main README](../README.md#getting-started))
+
+- Installed Axolotl from source (see [main README](../README.md))

 ## Getting started

@@ -5,6 +5,7 @@ This guide covers fine-tuning [Mistral Small 3.1](mistralai/Mistral-Small-3.1-24
 ## Prerequisites

 Before starting, ensure you have:
+
 - Installed Axolotl (see [Installation docs](https://docs.axolotl.ai/docs/installation.html))

 ## Getting Started
@@ -16,7 +16,7 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
 axolotl train examples/olmo3/olmo3-7b-qlora.yaml
 ```

-Let us know how it goes. Happy finetuning! 🚀
+This uses about 11.3 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀

 ### TIPS

@@ -42,10 +42,10 @@ wandb_watch:
 wandb_name:
 wandb_log_model:

-gradient_accumulation_steps: 4
+gradient_accumulation_steps: 2
 micro_batch_size: 2
 num_epochs: 1
-optimizer: adamw_bnb_8bit
+optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

examples/plano/README.md (new file, 42 lines)
@@ -0,0 +1,42 @@
# Finetune Katanemo's Plano-Orchestrator with Axolotl

[Plano-Orchestrator](https://huggingface.co/collections/katanemo/plano-orchestrator) is a family of 4B and 30B-A3B routing and orchestration models designed for multi-agent systems. It analyzes user intent and conversation context to make precise routing decisions, excelling at multi-turn context understanding, multi-intent detection, and context-dependent routing.

This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

3. Run the finetuning example:

```bash
axolotl train examples/plano/plano-4b-qlora.yaml
```

This config uses about 5.1 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀

### Orchestration Prompt

Plano-Orchestrator uses a specific orchestration prompt format for routing/agent decisions. Please check the [official model card](https://huggingface.co/katanemo/Plano-Orchestrator-4B) for proper prompt formatting and the `ORCHESTRATION_PROMPT` template.

### Tips

- To use the larger [Plano-Orchestrator-30B-A3B](https://huggingface.co/katanemo/Plano-Orchestrator-30B-A3B) MoE model, simply change `base_model: katanemo/Plano-Orchestrator-30B-A3B` in the config and enable multi-GPU training if needed.
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources

- [Plano GitHub](https://github.com/katanemo/plano)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
examples/plano/plano-4b-qlora.yaml (new file, 65 lines)
@@ -0,0 +1,65 @@
base_model: katanemo/Plano-Orchestrator-4B

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

chat_template: qwen3
datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
@@ -15,7 +15,7 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
+pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation -e '.[flash-attn]'

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
examples/swanlab/README.md (new file, 285 lines)
@@ -0,0 +1,285 @@
# SwanLab Integration Examples

This directory contains example configurations demonstrating SwanLab integration with Axolotl.

## Examples Overview

### 1. DPO with Completion Logging
**File**: `dpo-swanlab-completions.yml`

Demonstrates DPO (Direct Preference Optimization) training with RLHF completion table logging.

**Features**:
- Basic SwanLab experiment tracking
- Completion table logging (prompts, chosen/rejected responses, rewards)
- Memory-bounded buffer for long training runs
- Cloud sync configuration

**Best for**: RLHF practitioners who want to analyze model outputs qualitatively

**Quick start**:
```bash
export SWANLAB_API_KEY=your-api-key
accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-completions.yml
```

---

### 2. LoRA with Performance Profiling
**File**: `lora-swanlab-profiling.yml`

Demonstrates standard LoRA fine-tuning with performance profiling enabled.

**Features**:
- SwanLab experiment tracking
- Automatic profiling of trainer methods
- Profiling metrics visualization
- Performance optimization guidance

**Best for**: Engineers optimizing training performance and comparing different configurations

**Quick start**:
```bash
export SWANLAB_API_KEY=your-api-key
accelerate launch -m axolotl.cli.train examples/swanlab/lora-swanlab-profiling.yml
```

---

### 3. Full-Featured DPO Production Setup
**File**: `dpo-swanlab-full-featured.yml`

Comprehensive production-ready configuration with ALL SwanLab features enabled.

**Features**:
- Experiment tracking with team workspace
- RLHF completion logging
- Performance profiling
- Lark (Feishu) team notifications
- Private deployment support
- Production checklist and troubleshooting

**Best for**: Production RLHF training with team collaboration

**Quick start**:
```bash
export SWANLAB_API_KEY=your-api-key
export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/...
export SWANLAB_LARK_SECRET=your-webhook-secret
accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-full-featured.yml
```

---

### 4. Custom Trainer Profiling (Python)
**File**: `custom_trainer_profiling.py`

Python code examples showing how to add SwanLab profiling to custom trainers.

**Features**:
- `@swanlab_profile` decorator examples
- Context manager profiling for fine-grained timing
- `ProfilingConfig` for advanced filtering and throttling
- Multiple profiling patterns and best practices

**Best for**: Advanced users creating custom trainers

**Usage**:
```python
from custom_trainer_profiling import CustomTrainerWithProfiling
# See file for detailed examples and patterns
```

---
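To make the profiling mechanics concrete before diving into the file, here is a simplified illustration of what a `@swanlab_profile`-style decorator does: time the wrapped method and log the duration under the `profiling/` namespace. This is not the shipped implementation (that lives in `src/axolotl/integrations/swanlab/profiling.py`), just a sketch of the idea:

```python
# Simplified illustration of a @swanlab_profile-style decorator; the real
# implementation lives in src/axolotl/integrations/swanlab/profiling.py.
import functools
import time


def swanlab_profile(func):
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        start = time.perf_counter()
        try:
            return func(self, *args, **kwargs)  # exception-safe: finally still logs
        finally:
            elapsed_ms = (time.perf_counter() - start) * 1000.0
            try:
                import swanlab

                swanlab.log(
                    {f"profiling/Time taken: {type(self).__name__}.{func.__name__}": elapsed_ms}
                )
            except Exception:
                pass  # never let metric logging break training

    return wrapper
```

---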
## Feature Matrix

| Example | Tracking | Completion Logging | Profiling | Lark Notifications | Team Workspace |
|---------|----------|--------------------|-----------|--------------------|----------------|
| dpo-swanlab-completions.yml | ✅ | ✅ | ✅ (auto) | ➖ (commented) | ➖ (commented) |
| lora-swanlab-profiling.yml | ✅ | ➖ (disabled) | ✅ (auto) | ➖ (commented) | ➖ (commented) |
| dpo-swanlab-full-featured.yml | ✅ | ✅ | ✅ (auto) | ✅ | ✅ |
| custom_trainer_profiling.py | N/A | N/A | ✅ (manual) | N/A | N/A |

---

## Configuration Quick Reference

### Basic SwanLab Setup
```yaml
plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

use_swanlab: true
swanlab_project: my-project
swanlab_experiment_name: my-experiment
swanlab_mode: cloud  # cloud, local, offline, disabled
```

### RLHF Completion Logging
```yaml
swanlab_log_completions: true
swanlab_completion_log_interval: 100  # Log every 100 steps
swanlab_completion_max_buffer: 128  # Memory-bounded buffer
```
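The "memory-bounded buffer" above simply caps how many completion rows are held in memory between table flushes. A minimal illustration of the idea (not the plugin's actual code):

```python
# Illustration only: a memory-bounded buffer drops the oldest completion
# once maxlen is reached, so memory stays flat over long runs. Not the
# plugin's actual implementation.
from collections import deque

completion_buffer = deque(maxlen=128)  # mirrors swanlab_completion_max_buffer

for step in range(1000):
    row = {"step": step, "prompt": "...", "chosen": "...", "rejected": "..."}
    completion_buffer.append(row)
    # every `swanlab_completion_log_interval` steps, flush the buffer to a table
```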
### Lark Team Notifications
```yaml
swanlab_lark_webhook_url: https://open.feishu.cn/...
swanlab_lark_secret: your-webhook-secret  # Required for production
```

### Team Workspace
```yaml
swanlab_workspace: my-research-team
```

### Private Deployment
```yaml
swanlab_web_host: https://swanlab.yourcompany.com
swanlab_api_host: https://api.swanlab.yourcompany.com
```

---

## Authentication

### Recommended: Environment Variable
```bash
export SWANLAB_API_KEY=your-api-key
export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/...
export SWANLAB_LARK_SECRET=your-webhook-secret
```

### Alternative: Config File (less secure)
```yaml
swanlab_api_key: your-api-key
swanlab_lark_webhook_url: https://open.feishu.cn/...
swanlab_lark_secret: your-webhook-secret
```

---

## Common Use Cases

### Use Case 1: Migrate from WandB to SwanLab
Start with `lora-swanlab-profiling.yml`, add your model/dataset config, disable WandB:
```yaml
use_swanlab: true
use_wandb: false
```

### Use Case 2: Analyze DPO Model Outputs
Use `dpo-swanlab-completions.yml`, adjust the completion logging interval based on your training length:
```yaml
swanlab_completion_log_interval: 50    # More frequent for short training
# swanlab_completion_log_interval: 200  # Less frequent for long training
```

### Use Case 3: Optimize Training Performance
Use `lora-swanlab-profiling.yml`, run multiple experiments with different optimizations:
- Baseline: `flash_attention: false, gradient_checkpointing: false`
- Flash Attention: `flash_attention: true`
- Gradient Checkpointing: `gradient_checkpointing: true`
- Both: `flash_attention: true, gradient_checkpointing: true`

Compare profiling metrics in the SwanLab dashboard.

### Use Case 4: Production RLHF with Team Collaboration
Use `dpo-swanlab-full-featured.yml`, set up a team workspace and Lark notifications:
```yaml
swanlab_workspace: ml-team
swanlab_lark_webhook_url: ...
swanlab_lark_secret: ...
```

---

## Viewing Your Experiments

### Cloud Mode
Visit [https://swanlab.cn](https://swanlab.cn) and navigate to your project.

**Dashboard sections**:
- **Metrics**: Training loss, learning rate, profiling metrics
- **Tables**: RLHF completions (for DPO/KTO/ORPO/GRPO)
- **Config**: Hyperparameters and configuration
- **System**: Resource usage (GPU, memory, CPU)
- **Files**: Logged artifacts

### Local Mode
```bash
swanlab watch ./swanlog
# Open browser to http://localhost:5092
```

---

## Troubleshooting

### SwanLab not initializing
```bash
# Check API key
echo $SWANLAB_API_KEY

# Verify SwanLab is installed
pip show swanlab

# Check config
grep -A 5 "use_swanlab" your-config.yml
```

### Completions not appearing
- Verify you're using an RLHF trainer (DPO/KTO/ORPO/GRPO)
- Check `swanlab_log_completions: true`
- Wait for `swanlab_completion_log_interval` steps
- Look for "Registered SwanLab RLHF completion logging" in logs

### Lark notifications not working
- Test the webhook manually: `curl -X POST "$SWANLAB_LARK_WEBHOOK_URL" ...`
- Verify `SWANLAB_LARK_SECRET` is set correctly
- Check the bot is added to the Lark group chat
- Look for "Registered Lark notification callback" in logs

### Profiling metrics not appearing
- Verify `use_swanlab: true`
- Check SwanLab is initialized (look for the init log message)
- Profiling metrics are under the "profiling/" namespace
- Profiling is auto-enabled when SwanLab is enabled

---

## Performance Notes

### Overhead Comparison

| Feature | Overhead per Step | Memory Usage |
|---------|-------------------|--------------|
| Basic tracking | < 0.1% | ~10 MB |
| Completion logging | < 0.5% | ~64 KB (buffer=128) |
| Profiling | < 0.1% | ~1 KB |
| **Total** | **< 0.7%** | **~10 MB** |

### Best Practices
1. Use ONE logging tool in production (disable WandB/MLflow when using SwanLab)
2. Adjust the completion log interval based on training length (100-200 steps)
3. Keep the completion buffer size reasonable (128-512)
4. Profile critical path methods first (training_step, compute_loss)
5. Use ProfilingConfig to throttle high-frequency operations

---

## Further Reading

- **Full Documentation**: [src/axolotl/integrations/swanlab/README.md](../../src/axolotl/integrations/swanlab/README.md)
- **SwanLab Docs**: [https://docs.swanlab.cn](https://docs.swanlab.cn)
- **Axolotl Docs**: [https://axolotl-ai-cloud.github.io/axolotl/](https://axolotl-ai-cloud.github.io/axolotl/)
- **DPO Paper**: [Direct Preference Optimization](https://arxiv.org/abs/2305.18290)

---

## Contributing

Found an issue or have an improvement? Please submit a PR or open an issue:
- [Axolotl Issues](https://github.com/axolotl-ai-cloud/axolotl/issues)
- [SwanLab Issues](https://github.com/SwanHubX/SwanLab/issues)
examples/swanlab/custom_trainer_profiling.py (new file, 299 lines)
@@ -0,0 +1,299 @@
"""Example: Custom Trainer with SwanLab Profiling

This example demonstrates how to add SwanLab profiling to your custom trainer.

Features:
- @swanlab_profile decorator for automatic profiling
- swanlab_profiling_context for fine-grained profiling
- ProfilingConfig for advanced filtering and throttling

Usage:
1. Create your custom trainer extending AxolotlTrainer
2. Add @swanlab_profile decorators to methods you want to profile
3. Use swanlab_profiling_context for fine-grained profiling within methods
4. Enable SwanLab in your config (use_swanlab: true)

See also:
- examples/swanlab/lora-swanlab-profiling.yml for config
- src/axolotl/integrations/swanlab/profiling.py for implementation
"""

from axolotl.core.trainers.base import AxolotlTrainer
from axolotl.integrations.swanlab.profiling import (
    ProfilingConfig,
    swanlab_profile,
    swanlab_profiling_context,
    swanlab_profiling_context_advanced,
)


class CustomTrainerWithProfiling(AxolotlTrainer):
    """Custom trainer with SwanLab profiling enabled.

    This trainer demonstrates three profiling patterns:
    1. Decorator-based profiling (@swanlab_profile)
    2. Context manager profiling (swanlab_profiling_context)
    3. Advanced profiling with filtering (ProfilingConfig)
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Create custom profiling config for high-frequency operations
        self.fast_op_config = ProfilingConfig(
            enabled=True,
            min_duration_ms=0.5,  # Only log if duration > 0.5ms
            log_interval=50,  # Log every 50th call
        )

    # ========================================================================
    # Pattern 1: Decorator-based Profiling
    # ========================================================================
    # Best for: Methods you always want to profile
    # Overhead: ~2-5 microseconds per call (negligible)

    @swanlab_profile
    def training_step(self, model, inputs):
        """Main training step - always profile.

        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.training_step
        """
        return super().training_step(model, inputs)

    @swanlab_profile
    def compute_loss(self, model, inputs, return_outputs=False):
        """Loss computation - always profile.

        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.compute_loss
        """
        return super().compute_loss(model, inputs, return_outputs)

    @swanlab_profile
    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """Prediction step - always profile.

        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.prediction_step
        """
        return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys)

    # ========================================================================
    # Pattern 2: Fine-grained Context Manager Profiling
    # ========================================================================
    # Best for: Profiling specific code blocks within a method
    # Use case: When you want to profile forward vs backward separately

    def complex_training_step(self, model, inputs):
        """Training step with fine-grained profiling.

        Profiling metrics:
        - profiling/Time taken: CustomTrainerWithProfiling.forward_pass
        - profiling/Time taken: CustomTrainerWithProfiling.backward_pass
        - profiling/Time taken: CustomTrainerWithProfiling.optimizer_step
        """
        # Profile just the forward pass
        with swanlab_profiling_context(self, "forward_pass"):
            outputs = model(**inputs)
            loss = outputs.loss

        # Profile just the backward pass
        with swanlab_profiling_context(self, "backward_pass"):
            loss.backward()

        # Profile optimizer step
        with swanlab_profiling_context(self, "optimizer_step"):
            self.optimizer.step()
            self.optimizer.zero_grad()

        return outputs

    # ========================================================================
    # Pattern 3: Advanced Profiling with Filtering
    # ========================================================================
    # Best for: High-frequency operations where you want to throttle logging
    # Use case: Methods called 100+ times per step

    def _prepare_inputs(self, inputs):
        """Prepare inputs - throttled profiling.

        This method is called frequently (once per batch), so we throttle
        profiling to reduce overhead:
        - Only log if duration > 0.5ms (skip very fast operations)
        - Only log every 50th call (reduce logging frequency)

        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.prepare_inputs
        """
        with swanlab_profiling_context_advanced(
            self, "prepare_inputs", config=self.fast_op_config
        ):
            return super()._prepare_inputs(inputs)

    def _prepare_input_for_model(self, input_ids):
        """Another high-frequency operation - throttled profiling.

        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.prepare_input_for_model
        """
        with swanlab_profiling_context_advanced(
            self, "prepare_input_for_model", config=self.fast_op_config
        ):
            # Your custom input preparation logic
            return input_ids

    # ========================================================================
    # Pattern 4: Exception-safe Profiling
    # ========================================================================
    # Profiling is exception-safe: duration is logged even if method raises

    @swanlab_profile
    def potentially_failing_method(self):
        """This method may raise an exception.

        SwanLab profiling will still log the duration before re-raising.
        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.potentially_failing_method
        """
        # Do some work
        result = self._do_risky_computation()

        # If this raises, profiling duration is still logged
        if result < 0:
            raise ValueError("Invalid result")

        return result
|
||||||
|
|
||||||
|
def _do_risky_computation(self):
|
||||||
|
"""Placeholder for risky computation."""
|
||||||
|
return 42
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Advanced Example: Custom ProfilingConfig Per Method
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class AdvancedProfilingTrainer(AxolotlTrainer):
|
||||||
|
"""Trainer with method-specific profiling configurations."""
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
# Different profiling configs for different method types
|
||||||
|
self.critical_path_config = ProfilingConfig(
|
||||||
|
enabled=True,
|
||||||
|
min_duration_ms=0.0, # Log everything on critical path
|
||||||
|
log_interval=1, # Log every call
|
||||||
|
)
|
||||||
|
|
||||||
|
self.fast_path_config = ProfilingConfig(
|
||||||
|
enabled=True,
|
||||||
|
min_duration_ms=1.0, # Only log if > 1ms
|
||||||
|
log_interval=100, # Log every 100th call
|
||||||
|
)
|
||||||
|
|
||||||
|
self.debug_config = ProfilingConfig(
|
||||||
|
enabled=True,
|
||||||
|
min_duration_ms=0.0, # Log everything
|
||||||
|
log_interval=1, # Log every call
|
||||||
|
)
|
||||||
|
|
||||||
|
def training_step(self, model, inputs):
|
||||||
|
"""Critical path - log everything."""
|
||||||
|
with swanlab_profiling_context_advanced(
|
||||||
|
self, "training_step", config=self.critical_path_config
|
||||||
|
):
|
||||||
|
return super().training_step(model, inputs)
|
||||||
|
|
||||||
|
def _prepare_inputs(self, inputs):
|
||||||
|
"""Fast path - throttle logging."""
|
||||||
|
with swanlab_profiling_context_advanced(
|
||||||
|
self, "prepare_inputs", config=self.fast_path_config
|
||||||
|
):
|
||||||
|
return super()._prepare_inputs(inputs)
|
||||||
|
|
||||||
|
def _debug_method(self, data):
|
||||||
|
"""Debug-only method - verbose logging."""
|
||||||
|
with swanlab_profiling_context_advanced(
|
||||||
|
self, "debug_method", config=self.debug_config
|
||||||
|
):
|
||||||
|
# Your debug logic
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# How to Use This Custom Trainer
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
"""
|
||||||
|
To use this custom trainer:
|
||||||
|
|
||||||
|
1. Save this file to your project (e.g., my_custom_trainer.py)
|
||||||
|
|
||||||
|
2. Create a config file that uses your custom trainer:
|
||||||
|
|
||||||
|
# config.yml
|
||||||
|
base_model: NousResearch/Llama-3.2-1B
|
||||||
|
|
||||||
|
# ... other config ...
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.swanlab.SwanLabPlugin
|
||||||
|
|
||||||
|
use_swanlab: true
|
||||||
|
swanlab_project: my-profiling-experiment
|
||||||
|
|
||||||
|
# Optional: Specify custom trainer
|
||||||
|
# (Or modify axolotl to use your custom trainer class)
|
||||||
|
|
||||||
|
3. Run training:
|
||||||
|
|
||||||
|
export SWANLAB_API_KEY=your-api-key
|
||||||
|
accelerate launch -m axolotl.cli.train config.yml
|
||||||
|
|
||||||
|
4. View profiling metrics in SwanLab dashboard:
|
||||||
|
- profiling/Time taken: CustomTrainerWithProfiling.training_step
|
||||||
|
- profiling/Time taken: CustomTrainerWithProfiling.forward_pass
|
||||||
|
- profiling/Time taken: CustomTrainerWithProfiling.backward_pass
|
||||||
|
- etc.
|
||||||
|
|
||||||
|
5. Compare profiling metrics across runs:
|
||||||
|
- Run baseline without optimizations
|
||||||
|
- Run with flash_attention enabled
|
||||||
|
- Run with gradient_checkpointing enabled
|
||||||
|
- Compare profiling metrics to see performance impact
|
||||||
|
"""
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Tips for Effective Profiling
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
"""
|
||||||
|
1. Profile the critical path first:
|
||||||
|
- training_step, compute_loss, prediction_step
|
||||||
|
- These methods are called most frequently and have biggest impact
|
||||||
|
|
||||||
|
2. Use throttling for high-frequency operations:
|
||||||
|
- Methods called 100+ times per step
|
||||||
|
- Use log_interval=50 or log_interval=100
|
||||||
|
- Reduces profiling overhead and dashboard clutter
|
||||||
|
|
||||||
|
3. Filter noise with min_duration_ms:
|
||||||
|
- Set min_duration_ms=1.0 to skip very fast operations
|
||||||
|
- Focus on operations that actually take time
|
||||||
|
|
||||||
|
4. Compare across runs:
|
||||||
|
- Run same config multiple times to check consistency
|
||||||
|
- Compare different optimization strategies
|
||||||
|
- Track profiling trends over time
|
||||||
|
|
||||||
|
5. Monitor distributed training:
|
||||||
|
- Check for per-rank timing differences
|
||||||
|
- Look for stragglers (slower ranks)
|
||||||
|
- Identify synchronization bottlenecks
|
||||||
|
|
||||||
|
6. Disable profiling in production:
|
||||||
|
- from axolotl.integrations.swanlab.profiling import DEFAULT_PROFILING_CONFIG
|
||||||
|
- DEFAULT_PROFILING_CONFIG.enabled = False
|
||||||
|
|
||||||
|
7. Exception handling:
|
||||||
|
- Profiling is exception-safe
|
||||||
|
- Duration logged even if method raises
|
||||||
|
- Useful for debugging methods that fail intermittently
|
||||||
|
"""
|
||||||
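Tip 6 above can be wired to an environment switch so production runs skip profiling entirely. A minimal sketch, assuming `DEFAULT_PROFILING_CONFIG` exposes a mutable `enabled` flag as the tips describe; the `AXOLOTL_ENABLE_PROFILING` variable name is hypothetical:

```python
import os

from axolotl.integrations.swanlab.profiling import DEFAULT_PROFILING_CONFIG

# Hypothetical opt-in switch: profiling stays off unless the env var is "1".
DEFAULT_PROFILING_CONFIG.enabled = (
    os.environ.get("AXOLOTL_ENABLE_PROFILING", "0") == "1"
)
```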
168 examples/swanlab/dpo-swanlab-completions.yml Normal file
@@ -0,0 +1,168 @@
# SwanLab DPO Training Example with Completion Logging
#
# This example demonstrates DPO (Direct Preference Optimization) training
# with SwanLab integration for experiment tracking and completion table logging.
#
# Features enabled:
# - SwanLab experiment tracking
# - RLHF completion table logging (prompts, chosen/rejected responses, rewards)
# - Lark (Feishu) team notifications (optional)
#
# To run:
#   export SWANLAB_API_KEY=your-api-key
#   accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-completions.yml

# Model Configuration
base_model: meta-llama/Meta-Llama-3-8B-Instruct
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer

special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot_id|>

# Quantization
load_in_8bit: true
load_in_4bit: false

# LoRA Configuration
adapter: lora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

# DPO Configuration
chat_template: llama3
rl: dpo

datasets:
  - path: fozziethebeat/alpaca_messages_2k_dpo_test
    type: chat_template.default
    field_messages: conversation
    field_chosen: chosen
    field_rejected: rejected
    message_property_mappings:
      role: role
      content: content
    roles:
      system:
        - system
      user:
        - user
      assistant:
        - assistant

# Dataset and Output
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/dpo-swanlab-out

# Training Configuration
sequence_len: 4096
sample_packing: false
micro_batch_size: 2
gradient_accumulation_steps: 4
num_epochs: 4

# Optimization
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
warmup_ratio: 0.1
weight_decay: 0.0

# Precision
bf16: auto
tf32: false

# Performance
gradient_checkpointing: true
flash_attention: true

# Checkpointing and Logging
logging_steps: 1
evals_per_epoch: 4
saves_per_epoch: 1

# ============================================================================
# SwanLab Integration
# ============================================================================

plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

# Basic SwanLab Configuration
use_swanlab: true
swanlab_project: dpo-training
swanlab_experiment_name: llama-3-dpo-completions-demo
swanlab_description: "DPO training with completion table logging"
swanlab_mode: cloud  # Options: cloud, local, offline, disabled

# SwanLab Authentication
# Recommended: Set via environment variable
#   export SWANLAB_API_KEY=your-api-key
# Or set in config (less secure):
# swanlab_api_key: your-api-key

# Optional: Team workspace
# swanlab_workspace: my-research-team

# ============================================================================
# RLHF Completion Table Logging
# ============================================================================
#
# Automatically logs model completions to SwanLab for qualitative analysis:
# - Prompts from your DPO dataset
# - Chosen responses (preferred)
# - Rejected responses (non-preferred)
# - Reward differences
#
# View the table in SwanLab dashboard under "rlhf_completions"

swanlab_log_completions: true
swanlab_completion_log_interval: 100  # Log every 100 training steps
swanlab_completion_max_buffer: 128  # Keep last 128 completions in memory

# Memory Usage Notes:
# - Buffer size 128: ~64 KB (default, recommended)
# - Buffer size 512: ~256 KB (for more historical completions)
# - Buffer size 1024: ~512 KB (maximum for very long training runs)

# Performance Notes:
# - Completion logging overhead: < 0.5% per training step
# - Only logs every N steps to minimize impact
# - Memory-bounded buffer prevents memory leaks

# ============================================================================
# Optional: Lark (Feishu) Team Notifications
# ============================================================================
#
# Get real-time training notifications in your team chat.
# Uncomment to enable:

# swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx
# swanlab_lark_secret: your-webhook-secret  # Recommended for production

# Notifications sent for:
# - Training start
# - Training completion
# - Training errors
# - Metric milestones (if configured)

# ============================================================================
# Optional: Private SwanLab Deployment
# ============================================================================
#
# For enterprise users with private SwanLab deployment:

# swanlab_web_host: https://swanlab.yourcompany.com
# swanlab_api_host: https://api.swanlab.yourcompany.com

# ============================================================================
# Disable WandB if you're migrating from it
# ============================================================================

# wandb_project:
# wandb_entity:
# use_wandb: false
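The buffer-size figures in the Memory Usage Notes above imply roughly 0.5 KB per buffered completion. A quick back-of-the-envelope check; the per-entry size is an assumption read off those numbers, not a measured value:

```python
# Rough memory estimate for the completion buffer, assuming ~512 bytes per
# buffered entry, which is what the config comments above imply.
BYTES_PER_COMPLETION = 512  # assumed average entry size

for buffer_size in (128, 512, 1024):
    kb = buffer_size * BYTES_PER_COMPLETION / 1024
    print(f"swanlab_completion_max_buffer={buffer_size}: ~{kb:.0f} KB")
# -> ~64 KB, ~256 KB, ~512 KB, matching the Memory Usage Notes
```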
329 examples/swanlab/dpo-swanlab-full-featured.yml Normal file
@@ -0,0 +1,329 @@
# SwanLab Full-Featured DPO Training Example
#
# This example demonstrates ALL SwanLab integration features:
# - Experiment tracking with cloud sync
# - RLHF completion table logging
# - Performance profiling
# - Lark (Feishu) team notifications
# - Team workspace collaboration
#
# Use this as a reference for production RLHF training setups.
#
# To run:
#   export SWANLAB_API_KEY=your-api-key
#   export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/...
#   export SWANLAB_LARK_SECRET=your-webhook-secret
#   accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-full-featured.yml

# ============================================================================
# Model Configuration
# ============================================================================

base_model: meta-llama/Meta-Llama-3-8B-Instruct
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer

special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot_id|>

# Quantization for efficient training
load_in_8bit: true
load_in_4bit: false

# ============================================================================
# LoRA Configuration
# ============================================================================

adapter: lora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true  # Target all linear layers

# ============================================================================
# DPO (Direct Preference Optimization) Configuration
# ============================================================================

chat_template: llama3
rl: dpo  # Enable DPO trainer

datasets:
  - path: fozziethebeat/alpaca_messages_2k_dpo_test
    type: chat_template.default
    field_messages: conversation
    field_chosen: chosen
    field_rejected: rejected
    message_property_mappings:
      role: role
      content: content
    roles:
      system:
        - system
      user:
        - user
      assistant:
        - assistant

# ============================================================================
# Dataset and Output Configuration
# ============================================================================

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/dpo-swanlab-full-featured-out

# ============================================================================
# Training Configuration
# ============================================================================

sequence_len: 4096
sample_packing: false

micro_batch_size: 2
gradient_accumulation_steps: 4
num_epochs: 4

# ============================================================================
# Optimization
# ============================================================================

optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
warmup_ratio: 0.1
weight_decay: 0.0

# ============================================================================
# Precision and Performance
# ============================================================================

bf16: auto
tf32: false

gradient_checkpointing: true
flash_attention: true

# ============================================================================
# Checkpointing and Logging
# ============================================================================

logging_steps: 1
evals_per_epoch: 4
saves_per_epoch: 1

# ============================================================================
# SwanLab Integration - Full Configuration
# ============================================================================

plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

# ------------------------------------------------------------------------------
# Basic SwanLab Configuration
# ------------------------------------------------------------------------------

use_swanlab: true
swanlab_project: dpo-production
swanlab_experiment_name: llama-3-dpo-full-featured-v1
swanlab_description: |
  Production DPO training with all SwanLab features enabled:
  - Completion table logging for qualitative analysis
  - Performance profiling for optimization
  - Lark notifications for team collaboration

swanlab_mode: cloud  # Options: cloud, local, offline, disabled

# ------------------------------------------------------------------------------
# Team Collaboration
# ------------------------------------------------------------------------------

# Workspace for team collaboration (shared experiments)
swanlab_workspace: ml-research-team

# Authentication (recommended: use environment variable)
#   export SWANLAB_API_KEY=your-api-key
# Or set in config (less secure):
# swanlab_api_key: your-api-key

# ------------------------------------------------------------------------------
# RLHF Completion Table Logging
# ------------------------------------------------------------------------------
# Automatically logs model completions for qualitative analysis:
# - Prompts from your DPO dataset
# - Chosen responses (preferred)
# - Rejected responses (non-preferred)
# - Reward differences
#
# View in SwanLab dashboard under "rlhf_completions" table

swanlab_log_completions: true
swanlab_completion_log_interval: 100  # Log every 100 steps
swanlab_completion_max_buffer: 256  # Larger buffer for long training runs

# Buffer size recommendations:
# - 128: Default, ~64 KB memory (recommended for most cases)
# - 256: ~128 KB memory (this config, good for longer training)
# - 512: ~256 KB memory (maximum for very long runs)

# ------------------------------------------------------------------------------
# Lark (Feishu) Team Notifications
# ------------------------------------------------------------------------------
# Get real-time training notifications in your team chat
#
# Notifications sent for:
# - Training start
# - Training completion
# - Training errors
# - Metric milestones (if configured)

# Recommended: Set via environment variables
#   export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/...
#   export SWANLAB_LARK_SECRET=your-webhook-secret

# Or set in config (less secure):
# swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx
# swanlab_lark_secret: your-webhook-secret  # REQUIRED for production

# Security note: ALWAYS use swanlab_lark_secret in production to prevent
# unauthorized parties from sending fake notifications to your team chat.

# ------------------------------------------------------------------------------
# Performance Profiling
# ------------------------------------------------------------------------------
# Profiling is automatically enabled when SwanLab is enabled.
# Metrics logged to SwanLab under "profiling/" namespace:
#   profiling/Time taken: AxolotlTrainer.training_step
#   profiling/Time taken: AxolotlTrainer.compute_loss
#   profiling/Time taken: AxolotlTrainer.prediction_step
#
# Use these metrics to:
# - Identify bottlenecks in the training loop
# - Compare performance across different configurations
# - Monitor performance regressions over time
# - Debug unexpected slowdowns

# For custom profiling in your own trainer, see:
#   examples/swanlab/custom_trainer_profiling.py

# ------------------------------------------------------------------------------
# Optional: Private SwanLab Deployment
# ------------------------------------------------------------------------------
# For enterprise users with private SwanLab deployment:

# swanlab_web_host: https://swanlab.yourcompany.com
# swanlab_api_host: https://api.swanlab.yourcompany.com

# ------------------------------------------------------------------------------
# Optional: Model Checkpointing to SwanLab
# ------------------------------------------------------------------------------
# Log model checkpoints to SwanLab (coming soon)

swanlab_log_model: false

# ============================================================================
# Disable Other Logging Tools (Recommended)
# ============================================================================
# Using multiple logging tools simultaneously can impact performance:
# - Expected overhead: ~1-2% per logger
# - Potential config/callback conflicts
#
# For production training, use ONLY SwanLab:

# wandb_project:
# use_wandb: false
#
# use_mlflow: false
#
# use_comet: false

# ============================================================================
# Expected Training Behavior
# ============================================================================

# With this configuration, you should see:
#
# 1. SwanLab Initialization (rank 0 only):
#    INFO: SwanLab initialized for project: dpo-production
#    INFO: SwanLab experiment: llama-3-dpo-full-featured-v1
#    INFO: SwanLab mode: cloud
#    INFO: SwanLab workspace: ml-research-team
#
# 2. Completion Logging (rank 0 only):
#    INFO: Registered SwanLab RLHF completion logging callback for DPOTrainer
#          (log_interval=100, max_buffer=256)
#
# 3. Lark Notifications (rank 0 only):
#    INFO: Registered Lark notification callback with HMAC authentication
#
# 4. Distributed Training Detection (if multi-GPU):
#    INFO: Distributed training detected (world_size=N)
#    INFO: Only rank 0 will initialize SwanLab
#    INFO: Other ranks will skip SwanLab to avoid conflicts
#
# 5. Training Start Notification (Lark):
#    Your team chat receives: "Training started: llama-3-dpo-full-featured-v1"
#
# 6. Periodic Completion Logging:
#    Every 100 steps, the completion table is updated in the SwanLab dashboard
#
# 7. Training Complete Notification (Lark):
#    Your team chat receives: "Training completed: llama-3-dpo-full-featured-v1"
#    with a link to the SwanLab dashboard and final metrics
#
# 8. SwanLab Dashboard Shows:
#    - Training metrics (loss, learning rate, etc.)
#    - Completion table (rlhf_completions)
#    - Profiling metrics (profiling/Time taken: ...)
#    - Hyperparameters and configuration
#    - System resource usage

# ============================================================================
# Production Checklist
# ============================================================================

# Before deploying to production, verify:
# ✅ SwanLab API key is set via environment variable (not in config)
# ✅ Lark webhook secret is set (required for HMAC authentication)
# ✅ Workspace is set to your team's workspace
# ✅ Experiment name is descriptive and unique
# ✅ Only SwanLab is enabled (other loggers disabled)
# ✅ Completion logging buffer size is appropriate for your training duration
# ✅ Private deployment hosts are set (if using enterprise SwanLab)
# ✅ Test run completes successfully and shows up in SwanLab dashboard
# ✅ Lark notifications are received in team chat
# ✅ Profiling metrics are logged correctly

# ============================================================================
# Troubleshooting
# ============================================================================

# If SwanLab initialization fails:
# 1. Check the SWANLAB_API_KEY environment variable is set
# 2. Verify swanlab_project is set in config
# 3. Check swanlab_mode is valid (cloud/local/offline/disabled)
# 4. Verify internet connectivity (for cloud mode)

# If Lark notifications are not received:
# 1. Check SWANLAB_LARK_WEBHOOK_URL is set correctly
# 2. Verify SWANLAB_LARK_SECRET matches your Lark bot settings
# 3. Test the webhook manually: curl -X POST "$SWANLAB_LARK_WEBHOOK_URL" ...
# 4. Check training logs for "Registered Lark notification callback"
# 5. Verify the bot is added to the target Lark group chat

# If completions are not appearing in SwanLab:
# 1. Verify you're using an RLHF trainer (DPO/KTO/ORPO/GRPO)
# 2. Check swanlab_log_completions is true
# 3. Wait for log_interval steps (default: 100)
# 4. Check training logs for "Registered SwanLab RLHF completion logging"

# If profiling metrics are not appearing:
# 1. Verify use_swanlab is true
# 2. Check SwanLab is initialized (check logs)
# 3. Look under the "profiling/" namespace in the dashboard
# 4. Profiling may be disabled if DEFAULT_PROFILING_CONFIG.enabled = False

# For more help:
# - SwanLab docs: https://docs.swanlab.cn
# - Axolotl SwanLab integration: src/axolotl/integrations/swanlab/README.md
# - GitHub issues: https://github.com/axolotl-ai-cloud/axolotl/issues
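For the manual webhook test mentioned in the troubleshooting list, Lark's signed custom bots expect an HMAC-SHA256 signature derived from a timestamp and the webhook secret. A minimal sketch of that documented signing scheme, with placeholder URL and secret (this tests the webhook directly and is independent of axolotl):

```python
import base64
import hashlib
import hmac
import json
import time
import urllib.request

WEBHOOK_URL = "https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx"  # placeholder
SECRET = "your-webhook-secret"  # placeholder

timestamp = str(int(time.time()))
# Per the Lark/Feishu custom-bot docs:
# sign = base64(HMAC-SHA256(key = timestamp + "\n" + secret, msg = ""))
string_to_sign = f"{timestamp}\n{SECRET}"
digest = hmac.new(string_to_sign.encode("utf-8"), digestmod=hashlib.sha256).digest()
sign = base64.b64encode(digest).decode("utf-8")

payload = {
    "timestamp": timestamp,
    "sign": sign,
    "msg_type": "text",
    "content": {"text": "Webhook connectivity test"},
}
req = urllib.request.Request(
    WEBHOOK_URL,
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
print(urllib.request.urlopen(req).read().decode("utf-8"))
```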
178 examples/swanlab/lora-swanlab-profiling.yml Normal file
@@ -0,0 +1,178 @@
# SwanLab LoRA Training Example with Performance Profiling
#
# This example demonstrates standard LoRA fine-tuning with SwanLab integration
# for performance profiling and optimization.
#
# Features enabled:
# - SwanLab experiment tracking
# - Performance profiling (training step, forward/backward pass timing)
# - Real-time metrics visualization
#
# To run:
#   export SWANLAB_API_KEY=your-api-key
#   accelerate launch -m axolotl.cli.train examples/swanlab/lora-swanlab-profiling.yml

# Model Configuration
base_model: NousResearch/Llama-3.2-1B

# Dataset Configuration
datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca

val_set_size: 0.1
output_dir: ./outputs/lora-swanlab-profiling-out

# LoRA Configuration
adapter: lora
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

# Training Configuration
sequence_len: 2048
sample_packing: true
eval_sample_packing: true

micro_batch_size: 2
gradient_accumulation_steps: 2
num_epochs: 1

# Optimization
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002
warmup_ratio: 0.1
weight_decay: 0.0

# Precision
bf16: auto
tf32: false

# Performance
gradient_checkpointing: true
flash_attention: true

# Checkpointing and Logging
logging_steps: 1
evals_per_epoch: 4
saves_per_epoch: 1

# Loss Monitoring
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

special_tokens:
  pad_token: "<|end_of_text|>"

# ============================================================================
# SwanLab Integration
# ============================================================================

plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

# Basic SwanLab Configuration
use_swanlab: true
swanlab_project: lora-profiling
swanlab_experiment_name: llama-3.2-1b-profiling-demo
swanlab_description: "LoRA fine-tuning with performance profiling"
swanlab_mode: cloud  # Options: cloud, local, offline, disabled

# SwanLab Authentication
# Recommended: Set via environment variable
#   export SWANLAB_API_KEY=your-api-key
# Or set in config (less secure):
# swanlab_api_key: your-api-key

# Optional: Team workspace
# swanlab_workspace: my-ml-team

# ============================================================================
# Performance Profiling
# ============================================================================
#
# SwanLab automatically profiles trainer methods when enabled.
# Profiling metrics appear in the SwanLab dashboard under the "profiling/" namespace.
#
# Built-in profiling:
# - Minimal overhead (< 0.1% per step)
# - High-precision timing (microsecond accuracy)
# - Exception-safe (logs duration even if a method fails)
#
# View profiling metrics in the SwanLab dashboard:
#   profiling/Time taken: AxolotlTrainer.training_step
#   profiling/Time taken: AxolotlTrainer.compute_loss
#   profiling/Time taken: AxolotlTrainer.prediction_step
#
# For custom profiling in your own trainer, see:
#   examples/swanlab/custom_trainer_profiling.py

# Completion logging is disabled for non-RLHF trainers
swanlab_log_completions: false  # Only works with DPO/KTO/ORPO/GRPO

# ============================================================================
# Optional: Compare with Multiple Runs
# ============================================================================
#
# To compare profiling metrics across different configurations:
#
# 1. Run baseline without flash attention:
#    swanlab_experiment_name: llama-3.2-1b-no-flash-attn
#    flash_attention: false
#
# 2. Run with gradient checkpointing:
#    swanlab_experiment_name: llama-3.2-1b-grad-checkpoint
#    gradient_checkpointing: true
#
# 3. Run with both:
#    swanlab_experiment_name: llama-3.2-1b-optimized
#    flash_attention: true
#    gradient_checkpointing: true
#
# Then compare profiling metrics in the SwanLab dashboard to see the performance impact.

# ============================================================================
# Optional: Lark (Feishu) Team Notifications
# ============================================================================
#
# Get notified when profiling experiments complete:

# swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx
# swanlab_lark_secret: your-webhook-secret

# ============================================================================
# Profiling Best Practices
# ============================================================================
#
# 1. Run multiple epochs to see profiling trends over time
# 2. Ignore the first ~10 steps (warmup period, slower)
# 3. Look for outliers (steps that take significantly longer)
# 4. Compare profiling metrics before/after optimization changes
# 5. Monitor per-rank profiling in distributed training
#
# Common bottlenecks to profile:
# - training_step: Overall step time (should be consistent)
# - compute_loss: Loss computation (scales with sequence length)
# - prediction_step: Evaluation time (can be slow for large val sets)
#
# If you see inconsistent timing:
# - Check for data loading bottlenecks
# - Monitor GPU utilization (may be CPU-bound)
# - Check for gradient accumulation effects
# - Verify CUDA kernel synchronization
#
# A small analysis sketch for warmup and outliers follows this file.

# ============================================================================
# Disable WandB if you're migrating from it
# ============================================================================

# wandb_project:
# use_wandb: false
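The best-practice items above (skip warmup, hunt outliers) are easy to apply to an exported profiling series. A small illustrative sketch on toy data; the numbers are made up, and how you export the series from SwanLab is up to you:

```python
# Toy step-time series (seconds) standing in for an exported
# "profiling/Time taken: AxolotlTrainer.training_step" metric.
step_times = [3.1, 2.2, 1.6, 1.1, 0.9, 0.6, 0.5, 0.45, 0.42, 0.41,
              0.43, 0.40, 0.95, 0.41, 0.42]

WARMUP_STEPS = 10  # per tip 2: ignore the first ~10 steps
steady = step_times[WARMUP_STEPS:]
mean = sum(steady) / len(steady)

# Per tip 3: flag steps that take significantly longer than typical
# (1.5x the steady-state mean is an arbitrary illustrative threshold).
outliers = [(i + WARMUP_STEPS, t) for i, t in enumerate(steady) if t > 1.5 * mean]
print(f"steady-state mean: {mean * 1000:.0f} ms; outliers: {outliers}")
```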
@@ -29,6 +29,10 @@ Let us know how it goes. Happy finetuning! 🚀
 
 Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
 
+## Limitations
+
+**Cut Cross Entropy (CCE)**: Currently not supported. We plan to include CCE support for Trinity in the near future.
+
 ## Related Resources
 
 - [Trinity Blog](https://www.arcee.ai/blog/the-trinity-manifesto)
@@ -1,5 +1,6 @@
 base_model: arcee-ai/Trinity-Nano-Preview
 trust_remote_code: true
+revision_of_model: 2ee94b0
 
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
@@ -12,7 +12,7 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
 
 ```bash
 # Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
+pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
 ```
 
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=64", "wheel", "setuptools_scm>=8", "packaging==23.2"]
+requires = ["setuptools>=64", "wheel", "setuptools_scm>=8", "packaging==26.0"]
 build-backend = "setuptools.build_meta"
 
 [project]
@@ -24,6 +24,9 @@ Repository = "https://github.com/axolotl-ai-cloud/axolotl.git"
 py-modules = ["setuptools_axolotl_dynamic_dependencies"]
 include-package-data = true
 
+[tool.setuptools.dynamic]
+version = { file = "VERSION" }
+
 [tool.setuptools.cmdclass]
 build_py = "setuptools_axolotl_dynamic_dependencies.BuildPyCommand"
 
@@ -57,3 +60,6 @@ indent-style = "space"
 skip-magic-trailing-comma = false
 line-ending = "auto"
 docstring-code-format = false
+
+[tool.uv.extra-build-dependencies]
+axolotl = ["huggingface_hub"]
@@ -1,35 +1,35 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 
 # START section of dependencies that don't install on Darwin/MacOS
-bitsandbytes==0.48.2
+bitsandbytes==0.49.1
-triton>=3.0.0
+triton>=3.4.0
 mamba-ssm==1.2.0.post1
 xformers>=0.0.23.post1
-liger-kernel==0.6.4
+liger-kernel==0.7.0
 # END section
 
-packaging==23.2
+packaging==26.0
-huggingface_hub>=0.36.0
+huggingface_hub>=1.1.7
-peft>=0.18.0
+peft>=0.18.1
 tokenizers>=0.22.1
-transformers==4.57.1
+transformers @ git+https://github.com/winglian/transformers.git@refactor-inner-training-loop-reorder-only
-accelerate==1.11.0
+accelerate==1.12.0
-datasets==4.4.1
+datasets==4.5.0
-deepspeed>=0.17.0
+deepspeed>=0.18.3
-trl==0.25.0
+trl==0.28.0
 hf_xet==1.2.0
-kernels>=0.9.0
+kernels==0.11.5
 
 trackio>=0.13.0
-typing_extensions>=4.14.0
+typing-extensions>=4.15.0
 
 optimum==1.16.2
 hf_transfer
 sentencepiece
 gradio>=6.2.0,<7.0
 
-modal==1.0.2
+modal==1.3.0.post1
-pydantic>=2.10.6,<2.12
+pydantic>=2.10.6
 addict
 fire
 PyYAML>=6.0
@@ -63,7 +63,7 @@ langdetect==1.0.9
 immutabledict==4.2.0
 antlr4-python3-runtime==4.13.2
 
-torchao==0.13.0
+torchao==0.16.0
 openenv-core==0.1.0
 schedulefree==1.4.1
 
@@ -72,4 +72,4 @@ axolotl-contribs-mit==0.0.6
 # telemetry
 posthog==6.7.11
 
-mistral-common==1.8.6
+mistral-common==1.8.8
@@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else ""
 
 print(
     UNINSTALL_PREFIX
-    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@f643b88"'
+    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0d4ce4b"'
 )
64 setup.py
@@ -1,6 +1,5 @@
 """setup.py for axolotl"""
 
-import ast
 import os
 import platform
 import re
@@ -26,6 +25,7 @@ def parse_requirements(extras_require_map):
             _install_requires.append(line)
     try:
         xformers_version = [req for req in _install_requires if "xformers" in req][0]
+        install_xformers = platform.machine() != "aarch64"
         if "Darwin" in platform.system():
             # skip packages not compatible with OSX
             skip_packages = [
@@ -62,44 +62,68 @@ def parse_requirements(extras_require_map):
         else:
             raise ValueError("Invalid version format")
 
+        torch_parts = torch_version.split("+")
+        if len(torch_parts) == 2:
+            torch_cuda_version = torch_parts[1]
+            _dependency_links.append(
+                f"https://download.pytorch.org/whl/{torch_cuda_version}"
+            )
+
         if (major, minor) >= (2, 9):
             extras_require_map.pop("fbgemm-gpu")
-            extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.4.1"]
+            extras_require_map["fbgemm-gpu"] = [
+                "fbgemm-gpu==1.4.0",
+                "fbgemm-gpu-genai==1.4.2",
+            ]
             extras_require_map["vllm"] = ["vllm==0.11.1"]
+            if not install_xformers:
+                _install_requires.pop(_install_requires.index(xformers_version))
+            extras_require_map["vllm"] = ["vllm==0.13.0"]
+            if patch == 0:
+                extras_require_map["vllm"] = ["vllm==0.13.0"]
+            else:
+                extras_require_map["vllm"] = ["vllm==0.14.0"]
         elif (major, minor) >= (2, 8):
             extras_require_map.pop("fbgemm-gpu")
             extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.3.0"]
             extras_require_map["vllm"] = ["vllm==0.11.0"]
+            if not install_xformers:
+                _install_requires.pop(_install_requires.index(xformers_version))
         elif (major, minor) >= (2, 7):
             _install_requires.pop(_install_requires.index(xformers_version))
             if patch == 0:
-                _install_requires.append("xformers==0.0.30")
+                if install_xformers:
+                    _install_requires.append("xformers==0.0.30")
                 # vllm 0.9.x is incompatible with latest transformers
                 extras_require_map.pop("vllm")
             else:
-                _install_requires.append("xformers==0.0.31")
+                if install_xformers:
+                    _install_requires.append("xformers==0.0.31")
                 extras_require_map["vllm"] = ["vllm==0.10.1"]
         elif (major, minor) >= (2, 6):
             _install_requires.pop(_install_requires.index(xformers_version))
-            _install_requires.append("xformers==0.0.29.post3")
+            if install_xformers:
+                _install_requires.append("xformers==0.0.29.post3")
             # since we only support 2.6.0+cu126
             _dependency_links.append("https://download.pytorch.org/whl/cu126")
             extras_require_map.pop("vllm")
         elif (major, minor) >= (2, 5):
             _install_requires.pop(_install_requires.index(xformers_version))
-            if patch == 0:
-                _install_requires.append("xformers==0.0.28.post2")
-            else:
-                _install_requires.append("xformers>=0.0.28.post3")
+            if install_xformers:
+                if patch == 0:
+                    _install_requires.append("xformers==0.0.28.post2")
+                else:
+                    _install_requires.append("xformers>=0.0.28.post3")
             extras_require_map.pop("vllm")
         elif (major, minor) >= (2, 4):
             extras_require_map.pop("vllm")
-            if patch == 0:
-                _install_requires.pop(_install_requires.index(xformers_version))
-                _install_requires.append("xformers>=0.0.27")
-            else:
-                _install_requires.pop(_install_requires.index(xformers_version))
-                _install_requires.append("xformers==0.0.28.post1")
+            if install_xformers:
+                if patch == 0:
+                    _install_requires.pop(_install_requires.index(xformers_version))
+                    _install_requires.append("xformers>=0.0.27")
+                else:
+                    _install_requires.pop(_install_requires.index(xformers_version))
+                    _install_requires.append("xformers==0.0.28.post1")
         else:
             raise ValueError("axolotl requires torch>=2.4")
 
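The new dependency-link logic keys off the local version segment of `torch.__version__`; a quick illustration of the split, where the version string is just an example value:

```python
# Derive the PyTorch wheel index URL from a local version tag like "+cu128".
torch_version = "2.9.1+cu128"  # stand-in for torch.__version__

torch_parts = torch_version.split("+")
if len(torch_parts) == 2:
    print(f"https://download.pytorch.org/whl/{torch_parts[1]}")
    # -> https://download.pytorch.org/whl/cu128
```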
@@ -110,15 +134,11 @@ def parse_requirements(extras_require_map):
 
 
 def get_package_version():
     with open(
-        Path(os.path.dirname(os.path.abspath(__file__)))
-        / "src"
-        / "axolotl"
-        / "__init__.py",
+        Path(os.path.dirname(os.path.abspath(__file__))) / "VERSION",
         "r",
         encoding="utf-8",
     ) as fin:
-        version_match = re.search(r"^__version__\s*=\s*(.*)$", fin.read(), re.MULTILINE)
-        version_ = ast.literal_eval(version_match.group(1))
+        version_ = fin.read().strip()
     return version_
 
 
@@ -156,7 +176,7 @@ extras_require = {
         "came_pytorch==0.1.3",
     ],
     "ray": [
-        "ray[train]",
+        "ray[train]>=2.52.1",
     ],
     "vllm": [
         "vllm==0.10.0",
@@ -1,7 +1,11 @@
 """Axolotl - Train and fine-tune large language models"""
 
 import pkgutil
+from importlib.metadata import PackageNotFoundError, version
 
 __path__ = pkgutil.extend_path(__path__, __name__)  # Make this a namespace package
 
-__version__ = "0.13.0.dev"
+try:
+    __version__ = version("axolotl")
+except PackageNotFoundError:
+    __version__ = "unknown"
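With this change the version is resolved from installed package metadata rather than a hard-coded string; the same `importlib.metadata` pattern can be exercised standalone:

```python
from importlib.metadata import PackageNotFoundError, version

# Resolves the installed distribution's version; falls back gracefully
# when axolotl is not installed in the current environment.
try:
    print(version("axolotl"))
except PackageNotFoundError:
    print("axolotl is not installed")
```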
@@ -5,6 +5,6 @@ import os
 from axolotl.logging_config import configure_logging
 
 os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
-os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
+os.environ.setdefault("HF_XET_HIGH_PERFORMANCE", "1")
 
 configure_logging()
@@ -44,7 +44,7 @@ def check_user_token() -> bool:
         return bool(user_info)
     except LocalTokenNotFoundError:
         LOG.warning(
-            "Error verifying HuggingFace token. Remember to log in using `huggingface-cli login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets."
+            "Error verifying HuggingFace token. Remember to log in using `hf auth login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets."
         )
         return False
     except HTTPError:
@@ -24,8 +24,7 @@ if launcher_args:
     launcher_args_str = "-- " + " ".join(launcher_args)
 
 # 1. Define a base image for your training job
-# must use torch 2.7.0 for vllm
-BASE_IMAGE = "axolotlai/axolotl:main-py3.11-cu126-2.7.1"
+BASE_IMAGE = "axolotlai/axolotl:main-py3.11-cu128-2.9.1"
 
 # 2. Define the Runtime Environment for the Training Job
 # This includes start commands and environment variables.
@@ -82,7 +82,7 @@ class ModalCloud(Cloud):
         return res
 
     def get_image(self):
-        docker_tag = "main-py3.11-cu126-2.7.1"
+        docker_tag = "main-py3.11-cu128-2.9.1"
         if self.config.docker_tag:
             docker_tag = self.config.docker_tag
         docker_image = f"axolotlai/axolotl:{docker_tag}"
@@ -24,7 +24,6 @@ def do_merge_lora(*, cfg: DictDefault) -> None:
         cfg: Dictionary mapping `axolotl` config keys to values.
     """
     model, tokenizer, processor = load_model_and_tokenizer(cfg=cfg)
-    safe_serialization = cfg.save_safetensors is True
 
     LOG.info("Running merge of LoRA with base model...")
     model = model.merge_and_unload(progressbar=True)
@@ -42,7 +41,6 @@ def do_merge_lora(*, cfg: DictDefault) -> None:
     LOG.info(f"Saving merged model to: {str(Path(cfg.output_dir) / 'merged')}...")
     model.save_pretrained(
         str(Path(cfg.output_dir) / "merged"),
-        safe_serialization=safe_serialization,
         progressbar=True,
     )
     tokenizer.save_pretrained(
@@ -14,8 +14,6 @@ from accelerate import PartialState
 from accelerate.utils import (
     SAFE_WEIGHTS_INDEX_NAME,
     SAFE_WEIGHTS_NAME,
-    WEIGHTS_INDEX_NAME,
-    WEIGHTS_NAME,
     is_torch_version,
 )
 from huggingface_hub import split_torch_state_dict_into_shards
@@ -40,17 +38,15 @@ class BFloat16CastPlanner(_EmptyStateDictLoadPlanner):
 def _distributed_checkpoint_to_merged_weights(
     checkpoint_dir: Union[str, Path],
     save_path: str,
-    safe_serialization: bool = False,
     max_shard_size: str = "5GB",
 ) -> Path:
     """
     Passthrough to `torch.distributed.checkpoint.format_utils.dcp_to_torch_save`. Will
-    save under `save_path` as either `model.safetensors` or `pytorch_model.bin`.
+    save under `save_path` as `model.safetensors`.
 
     Args:
         checkpoint_dir: Directory where distributed checkpoint is saved.
         save_path: Path to save model to.
-        safe_serialization: Whether to save in safetensors format.
         max_shard_size: Max size of model shards to save.
 
     Returns:
@@ -76,11 +72,7 @@ def _distributed_checkpoint_to_merged_weights(
|
|||||||
if isinstance(value, torch.Tensor) and value.dtype != torch.bfloat16:
|
if isinstance(value, torch.Tensor) and value.dtype != torch.bfloat16:
|
||||||
state_dict[key] = value.to(torch.bfloat16)
|
state_dict[key] = value.to(torch.bfloat16)
|
||||||
|
|
||||||
weights_name = SAFE_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
|
filename_pattern = SAFE_WEIGHTS_NAME.replace(".safetensors", "{suffix}.safetensors")
|
||||||
|
|
||||||
filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(
|
|
||||||
".safetensors", "{suffix}.safetensors"
|
|
||||||
)
|
|
||||||
state_dict_split = split_torch_state_dict_into_shards(
|
state_dict_split = split_torch_state_dict_into_shards(
|
||||||
state_dict, filename_pattern=filename_pattern, max_shard_size=max_shard_size
|
state_dict, filename_pattern=filename_pattern, max_shard_size=max_shard_size
|
||||||
)
|
)
|
||||||
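Note that `split_torch_state_dict_into_shards` only plans the split; callers still save every shard themselves, which is exactly what the loop in the next hunk does. A small self-contained sketch (tensor names and sizes are illustrative):

```python
import torch
from huggingface_hub import split_torch_state_dict_into_shards

# Illustrative state dict; a real merge holds full model weights.
state_dict = {f"layer.{i}.weight": torch.zeros(1024, 1024) for i in range(8)}

split = split_torch_state_dict_into_shards(
    state_dict,
    filename_pattern="model{suffix}.safetensors",
    max_shard_size="5GB",
)
# split.is_sharded says whether an index file is required;
# split.filename_to_tensors maps each shard file to the tensors it must hold.
for shard_file, tensor_names in split.filename_to_tensors.items():
    print(shard_file, tensor_names[:2], "...")
```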
@@ -98,19 +90,12 @@ def _distributed_checkpoint_to_merged_weights(

     for shard_file, tensors in filename_to_tensors:
         shard = {tensor: state_dict[tensor] for tensor in tensors}
-        if safe_serialization:
-            safe_save_file(
-                shard, os.path.join(save_path_, shard_file), metadata={"format": "pt"}
-            )
-        else:
-            torch.save(shard, os.path.join(save_path_, shard_file))
+        safe_save_file(
+            shard, os.path.join(save_path_, shard_file), metadata={"format": "pt"}
+        )

     if index is not None:
-        save_index_file = (
-            SAFE_WEIGHTS_INDEX_NAME if safe_serialization else WEIGHTS_INDEX_NAME
-        )
-        save_index_file = os.path.join(save_path_, save_index_file)
+        save_index_file = os.path.join(save_path_, SAFE_WEIGHTS_INDEX_NAME)
         # Save the index as well
         with open(save_index_file, "w", encoding="utf-8") as fout:
             content = json.dumps(index, indent=2, sort_keys=True) + "\n"
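For reference, the safetensors-only branch produces one file per shard plus a JSON index mapping each tensor name to its shard. A toy sketch of that on-disk layout (file names and tensors are illustrative):

```python
import json
import os

import torch
from safetensors.torch import save_file

out_dir = "merged"
os.makedirs(out_dir, exist_ok=True)

shards = {
    "model-00001-of-00002.safetensors": {"embed.weight": torch.zeros(8, 8)},
    "model-00002-of-00002.safetensors": {"lm_head.weight": torch.zeros(8, 8)},
}
for shard_file, shard in shards.items():
    # metadata={"format": "pt"} marks the file as PyTorch-flavored safetensors.
    save_file(shard, os.path.join(out_dir, shard_file), metadata={"format": "pt"})

total_size = sum(
    t.numel() * t.element_size() for shard in shards.values() for t in shard.values()
)
index = {
    "metadata": {"total_size": total_size},
    "weight_map": {name: fname for fname, shard in shards.items() for name in shard},
}
index_path = os.path.join(out_dir, "model.safetensors.index.json")
with open(index_path, "w", encoding="utf-8") as fout:
    fout.write(json.dumps(index, indent=2, sort_keys=True) + "\n")
```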
@@ -123,13 +108,11 @@ def _distributed_checkpoint_to_merged_weights(
 def merge_fsdp_weights(
     checkpoint_dir: str,
     output_path: str,
-    safe_serialization: bool = False,
     remove_checkpoint_dir: bool = False,
 ):
     """
     Merge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if
-    `SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}/model.safetensors` if
-    `safe_serialization` else `pytorch_model.bin`.
+    `SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}/model.safetensors`.

     Note: this is a CPU-bound process.

@@ -138,8 +121,6 @@ def merge_fsdp_weights(
             The directory containing the FSDP checkpoints (can be either the model or optimizer).
         output_path (`str`):
             The path to save the merged checkpoint.
-        safe_serialization (`bool`, *optional*, defaults to `True`):
-            Whether to save the merged weights with safetensors (recommended).
         remove_checkpoint_dir (`bool`, *optional*, defaults to `False`):
             Whether to remove the checkpoint directory after merging.

@@ -177,7 +158,7 @@ def merge_fsdp_weights(
     if state.is_main_process:
         LOG.info(f"Merging FSDP weights from {checkpoint_dir_}")
         save_path = _distributed_checkpoint_to_merged_weights(
-            checkpoint_dir_, output_path, safe_serialization
+            checkpoint_dir_, output_path
         )
         LOG.info(f"Successfully merged FSDP weights and saved to {save_path}")
     if remove_checkpoint_dir:

@@ -210,7 +191,6 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
     merge_fsdp_weights(
         checkpoint_dir=str(fsdp_dir),
         output_path=output_path,
-        safe_serialization=True,
     )
     state = PartialState()
     state.wait_for_everyone()
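After this change the caller no longer picks a serialization format; the merge always emits `model.safetensors` under the output path. Typical usage, assuming a sharded-state-dict FSDP checkpoint directory (paths are illustrative, and the module path assumes the repo layout at the time of writing):

```python
from axolotl.cli.merge_sharded_fsdp_weights import merge_fsdp_weights

merge_fsdp_weights(
    checkpoint_dir="outputs/checkpoint-500/pytorch_model_fsdp_0",  # illustrative
    output_path="outputs/merged",
    remove_checkpoint_dir=False,  # keep the sharded checkpoint around
)
# -> outputs/merged/model.safetensors (plus model.safetensors.index.json when sharded)
```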
@@ -102,12 +102,10 @@ def do_quantize(
     LOG.info(f"Saving quantized model to: {str(Path(output_dir) / 'quantized')}.")
     model.save_pretrained(
         str(Path(output_dir) / "quantized"),
-        safe_serialization=False,
         progressbar=True,
     )
     tokenizer.save_pretrained(
         str(Path(output_dir) / "quantized"),
-        safe_serialization=False,
         progressbar=True,
         save_jinja_files=cfg.tokenizer_save_jinja_files,
     )

@@ -121,7 +119,7 @@ def do_quantize(
         hub_model_id.rstrip("-")
         + f"-{quantization_config_to_str[type(quantization_config)]}"
     )
-    model.push_to_hub(hub_model_id, safe_serialization=False)
+    model.push_to_hub(hub_model_id)
     tokenizer.push_to_hub(hub_model_id)
     if processor:
         processor.push_to_hub(hub_model_id)
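Since the quantize path (like the other call sites above) now writes and pushes safetensors unconditionally, any legacy `pytorch_model.bin` artifacts can be converted once up front. A minimal sketch, assuming a single-file legacy checkpoint (paths illustrative):

```python
import torch
from safetensors.torch import save_file

state_dict = torch.load("old/pytorch_model.bin", map_location="cpu", weights_only=True)
# safetensors cannot store aliased (shared-storage) tensors; clone to be safe.
state_dict = {k: v.clone().contiguous() for k, v in state_dict.items()}
save_file(state_dict, "old/model.safetensors", metadata={"format": "pt"})
```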
@@ -1,158 +0,0 @@
-"""
-monkeypatch for flex + packing
-"""
-
-import sys
-from typing import Callable, Optional, Union
-
-import torch
-from torch.nn.attention.flex_attention import BlockMask
-from transformers import Cache, PretrainedConfig
-from transformers.masking_utils import (
-    ALL_MASK_ATTENTION_FUNCTIONS,
-    _preprocess_mask_arguments,
-    and_masks,
-    causal_mask_function,
-    or_masks,
-)
-from transformers.utils import is_torch_greater_or_equal
-
-_is_torch_greater_or_equal_than_2_6 = is_torch_greater_or_equal("2.6", accept_dev=True)
-
-
-def create_causal_mask(
-    config: PretrainedConfig,
-    input_embeds: torch.Tensor,
-    attention_mask: torch.Tensor,
-    cache_position: torch.Tensor,
-    past_key_values: Optional[Cache],
-    or_mask_function: Optional[Callable] = None,
-    and_mask_function: Optional[Callable] = None,
-) -> Optional[Union[torch.Tensor, BlockMask]]:
-    """
-    Create a standard causal mask based on the attention implementation used (stored in the config). If `past_key_values`
-    has an HybridCache structure, this function will return the mask corresponding to one of the "full_attention" layers (to align
-    to what is needed in the `modeling_xxx.py` files).
-
-    Args:
-        config (`PretrainedConfig`):
-            The model config.
-        input_embeds (`torch.Tensor`):
-            The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
-            batch size, query length and dtype.
-        attention_mask (`torch.Tensor`, optional):
-            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
-            It can also be an already prepared 4D mask, in which case it is returned as-is.
-        cache_position (`torch.Tensor`):
-            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
-        past_key_values (`Cache`, optional):
-            The past key values, if we use a cache.
-        or_mask_function (`Callable`, optional):
-            An optional mask function to combine with the causal mask function (by doing the union of both). This is
-            useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
-        and_mask_function (`Callable`, optional):
-            An optional mask function to combine with the causal mask function (by doing the intersection of both). This is
-            useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
-    """
-    # If we have an HybridCache structure, here we want to create the mask for the full layers
-    if (
-        past_key_values
-        and hasattr(past_key_values, "is_sliding")
-        and False in past_key_values.is_sliding
-    ):
-        layer_idx = past_key_values.is_sliding.index(False)
-    else:
-        layer_idx = 0
-
-    original_attention_mask = (
-        None
-        if attention_mask is None
-        else attention_mask.clone().to(cache_position.device)
-    )
-    early_exit, attention_mask, kv_length, kv_offset = _preprocess_mask_arguments(
-        config, input_embeds, attention_mask, cache_position, past_key_values, layer_idx
-    )
-    if early_exit:
-        return attention_mask
-
-    batch_size, total_seq_len = cache_position.shape
-    key_length = total_seq_len
-    document_ids = torch.nn.functional.pad(
-        original_attention_mask, value=0, pad=(0, key_length)
-    )
-
-    batch_size, dtype = input_embeds.shape[0], input_embeds.dtype
-    if attention_mask is not None:
-
-        def causal_doc_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
-            """
-            Defines the logic of a block causal mask by combining both a standard causal mask
-            and a block diagonal document mask.
-            See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
-            for an illustration.
-            """
-            causal_mask_ = q_idx >= kv_idx  # not valid when decoding
-            document_mask = (
-                document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
-            )
-            final_mask = causal_mask_ & document_mask
-            return final_mask
-
-        mask_factory_function = causal_doc_mask_mod
-    else:
-        mask_factory_function = causal_mask_function
-    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
-
-    # Do not allow skip if we are compiling (this is to match BC)
-    allow_is_causal_skip = (
-        not past_key_values.is_compileable if past_key_values is not None else True
-    )
-
-    # Allow slight deviations from causal mask
-    if or_mask_function is not None:
-        if not _is_torch_greater_or_equal_than_2_6:
-            raise ValueError(
-                "Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6"
-            )
-        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
-        allow_is_causal_skip = False
-    if and_mask_function is not None:
-        if not _is_torch_greater_or_equal_than_2_6:
-            raise ValueError(
-                "Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6"
-            )
-        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
-        allow_is_causal_skip = False
-
-    # We now create the mask
-    causal_mask = mask_interface(
-        batch_size=batch_size,
-        cache_position=cache_position,
-        kv_length=kv_length,
-        kv_offset=kv_offset,
-        mask_function=mask_factory_function,
-        attention_mask=attention_mask,
-        allow_is_causal_skip=allow_is_causal_skip,  # additional kwarg for sdpa
-        dtype=dtype,  # Additional kwarg for eager
-        config=config,  # Pass the config as well, in case someone wants to easily have their own mask_interface
-    )
-    return causal_mask
-
-
-def patch_create_causal_mask(model_type):
-    import transformers.masking_utils
-
-    transformers.masking_utils.create_causal_mask = create_causal_mask
-
-    if model_type:
-        try:
-            # Dynamically import the module and attention class
-            module_path = f"transformers.models.{model_type}.modeling_{model_type}"
-            module = __import__(module_path)
-            module.create_causal_mask = create_causal_mask
-            del sys.modules[module_path]
-        except (ImportError, AttributeError) as e:
-            raise ValueError(
-                f"Could not import attention class for model_type: {model_type}. "
-                f"Error: {str(e)}"
-            ) from e
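The deleted monkeypatch existed to combine a causal mask with a block-diagonal document mask so packed samples cannot attend across document boundaries; newer transformers handles this upstream, making the patch redundant. The core idea in isolation, as a flex-attention sketch (shapes and the packed-document layout are illustrative; requires a recent PyTorch, as flex attention landed in 2.5 with CPU support following later):

```python
import torch
from torch.nn.attention.flex_attention import create_block_mask, flex_attention

# Two packed documents in one sequence of length 8: positions 0-2 are doc 0,
# positions 3-7 are doc 1. Packed samples must not attend across the boundary.
document_ids = torch.tensor([0, 0, 0, 1, 1, 1, 1, 1])


def causal_doc_mask_mod(b, h, q_idx, kv_idx):
    # Standard causal constraint AND same-document constraint.
    return (q_idx >= kv_idx) & (document_ids[q_idx] == document_ids[kv_idx])


block_mask = create_block_mask(
    causal_doc_mask_mod, B=None, H=None, Q_LEN=8, KV_LEN=8, device="cpu"
)
q = k = v = torch.randn(1, 2, 8, 16)  # (batch, heads, seq, head_dim)
out = flex_attention(q, k, v, block_mask=block_mask)
```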
@@ -216,7 +216,7 @@ class TrainerBuilderBase(abc.ABC):
     def _configure_warmup_and_logging(
         self, total_num_steps: int, training_args_kwargs: dict
     ):
-        warmup_steps = 0
+        warmup_steps: int | float = 0
         warmup_ratio = 0.0
         if self.cfg.warmup_steps is not None:
             warmup_steps = self.cfg.warmup_steps

@@ -230,6 +230,10 @@ class TrainerBuilderBase(abc.ABC):
         else:
             warmup_ratio = 0.03

+        # transformers v5
+        if warmup_ratio > 0.0 and warmup_steps == 0:
+            warmup_steps = warmup_ratio
+
         if warmup_steps == 1:
             warmup_steps = 2

@@ -242,7 +246,6 @@ class TrainerBuilderBase(abc.ABC):
             else max(min(int(0.005 * total_num_steps), 10), 1)
         )

-        training_args_kwargs["warmup_ratio"] = warmup_ratio
         training_args_kwargs["warmup_steps"] = warmup_steps

     def _configure_precision_settings(self, training_args_kwargs: dict):
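The `# transformers v5` branch folds the ratio into `warmup_steps` itself: a float in (0, 1) is forwarded as the warmup fraction, so `warmup_ratio` no longer needs to be passed separately. A sketch of the resulting semantics as this diff implies them (not a quote of the transformers implementation):

```python
def effective_warmup_steps(warmup_steps: int | float, total_num_steps: int) -> int:
    # A float in (0, 1) acts as a ratio of total steps; ints are used as-is.
    if 0.0 < warmup_steps < 1.0:
        return int(total_num_steps * warmup_steps)
    return int(warmup_steps)


assert effective_warmup_steps(100, 10_000) == 100
assert effective_warmup_steps(0.03, 10_000) == 300
```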
@@ -406,6 +409,9 @@ class TrainerBuilderBase(abc.ABC):
         if self.cfg.hub_strategy:
             training_args_kwargs["hub_strategy"] = self.cfg.hub_strategy

+        if self.cfg.hub_revision:
+            training_args_kwargs["hub_revision"] = self.cfg.hub_revision
+
     def _configure_save_and_eval_strategy(self, training_args_kwargs: dict):
         # save_strategy and save_steps
         if self.cfg.save_steps:

@@ -530,9 +536,7 @@ class TrainerBuilderBase(abc.ABC):
             "loraplus_lr_ratio",
             "loraplus_lr_embedding",
             "output_dir",
-            "save_safetensors",
             "save_only_model",
-            "include_tokens_per_second",
             "weight_decay",
             "seed",
             "dion_momentum",

@@ -545,6 +549,7 @@ class TrainerBuilderBase(abc.ABC):

         arg_map = {
             "dion_learning_rate": "dion_lr",
+            "include_num_input_tokens_seen": "include_tokens_per_second",
         }
         for kwarg, cfg_arg in arg_map.items():
             if hasattr(self.cfg, cfg_arg) and getattr(self.cfg, cfg_arg) is not None:

@@ -72,7 +72,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
         if self.cfg.include_tkps:
             callbacks.append(
                 TokensPerSecondCallback(
-                    self.cfg.tensor_parallel_size, self.cfg.context_parallel_size
+                    self.cfg.tensor_parallel_size,
+                    self.cfg.context_parallel_size,
+                    resume_from_checkpoint=self.cfg.resume_from_checkpoint,
                 )
             )
         return callbacks

@@ -244,7 +246,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
             ddp_find_unused_parameters
         )

-        training_arguments_kwargs["group_by_length"] = self.cfg.group_by_length
+        if self.cfg.group_by_length:
+            training_arguments_kwargs["train_sampling_strategy"] = "group_by_length"
         training_arguments_kwargs["curriculum_sampling"] = self.cfg.curriculum_sampling

         training_arguments_kwargs["sample_packing"] = bool(self.cfg.sample_packing)

@@ -371,6 +374,18 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
             # https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html
             data_collator_kwargs["pad_to_multiple_of"] = multiple

+        if self.cfg.use_eaft:
+            from functools import partial
+
+            from axolotl.monkeypatch.loss.eaft import eaft_loss
+
+            configured_eaft_loss = partial(
+                eaft_loss,
+                alpha=self.cfg.eaft_alpha if self.cfg.eaft_alpha is not None else 1.0,
+                k=self.cfg.eaft_k if self.cfg.eaft_k is not None else 20,
+            )
+            trainer_kwargs["compute_loss_func"] = configured_eaft_loss
+
         trainer_cls = self._get_trainer_cls()

         trainer_kwargs, trainer_cls = self.hook_pre_create_trainer(
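`compute_loss_func` is the stock `Trainer` hook for swapping the loss function; it is called with the model outputs, the labels, and the number of items in the batch, which is why a `functools.partial` is enough to bind the EAFT hyperparameters. A minimal sketch with a stand-in loss (plain scaled cross-entropy here, not the actual `eaft_loss`):

```python
from functools import partial

import torch


def scaled_ce_loss(outputs, labels, num_items_in_batch=None, alpha=1.0):
    """Stand-in loss using the Trainer's compute_loss_func calling convention."""
    # Shift so each position predicts the next token, as causal LMs expect.
    logits = outputs.logits[..., :-1, :].contiguous()
    shifted = labels[..., 1:].contiguous()
    loss = torch.nn.functional.cross_entropy(
        logits.view(-1, logits.size(-1)), shifted.view(-1), ignore_index=-100
    )
    return alpha * loss


# trainer = AxolotlTrainer(..., compute_loss_func=partial(scaled_ce_loss, alpha=0.5))
```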
@@ -435,7 +450,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
             or self.cfg.micro_batch_size > 1
         ):
             return DataCollatorForSeq2Seq(self.tokenizer, **kwargs)
-        if not (self.cfg.sample_packing and self.cfg.pretrain_multipack_attn):
+        if not (self.cfg.sample_packing and self.cfg.pretrain_multipack_attn) or (
+            self.cfg.micro_batch_size == 1 and is_eval is False
+        ):
             return None

         if self.cfg.model_config_type == "mamba":

@@ -11,7 +11,6 @@ from axolotl.core.trainers import (
 )
 from axolotl.core.trainers.dpo import DPOStrategy
 from axolotl.core.trainers.dpo.args import AxolotlDPOConfig
-from axolotl.core.trainers.grpo import GRPOStrategy
 from axolotl.integrations.base import PluginManager
 from axolotl.loaders.utils import ensure_dtype
 from axolotl.utils.callbacks.qat import QATCallback

@@ -52,12 +51,13 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
         trainer_cls = None
         trainer_cls_args = [self.model]

-        if self.cfg.rl is RLType.GRPO:
+        if self.cfg.rl in {RLType.GRPO, RLType.GDPO}:
+            from axolotl.core.trainers.grpo import GRPOStrategy
+
             trainer_cls = GRPOStrategy.get_trainer_class(
                 sequence_parallel=self.cfg.context_parallel_size > 1
             )
             trainer_cls_args.extend(GRPOStrategy.set_trainer_args(self.cfg))

             trainer_kwargs.update(GRPOStrategy.set_trainer_kwargs(self.cfg))

         elif self.cfg.rl in [RLType.DPO, RLType.IPO]:

@@ -134,19 +134,17 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
             if self.cfg.cpo_alpha is not None:
                 training_args_kwargs["cpo_alpha"] = self.cfg.cpo_alpha

-            # Handle when max_prompt_length == max_length from defaults
-            # CPOTrainer requires strictly less than
-            if (
-                training_args_kwargs["max_prompt_length"]
-                == training_args_kwargs["max_length"]
-            ):
-                training_args_kwargs["max_prompt_length"] -= 1
+            blocklist_args_kwargs.append("max_prompt_length")

         elif self.cfg.rl is RLType.ORPO:
             training_args_cls = AxolotlORPOConfig

+            blocklist_args_kwargs.append("max_prompt_length")
+
         elif self.cfg.rl is RLType.KTO:
             training_args_cls = AxolotlKTOConfig
+            # KTOConfig in TRL >= 0.27.0 no longer accepts max_prompt_length
+            blocklist_args_kwargs.append("max_prompt_length")

             training_args_kwargs["desirable_weight"] = (
                 self.cfg.kto_desirable_weight or 1.0

@@ -155,10 +153,16 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
                 self.cfg.kto_undesirable_weight or 1.0
             )

-        elif self.cfg.rl is RLType.GRPO:
+        elif self.cfg.rl in {RLType.GRPO, RLType.GDPO}:
+            from axolotl.core.trainers.grpo import GRPOStrategy
+
             training_args_cls = GRPOStrategy.get_training_args_class()
             training_args_kwargs.update(GRPOStrategy.set_training_args_kwargs(self.cfg))
             blocklist_args_kwargs = GRPOStrategy.get_blocklist_args_kwargs()
+            if self.cfg.rl is RLType.GDPO:
+                training_args_kwargs.setdefault(
+                    "multi_objective_aggregation", "normalize_then_sum"
+                )

         elif self.cfg.rl in [RLType.DPO, RLType.IPO]:
             training_args_cls = AxolotlDPOConfig
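Moving the `GRPOStrategy` import into the branches (here and out of the module header above) means the GRPO stack is only imported when a GRPO or GDPO run actually needs it, keeping non-RL startup lean and avoiding import-order problems. The pattern in isolation (string literals stand in for the `RLType` enum):

```python
def build_rl_trainer_cls(rl_type: str, sequence_parallel: bool = False):
    if rl_type in {"grpo", "gdpo"}:
        # Deferred import: only pay the GRPO import cost when actually used.
        from axolotl.core.trainers.grpo import GRPOStrategy

        return GRPOStrategy.get_trainer_class(sequence_parallel=sequence_parallel)
    raise ValueError(f"unsupported rl type: {rl_type}")
```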
@@ -2,6 +2,7 @@

 from __future__ import annotations

+import json
 import math
 import os
 from collections import defaultdict

@@ -24,7 +25,7 @@ from torch.utils.data import (
 from transformers import PreTrainedModel, Trainer
 from transformers.trainer import TRAINING_ARGS_NAME
 from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, has_length, seed_worker
-from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME, is_peft_available
+from transformers.utils import SAFE_WEIGHTS_NAME, is_peft_available
 from trl.trainer.utils import pad_to_length
 from typing_extensions import override

@@ -50,6 +51,8 @@ from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths

 LOG = get_logger(__name__)

+TOKENS_STATE_FILE = "tokens_state."
+
 REDUCTION_FNS = {
     "mean": torch.mean,
     "min": torch.min,
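The bookkeeping in the next hunk distinguishes trainable tokens (label positions that contribute to the loss, i.e. not `-100`) from total token slots in the batch; the former drives throughput numbers, the latter summary counts. The two counts in isolation:

```python
import torch

labels = torch.tensor([[-100, -100, 42, 43], [-100, 7, 8, 9]])

trainable_tokens = (labels != -100).sum()  # positions that actually train: 5
total_tokens = labels.numel()              # every slot in the batch: 8

# Under DDP each rank only sees its own shard, so ranks all-reduce before logging:
# torch.distributed.all_reduce(trainable_tokens, op=torch.distributed.ReduceOp.SUM)
```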
@@ -349,24 +352,33 @@ class AxolotlTrainer(
         # return (loss, outputs) if return_outputs else loss

         # track number of tokens for tokens per second calculation
-        if self.args.include_tkps:
+        if self.args.include_tkps and model.training:
             inputs_key = "labels" if "labels" in inputs else "input_ids"
-            num_tokens = (inputs[inputs_key] != -100).sum()
+            trainable_tokens = (inputs[inputs_key] != -100).sum()
+            total_tokens = inputs[inputs_key].numel()
+            total_tokens = torch.tensor(total_tokens, device=inputs[inputs_key].device)
+
             if is_distributed():
                 torch.distributed.all_reduce(
-                    num_tokens, op=torch.distributed.ReduceOp.SUM
+                    trainable_tokens, op=torch.distributed.ReduceOp.SUM
                 )
-            if hasattr(self.state, "num_tokens"):
-                self.state.num_tokens = (
-                    self.state.num_tokens + (inputs[inputs_key] != -100).sum().cpu()
-                )
-            else:
-                self.state.num_tokens = (inputs[inputs_key] != -100).sum().cpu()
+                torch.distributed.all_reduce(
+                    total_tokens, op=torch.distributed.ReduceOp.SUM
+                )

-            if hasattr(self.state, "total_tokens"):
-                self.state.total_tokens += num_tokens
-            else:
-                self.state.total_tokens = num_tokens
+            if not hasattr(self.state, "tokens"):
+                self.state.tokens = {
+                    "trainable": torch.zeros(1),
+                    "total": torch.zeros(1),
+                }
+
+            # trainable tokens for throughput and total token slots for summaries
+            self.state.tokens["trainable"] = (
+                self.state.tokens["trainable"] + trainable_tokens.detach().cpu()
+            )
+            self.state.tokens["total"] = self.state.tokens["total"] + total_tokens.cpu()
+            # Store per-step trainable tokens for throughput calculation
+            self.state.tokens["trainable_tokens"] = trainable_tokens.detach().cpu()

         if self.args.orpo_alpha:
             return self.orpo_compute_loss(

@@ -638,17 +650,20 @@ class AxolotlTrainer(
         except (ValueError, TypeError, FileNotFoundError):
             pass

-        if self.args.include_tkps and train_eval == "train":
+        if (
+            self.args.include_tkps
+            and train_eval == "train"
+            and hasattr(self.state, "tokens")
+        ):
             # each rank will log its own tokens per second
             # for logging_steps > 1 we obtain a moving average of this metric
-            logs["tokens_per_second_per_gpu"] = round(
+            logs["tokens/train_per_sec_per_gpu"] = round(
                 self.state.last_tokens_per_second.item() / self.args.logging_steps, 2
             )
-            if (
-                hasattr(self.state, "total_tokens")
-                and self.state.total_tokens is not None
-            ):
-                logs["total_tokens"] = int(self.state.total_tokens.item())
+            if "total" in self.state.tokens:
+                logs["tokens/total"] = int(self.state.tokens["total"].item())
+            if "trainable" in self.state.tokens:
+                logs["tokens/trainable"] = int(self.state.tokens["trainable"].item())

         del self._stored_metrics[train_eval]

@@ -683,6 +698,19 @@ class AxolotlTrainer(
         run_dir = self._get_output_dir(trial=trial)
         output_dir = os.path.join(run_dir, checkpoint_folder)
         os.makedirs(output_dir, exist_ok=True)

+        # Save total_tokens state if tracking is enabled
+        if self.args.include_tkps and hasattr(self.state, "tokens"):
+            tokens_state = {
+                "total": int(torch.as_tensor(self.state.tokens.get("total", 0)).item()),
+                "trainable": int(
+                    torch.as_tensor(self.state.tokens.get("trainable", 0)).item()
+                ),
+            }
+            tokens_state_path = os.path.join(output_dir, TOKENS_STATE_FILE)
+            with open(tokens_state_path, "w", encoding="utf-8") as f:
+                json.dump(tokens_state, f)
+
         return super()._save_checkpoint(model, trial, **kwargs)

     # TODO(wing): remove once https://github.com/huggingface/transformers/pull/39866/files is merged
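Each checkpoint now carries a small JSON sidecar with the cumulative token counters, so a resumed run can keep counting instead of restarting from zero. The restore side is not part of this hunk; a sketch of how it could be read back under the same layout (the loader below is hypothetical):

```python
import json
import os

import torch


def load_tokens_state(checkpoint_dir: str, tokens_state_file: str) -> dict:
    """Restore cumulative token counters saved next to a checkpoint, if any."""
    path = os.path.join(checkpoint_dir, tokens_state_file)
    if not os.path.exists(path):
        return {"total": torch.zeros(1), "trainable": torch.zeros(1)}
    with open(path, encoding="utf-8") as f:
        raw = json.load(f)
    return {key: torch.tensor([float(value)]) for key, value in raw.items()}
```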
@@ -691,6 +719,13 @@ class AxolotlTrainer(
         output_dir = output_dir if output_dir is not None else self.args.output_dir
         os.makedirs(output_dir, exist_ok=True)
         LOG.info(f"Saving model checkpoint to {output_dir}")
+        if state_dict is None:
+            state_dict = self.accelerator.get_state_dict(self.model)
+        if state_dict is not None:
+            state_dict = {
+                k: v.clone() if isinstance(v, torch.Tensor) else v
+                for k, v in state_dict.items()
+            }
         supported_classes = (
             (PreTrainedModel,)
             if not is_peft_available()

@@ -710,43 +745,38 @@ class AxolotlTrainer(
                 ).save_pretrained(
                     output_dir,
                     state_dict=state_dict,
-                    safe_serialization=self.args.save_safetensors,
                 )
             else:
                 LOG.info(
                     "Trainer.model is not a `PreTrainedModel`, only saving its state dict."
                 )
-                if self.args.save_safetensors:
-                    safetensors.torch.save_file(
-                        state_dict,
-                        os.path.join(output_dir, SAFE_WEIGHTS_NAME),
-                        metadata={"format": "pt"},
-                    )
-                else:
-                    torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
+                safetensors.torch.save_file(
+                    state_dict,
+                    os.path.join(output_dir, SAFE_WEIGHTS_NAME),
+                    metadata={"format": "pt"},
+                )
         else:
             self.model.save_pretrained(
                 output_dir,
                 state_dict=state_dict,
-                safe_serialization=self.args.save_safetensors,
                 is_main_process=self.accelerator.is_main_process,
            )

         if self.processing_class is not None:
             self.processing_class.save_pretrained(output_dir)
         elif (
             self.data_collator is not None
             and hasattr(self.data_collator, "tokenizer")
             and self.data_collator.tokenizer is not None
         ):
             LOG.info(
                 "Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`"
             )
             save_jinja_files = True
             if self.axolotl_cfg:
                 save_jinja_files = self.axolotl_cfg.tokenizer_save_jinja_files
             self.data_collator.tokenizer.save_pretrained(
                 output_dir, save_jinja_files=save_jinja_files
             )
         # Good practice: save your training arguments together with the trained model
         torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
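The `.clone()` pass added in `_save` above is what makes the unconditional safetensors path safe: safetensors refuses to serialize aliased tensors (views sharing storage, e.g. tied embeddings), and cloning gives each entry private storage. A minimal reproduction of the failure and the fix:

```python
import os
import tempfile

import torch
from safetensors.torch import save_file

weight = torch.randn(4, 4)
state_dict = {"embed.weight": weight, "lm_head.weight": weight}  # aliased storage

out = os.path.join(tempfile.mkdtemp(), "model.safetensors")
try:
    save_file(state_dict, out)
except RuntimeError as err:
    print("shared tensors rejected:", err)

# Cloning breaks the aliasing, so serialization succeeds.
save_file({k: v.clone() for k, v in state_dict.items()}, out, metadata={"format": "pt"})
```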
@@ -57,16 +57,18 @@ class AxolotlDPOTrainer(
     def tokenize_row(
         features,
         processing_class,
-        max_prompt_length,
-        max_completion_length,
-        add_special_tokens,
+        max_prompt_length: int | None = None,
+        max_completion_length: int | None = None,
+        add_special_tokens: bool = True,
+        is_chat: bool = False,
     ) -> Dict:
         res = DPOTrainer.tokenize_row(
             features,
             processing_class,
-            max_prompt_length,
-            max_completion_length,
-            add_special_tokens,
+            max_prompt_length=max_prompt_length,
+            max_completion_length=max_completion_length,
+            add_special_tokens=add_special_tokens,
+            is_chat=is_chat,
         )
         # fix when the tokenizer doesn't have a bos_token_id, e.g. Qwen
         if processing_class.bos_token is None and res["prompt_input_ids"][0] is None:
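Passing the options by keyword (and giving them defaults) keeps `tokenize_row` resilient to upstream TRL signature changes such as the new `is_chat` flag. A usage sketch, assuming the new TRL-style signature, the class from this diff being importable, and any hub tokenizer (field names follow the TRL convention):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any tokenizer shows the call shape
features = {"prompt": "2 + 2 =", "chosen": " 4", "rejected": " 5"}

row = AxolotlDPOTrainer.tokenize_row(  # assumes the class above is importable
    features,
    tokenizer,
    max_prompt_length=512,
    max_completion_length=512,
    add_special_tokens=True,
    is_chat=False,
)
# row holds prompt_input_ids / chosen_input_ids / rejected_input_ids lists
```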
Some files were not shown because too many files have changed in this diff.