release tag (#2799 )

feat: remove evalfirst callback with built-in trainer arg (#2797 )
KD fix w/ online distillation (#2700 ) [skip ci]
2025-06-17 12:13:27 -04:00 · 2025-06-17 12:09:33 -04:00 · 2025-06-17 12:09:13 -04:00 · 2025-06-15 16:47:02 -04:00 · 2025-06-14 11:54:06 -07:00 · 2025-06-14 11:53:43 -07:00
400 changed files with 15971 additions and 8451 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -16,8 +16,9 @@ on:
 jobs:
  build-base:
    if: github.repository_owner == 'axolotl-ai-cloud'
+    timeout-minutes: 480
    # this job needs to be run on self-hosted GPU runners...
-    runs-on: axolotl-gpu-runner
+    runs-on: ubuntu-latest-m
    strategy:
      fail-fast: false
      matrix:
@@ -28,42 +29,50 @@ jobs:
            python_version: "3.11"
            pytorch: 2.5.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base"
          - cuda: "124"
            cuda_version: 12.4.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.6.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base"
          - cuda: "126"
            cuda_version: 12.6.3
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.6.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base"
          - cuda: "126"
            cuda_version: 12.6.3
            cudnn_version: ""
            python_version: "3.11"
-            pytorch: 2.7.0
+            pytorch: 2.7.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base"
          - cuda: "128"
            cuda_version: 12.6.3
            cudnn_version: ""
            python_version: "3.11"
-            pytorch: 2.7.0
+            pytorch: 2.7.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: nightly
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: next
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base-nightly"
+#          # "next" is for release candidates of pytorch
+#          - cuda: "128"
+#            cuda_version: 12.8.1
+#            cudnn_version: ""
+#            python_version: "3.11"
+#            pytorch: next
+#            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+#            dockerfile: "Dockerfile-base-next"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -85,7 +94,60 @@ jobs:
        uses: docker/build-push-action@v4
        with:
          context: .
-          file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || matrix.pytorch == 'next' && './docker/Dockerfile-base-next' || './docker/Dockerfile-base' }}
+          file: ./docker/${{ matrix.dockerfile }}
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+          labels: ${{ steps.metadata.outputs.labels }}
+          build-args: |
+            CUDA_VERSION=${{ matrix.cuda_version }}
+            CUDNN_VERSION=${{ matrix.cudnn_version }}
+            CUDA=${{ matrix.cuda }}
+            PYTHON_VERSION=${{ matrix.python_version }}
+            PYTORCH_VERSION=${{ matrix.pytorch }}
+            TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
+  build-base-uv:
+    if: github.repository_owner == 'axolotl-ai-cloud'
+    timeout-minutes: 480
+    runs-on: ubuntu-latest-m
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - cuda: "126"
+            cuda_version: 12.6.3
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.6.0
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-uv-base"
+          - cuda: "128"
+            cuda_version: 12.8.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.7.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-uv-base"
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Docker metadata
+        id: metadata
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            axolotlai/axolotl-base-uv
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Build
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: ./docker/${{ matrix.dockerfile }}
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -9,6 +9,7 @@ on:
       - '.github/workflows/*.yml'
       - "*.[q]md"
       - "examples/**/*.y[a]?ml"
+       - ".pre-commit-config.yaml"
  workflow_dispatch:

 jobs:
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -29,7 +29,12 @@ jobs:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.7.0
+            pytorch: 2.7.1
+            axolotl_extras:
+          - cuda: 128
+            cuda_version: 12.8.1
+            python_version: "3.11"
+            pytorch: 2.7.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
@@ -92,7 +97,12 @@ jobs:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.7.0
+            pytorch: 2.7.1
+            axolotl_extras:
+          - cuda: 128
+            cuda_version: 12.8.1
+            python_version: "3.11"
+            pytorch: 2.7.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -3,7 +3,7 @@ name: docker-multigpu-tests-biweekly
 on:
  pull_request:
    paths:
-      - 'tests/e2e/multigpu/*.py'
+      - 'tests/e2e/multigpu/**.py'
      - 'requirements.txt'
      - 'setup.py'
      - 'pyproject.toml'
@@ -43,7 +43,7 @@ jobs:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.7.0
+            pytorch: 2.7.1
            axolotl_extras:
            num_gpus: 2
            nightly_build: "true"
@@ -59,7 +59,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==0.71.8 jinja2
+          pip install modal==1.0.2 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
--- a/.github/workflows/precommit-autoupdate.yml
+++ b/.github/workflows/precommit-autoupdate.yml
@@ -25,7 +25,6 @@ jobs:
          pre-commit autoupdate
          if [[ -n $(git status --porcelain) ]]; then
            echo "changes=true" >> $GITHUB_OUTPUT
-            git diff .pre-commit-config.yaml > pre-commit-update.diff
          fi

      - name: Create Pull Request
@@ -39,11 +38,3 @@ jobs:
          commit-message: "chore: update pre-commit hooks"
          body: |
            Automated PR to update pre-commit hooks to their latest versions.
-
-            <details>
-            <summary>Changes:</summary>
-
-            ```diff
-            ${{ steps.update.outputs.diff }}
-            ```
-            </details>
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -18,9 +18,96 @@ jobs:
        env:
          SKIP: no-commit-to-branch

+  preload-cache:
+    name: Preload HF cache
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python_version: ["3.11"]
+        pytorch_version: ["2.6.0"]
+    timeout-minutes: 20
+
+    env:
+      AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
+
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-v2
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python_version }}
+          cache: 'pip' # caching pip dependencies
+
+      - name: upgrade pip
+        run: |
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
+
+      - name: Install PyTorch
+        run: |
+          pip3 install torch==${{ matrix.pytorch_version }}
+
+      - name: Install dependencies
+        run: |
+          pip3 show torch
+          pip3 install --no-build-isolation -U -e .
+          python scripts/unsloth_install.py | sh
+          python scripts/cutcrossentropy_install.py | sh
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+
+      - name: Make sure PyTorch version wasn't clobbered
+        run: |
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
+
+      - name: Ensure axolotl CLI was installed
+        run: |
+          axolotl --help
+
+      - name: Pre-Download dataset fixture
+        run: |
+          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
+
+      - name: Run tests
+        run: |
+          pytest -v tests/conftest.py
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          files: ./coverage.xml
+          flags: unittests,pytorch-${{ matrix.pytorch_version }}
+          fail_ci_if_error: false
+
+      - name: cleanup pip cache
+        run: |
+          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
+
+      - name: Save HF cache
+        id: hf-cache
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
+
  pytest:
    name: PyTest
    runs-on: ubuntu-latest
+    needs: [preload-cache]
    strategy:
      fail-fast: false
      max-parallel: 2
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -44,115 +44,26 @@ jobs:
        env:
          SKIP: no-commit-to-branch

-  preload-cache:
-    name: Preload HF cache
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python_version: ["3.11"]
-        pytorch_version: ["2.6.0"]
-    timeout-minutes: 20
-
-    env:
-      AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
-
-    steps:
-      - name: Check out repository code
-        uses: actions/checkout@v4
-
-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python_version }}
-          cache: 'pip' # caching pip dependencies
-
-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
-
-      - name: Install PyTorch
-        run: |
-          pip3 install torch==${{ matrix.pytorch_version }}
-
-      - name: Install dependencies
-        run: |
-          pip3 show torch
-          pip3 install --no-build-isolation -U -e .
-          python scripts/unsloth_install.py | sh
-          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
-
-      - name: Make sure PyTorch version wasn't clobbered
-        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
-
-      - name: Ensure axolotl CLI was installed
-        run: |
-          axolotl --help
-
-      - name: Pre-Download dataset fixture
-        run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-
-      - name: Run tests
-        run: |
-          pytest -v tests/conftest.py
-
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
-        with:
-          token: ${{ secrets.CODECOV_TOKEN }}
-          files: ./coverage.xml
-          flags: unittests,pytorch-${{ matrix.pytorch_version }}
-          fail_ci_if_error: false
-
-      - name: cleanup pip cache
-        run: |
-          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
-
-      - name: Save HF cache
-        id: hf-cache
-        uses: actions/cache/save@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
-
  pytest:
    name: PyTest
    runs-on: ubuntu-latest
-    needs: [preload-cache]
+#    needs: [preload-cache]
    strategy:
      fail-fast: false
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
+        pytorch_version: ["2.5.1", "2.6.0", "2.7.1"]
    timeout-minutes: 20

    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
+      - name: Restore Cache from S3
+        id: hf-cache-restore-s3
+        run: |
+          mkdir -p /home/runner/.cache/huggingface/hub
+          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/  --use-compress-program unzstd

      - name: Setup Python
        uses: actions/setup-python@v5
@@ -210,26 +121,22 @@ jobs:
  pytest-sdist:
    name: PyTest from Source Dist
    runs-on: ubuntu-latest
-    needs: [preload-cache]
    strategy:
      fail-fast: false
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
+        pytorch_version: ["2.5.1", "2.6.0", "2.7.1"]
    timeout-minutes: 20

    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
+      - name: Restore Cache from S3
+        id: hf-cache-restore-s3
+        run: |
+          mkdir -p /home/runner/.cache/huggingface/hub
+          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/  --use-compress-program unzstd

      - name: Setup Python
        uses: actions/setup-python@v5
@@ -277,12 +184,124 @@ jobs:
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

  docker-e2e-tests-1st:
+    # Run this job first as a gate for running the remainder of the test matrix
    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
-    timeout-minutes: 90
+    timeout-minutes: 120
    needs: [pre-commit, pytest, pytest-sdist]

+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.6.0
+            num_gpus: 1
+            axolotl_extras: vllm
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.6.0
+            num_gpus: 1
+            axolotl_extras:
+            dockerfile: "Dockerfile-uv.jinja"
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install Modal
+        run: |
+          python -m pip install --upgrade pip
+          pip install modal==1.0.2 jinja2
+      - name: Update env vars
+        run: |
+          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
+          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
+          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
+          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
+          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
+          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
+          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
+      - name: Run tests job on Modal
+        run: |
+          modal run cicd.e2e_tests
+
+  docker-e2e-tests:
+    if: github.repository_owner == 'axolotl-ai-cloud'
+    # this job needs to be run on self-hosted GPU runners...
+    runs-on: [self-hosted, modal]
+    timeout-minutes: 120
+    # Only run the remainder of the matrix if the first e2e check passed;
+    # this is to save on wasted compute costs for known failures that get caught in the first run
+    needs: [pre-commit, pytest, docker-e2e-tests-1st]
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.6.0
+            num_gpus: 1
+            axolotl_extras: llmcompressor
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.5.1
+            num_gpus: 1
+            axolotl_extras:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.1
+            num_gpus: 1
+            axolotl_extras:
+          - cuda: 128
+            cuda_version: 12.8.1
+            python_version: "3.11"
+            pytorch: 2.7.1
+            num_gpus: 1
+            axolotl_extras:
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install Modal
+        run: |
+          python -m pip install --upgrade pip
+          pip install modal==1.0.2 jinja2
+      - name: Update env vars
+        run: |
+          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
+          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
+          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
+          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
+          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
+          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
+          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
+      - name: Run tests job on Modal
+        run: |
+          modal run cicd.e2e_tests
+
+  docker-e2e-cleanup:
+    runs-on: [self-hosted, modal]
+    timeout-minutes: 90
+    needs: [docker-e2e-tests]
+
    strategy:
      fail-fast: false
      matrix:
@@ -303,7 +322,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==0.71.8 jinja2
+          pip install modal==1.0.2 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
@@ -316,64 +335,4 @@ jobs:
          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
-          modal run cicd.e2e_tests
-
-  docker-e2e-tests:
-    if: github.repository_owner == 'axolotl-ai-cloud'
-    # this job needs to be run on self-hosted GPU runners...
-    runs-on: [self-hosted, modal]
-    timeout-minutes: 90
-    needs: [pre-commit, pytest, docker-e2e-tests-1st]
-
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            num_gpus: 1
-            axolotl_extras: llmcompressor
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.4.1
-            num_gpus: 1
-            axolotl_extras:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.5.1
-            num_gpus: 1
-            axolotl_extras:
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.0
-            num_gpus: 1
-            axolotl_extras:
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: Install Modal
-        run: |
-          python -m pip install --upgrade pip
-          pip install modal==0.71.8 jinja2
-      - name: Update env vars
-        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
-          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
-          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
-          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
-          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
-          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
-          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
-      - name: Run tests job on Modal
-        run: |
-          modal run cicd.e2e_tests
+          modal run cicd.cleanup
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,15 +19,15 @@ repos:
    hooks:
      - id: isort
 -   repo: https://github.com/PyCQA/flake8
-    rev: 7.1.2
+    rev: 7.2.0
    hooks:
    - id: flake8
 -   repo: https://github.com/pylint-dev/pylint
-    rev: v3.3.6
+    rev: v3.3.7
    hooks:
    - id: pylint
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.15.0
+    rev: v1.16.0
    hooks:
    - id: mypy
      additional_dependencies:
--- a/.runpod/src/config/config.yaml
+++ b/.runpod/src/config/config.yaml
@@ -242,16 +242,12 @@
 # early_stopping_patience: 3

 # # Specify a scheduler and kwargs to use with the optimizer
-# lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
+# lr_scheduler: # 'one_cycle' | empty for cosine
 # lr_scheduler_kwargs:

 # # For one_cycle optim
 # lr_div_factor: # Learning rate div factor

-# # For log_sweep optim
-# log_sweep_min_lr:
-# log_sweep_max_lr:
-
 # # Specify optimizer
 # # Valid values are driven by the Transformers OptimizerNames class, see:
 # # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
--- a/.runpod/src/handler.py
+++ b/.runpod/src/handler.py
@@ -57,8 +57,10 @@ async def handler(job):
    logger.info("Training Complete.")

    # Cleanup
-    del os.environ["WANDB_API_KEY"]
-    del os.environ["HF_TOKEN"]
+    if "WANDB_API_KEY" in os.environ:
+        del os.environ["WANDB_API_KEY"]
+    if "HF_TOKEN" in os.environ:
+        del os.environ["HF_TOKEN"]


 runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})
--- a/README.md
+++ b/README.md
@@ -22,28 +22,32 @@
    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
 </p>

-Axolotl is a tool designed to streamline post-training for various AI models.
-Post-training refers to any modifications or additional training performed on
-pre-trained models - including full model fine-tuning, parameter-efficient tuning (like
-LoRA and QLoRA), supervised fine-tuning (SFT), instruction tuning, and alignment
-techniques. With support for multiple model architectures and training configurations,
-Axolotl makes it easy to get started with these techniques.

-Axolotl is designed to work with YAML config files that contain everything you need to
-preprocess a dataset, train or fine-tune a model, run model inference or evaluation,
-and much more.
+## 🎉 Latest Updates
+
+- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral) to start training your own Magistral models with Axolotl!
+- 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
+- 2025/04: Llama 4 support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-4) to start training your own Llama 4 models with Axolotl's linearized version!
+- 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning.
+- 2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the [docs](https://docs.axolotl.ai/docs/multimodal.html) to fine-tune your own!
+- 2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the [docs](https://docs.axolotl.ai/docs/lora_optims.html) to give it a try.
+- 2025/02: Axolotl has added GRPO support. Dive into our [blog](https://huggingface.co/blog/axolotl-ai-co/training-llms-w-interpreter-feedback-wasm) and [GRPO example](https://github.com/axolotl-ai-cloud/grpo_code) and have some fun!
+- 2025/01: Axolotl has added Reward Modelling / Process Reward Modelling fine-tuning support. See [docs](https://docs.axolotl.ai/docs/reward_modelling.html).
+
+## ✨ Overview
+
+Axolotl is a tool designed to streamline post-training for various AI models.

 Features:

- Train various Huggingface models such as llama, pythia, falcon, mpt
- Supports fullfinetune, lora, qlora, relora, and gptq
- Customize configurations using a simple yaml file or CLI overwrite
- Load different dataset formats, use custom formats, or bring your own tokenized datasets
- Integrated with [xformers](https://github.com/facebookresearch/xformers), flash attention, [liger kernel](https://github.com/linkedin/Liger-Kernel), rope scaling, and multipacking
- Works with single GPU or multiple GPUs via FSDP or Deepspeed
- Easily run with Docker locally or on the cloud
- Log results and optionally checkpoints to wandb, mlflow or Comet
- And more!
+- **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more. We are compatible with HuggingFace transformers causal language models.
+- **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO), Multimodal, and Reward Modelling (RM) / Process Reward Modelling (PRM).
+- **Easy Configuration**: Re-use a single YAML file across dataset preprocessing, training, evaluation, quantization, and inference.
+- **Performance Optimizations**: [Multipacking](https://docs.axolotl.ai/docs/multipack.html), [Flash Attention](https://github.com/Dao-AILab/flash-attention), [Xformers](https://github.com/facebookresearch/xformers), [Flex Attention](https://pytorch.org/blog/flexattention/), [Liger Kernel](https://github.com/linkedin/Liger-Kernel), [Cut Cross Entropy](https://github.com/apple/ml-cross-entropy/tree/main), Sequence Parallelism (SP), LoRA optimizations, Multi-GPU training (FSDP1, FSDP2, DeepSpeed), Multi-node training (Torchrun, Ray), and many more!
+- **Flexible Dataset Handling**: Load from local, HuggingFace, and cloud (S3, Azure, GCP, OCI) datasets.
+- **Cloud Ready**: We ship [Docker images](https://hub.docker.com/u/axolotlai) and also [PyPI packages](https://pypi.org/project/axolotl/) for use on cloud platforms and local hardware.
+
+

 ## 🚀 Quick Start

@@ -51,7 +55,7 @@ Features:

 - NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
 - Python 3.11
- PyTorch ≥2.4.1
+- PyTorch ≥2.5.1

 ### Installation

@@ -81,19 +85,12 @@ axolotl train examples/llama-3/lora-1b.yml

 That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough.

-## ✨ Key Features
-
- **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more
- **Training Methods**: Full fine-tuning, LoRA, QLoRA, and more
- **Easy Configuration**: Simple YAML files to control your training setup
- **Performance Optimizations**: Flash Attention, xformers, multi-GPU training
- **Flexible Dataset Handling**: Use various formats and custom datasets
- **Cloud Ready**: Run on cloud platforms or local hardware

 ## 📚 Documentation

 - [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments
 - [Configuration Guide](https://docs.axolotl.ai/docs/config.html) - Full configuration options and examples
+- [Dataset Loading](https://docs.axolotl.ai/docs/dataset_loading.html) - Loading datasets from various sources
 - [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them
 - [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
 - [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
@@ -112,31 +109,6 @@ That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/ge

 Contributions are welcome! Please see our [Contributing Guide](https://github.com/axolotl-ai-cloud/axolotl/blob/main/.github/CONTRIBUTING.md) for details.

-## Supported Models
-
-|             | fp16/fp32 | lora | qlora | gptq | gptq w/flash attn | flash attn | xformers attn |
-|-------------|:----------|:-----|-------|------|-------------------|------------|--------------|
-| llama       | ✅         | ✅    | ✅     | ✅             | ✅                 | ✅          | ✅            |
-| Mistral     | ✅         | ✅    | ✅     | ✅             | ✅                 | ✅          | ✅            |
-| Mixtral-MoE | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
-| Mixtral8X22 | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
-| Pythia      | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
-| cerebras    | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
-| btlm        | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
-| mpt         | ✅         | ❌    | ❓     | ❌             | ❌                 | ❌          | ❓            |
-| falcon      | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
-| gpt-j       | ✅         | ✅    | ✅     | ❌             | ❌                 | ❓          | ❓            |
-| XGen        | ✅         | ❓    | ✅     | ❓             | ❓                 | ❓          | ✅            |
-| phi         | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
-| RWKV        | ✅         | ❓    | ❓     | ❓             | ❓                 | ❓          | ❓            |
-| Qwen        | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
-| Gemma       | ✅         | ✅    | ✅     | ❓             | ❓                 | ✅          | ❓            |
-| Jamba       | ✅         | ✅    | ✅     | ❓             | ❓                 | ✅          | ❓            |
-
-✅: supported
-❌: not supported
-❓: untested
-
 ## ❤️ Sponsors

 Thank you to our sponsors who help make Axolotl possible:
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -17,7 +17,9 @@ quartodoc:
        - convert
        - prompt_tokenizers
        - logging_config
-        - core.trainer_builder
+        - core.builders.base
+        - core.builders.causal
+        - core.builders.rl
        - core.training_args
        - core.chat.messages
        - core.chat.format.chatml
@@ -43,13 +45,37 @@ quartodoc:
        - cli.vllm_serve
        - cli.cloud.base
        - cli.cloud.modal_
+        - cli.quantize
    - title: Trainers
      desc: Training implementations
      contents:
        - core.trainers.base
        - core.trainers.trl
+        - core.trainers.mamba
+        - core.trainers.relora
        - core.trainers.dpo.trainer
        - core.trainers.grpo.trainer
+        - core.trainers.grpo.sampler
+        - core.trainers.utils
+    - title: Model Loading
+      desc: Functionality for loading and patching models, tokenizers, etc.
+      contents:
+        - loaders.model
+        - loaders.tokenizer
+        - loaders.processor
+        - loaders.adapter
+        - loaders.patch_manager
+        - loaders.constants
+    - title: Mixins
+      desc: Mixin classes for augmenting trainers
+      contents:
+        - core.trainers.mixins.optimizer
+        - core.trainers.mixins.rng_state_loader
+        - core.trainers.mixins.scheduler
+    - title: Context Managers
+      desc: Context managers for altering trainer behaviors
+      contents:
+        - utils.ctx_managers.sequence_parallel
    - title: Prompt Strategies
      desc: Prompt formatting strategies
      contents:
@@ -86,7 +112,7 @@ quartodoc:
        - kernels.swiglu
        - kernels.quantize
        - kernels.utils
-    - title: MonkeyPatches
+    - title: Monkey Patches
      desc: Runtime patches for model optimizations
      contents:
        - monkeypatch.llama_attn_hijack_flash
@@ -103,17 +129,16 @@ quartodoc:
        - monkeypatch.trainer_fsdp_optim
        - monkeypatch.transformers_fa_utils
        - monkeypatch.unsloth_
-        - monkeypatch.attention.mllama
        - monkeypatch.data.batch_dataset_fetcher
        - monkeypatch.mixtral
+        - monkeypatch.gradient_checkpointing.offload_cpu
+        - monkeypatch.gradient_checkpointing.offload_disk
    - title: Utils
      desc: Utility functions
      contents:
-        - utils.models
        - utils.tokenization
        - utils.chat_templates
        - utils.lora
-        - utils.lora_embeddings
        - utils.model_shard_quant
        - utils.bench
        - utils.freeze
@@ -124,7 +149,7 @@ quartodoc:
        - utils.optimizers.adopt
        - utils.data.pretraining
        - utils.data.sft
-        - utils.gradient_checkpointing.unsloth
+        - utils.quantization
    - title: Schemas
      desc: Pydantic data models for Axolotl config
      contents:
@@ -174,12 +199,14 @@ quartodoc:
        - utils.callbacks.lisa
        - utils.callbacks.mlflow_
        - utils.callbacks.comet_
-
+        - utils.callbacks.qat
 website:
  title: "Axolotl"
  description: "We make fine-tuning accessible, scalable, and fun"
  favicon: favicon.jpg

+  google-analytics: "G-9KYCVJBNMQ"
+
  navbar:
    logo: image/axolotl_logo_digital_white.svg
    title: false
@@ -232,6 +259,8 @@ website:
            - docs/lr_groups.qmd
            - docs/lora_optims.qmd
            - docs/dataset_loading.qmd
+            - docs/qat.qmd
+            - docs/quantize.qmd

        - section: "Core Concepts"
          contents:
--- a/cicd/Dockerfile-uv.jinja
+++ b/cicd/Dockerfile-uv.jinja
@@ -0,0 +1,52 @@
+FROM axolotlai/axolotl-base-uv:{{ BASE_TAG }}
+
+ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
+ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
+ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
+ENV CUDA="{{ CUDA }}"
+ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
+ENV GITHUB_REF="{{ GITHUB_REF }}"
+ENV GITHUB_SHA="{{ GITHUB_SHA }}"
+ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
+ENV HF_HOME="{{ HF_HOME }}"
+
+RUN apt-get update && \
+    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
+
+WORKDIR /workspace
+
+RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
+
+WORKDIR /workspace/axolotl
+
+RUN git fetch origin +$GITHUB_REF && \
+    git checkout FETCH_HEAD
+
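+# Nightly builds install the core HF libraries from their main branches (AXOLOTL_EXTRAS is appended to the extras list in the install step further down)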
+RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
+        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
+        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
+        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
+        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
+        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
+    fi
+
+RUN uv pip install packaging==23.2 setuptools==75.8.0
+RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
+        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+    else \
+        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
+    fi
+
+RUN python scripts/unsloth_install.py --uv | sh
+RUN python scripts/cutcrossentropy_install.py --uv | sh
+
+# So we can test the Docker image
+RUN uv pip install -r requirements-dev.txt -r requirements-tests.txt
+
+# fix so that git fetch/pull from remote works
+RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
+    git config --get remote.origin.fetch
+
+# helper for huggingface-login cli
+RUN git config --global credential.helper store
--- a/src/axolotl/monkeypatch/attention/ring_attn/adapters/init.py
+++ b/src/axolotl/monkeypatch/attention/ring_attn/adapters/init.py
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -18,7 +18,7 @@ pytest -v --durations=10 \
  --cov-append

 # Run patched tests excluding lora kernels with coverage append
-pytest -v --durations=10 \
+pytest --full-trace -vvv --durations=10 \
  --ignore=tests/e2e/patched/lora_kernels \
  /workspace/axolotl/tests/e2e/patched \
  --cov=axolotl \
--- a/cicd/cleanup.py
+++ b/cicd/cleanup.py
@@ -0,0 +1,19 @@
+"""Modal app to run axolotl GPU cleanup"""
+
+from .single_gpu import VOLUME_CONFIG, app, cicd_image, run_cmd
+
+
+@app.function(
+    image=cicd_image,
+    timeout=60 * 60,  # 60 min
+    cpu=8.0,
+    memory=131072,  # presumably MiB (i.e. 128 GiB) — confirm against Modal docs
+    volumes=VOLUME_CONFIG,  # from .single_gpu: mounts the HF hub cache volume that cleanup.sh prunes
+)
+def cleanup():  # Modal function: runs ./cicd/cleanup.sh with /workspace/axolotl as the working directory
+    run_cmd("./cicd/cleanup.sh", "/workspace/axolotl")  # run_cmd exits with the script's code if it is non-zero
+
+
+@app.local_entrypoint()
+def main():  # local entrypoint of the Modal app; its only job is to trigger cleanup
+    cleanup.remote()  # presumably executes cleanup inside the Modal container rather than locally — see Modal docs
--- a/cicd/cleanup.sh
+++ b/cicd/cleanup.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -e
+
+# cleanup old cache files for datasets processing and intermediate mappings
+find /workspace/data/huggingface-cache/hub/datasets -name "cache-*" -type f -mtime +1 -exec rm {} \;
+find /workspace/data/huggingface-cache/hub/datasets -name "*.lock" -type f -mtime +1 -exec rm {} \;
--- a/cicd/e2e_tests.py
+++ b/cicd/e2e_tests.py
@@ -1,75 +1,12 @@
 """Modal app to run axolotl GPU tests"""

-# pylint: disable=duplicate-code
-
-import os
-import pathlib
-import tempfile
-
-import jinja2
-import modal
-from jinja2 import select_autoescape
-from modal import App, Image
-
-cicd_path = pathlib.Path(__file__).parent.resolve()
-
-template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
-template_env = jinja2.Environment(
-    loader=template_loader, autoescape=select_autoescape()
-)
-df_template = template_env.get_template("Dockerfile.jinja")
-
-df_args = {
-    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
-    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
-    "CUDA": os.environ.get("CUDA", "121"),
-    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
-    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
-    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
-    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
-    "HF_HOME": "/workspace/data/huggingface-cache/hub",
-}
-
-dockerfile_contents = df_template.render(**df_args)
-
-temp_dir = tempfile.mkdtemp()
-with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
-    f.write(dockerfile_contents)
-
-cicd_image = Image.from_dockerfile(
-    pathlib.Path(temp_dir) / "Dockerfile",
-    context_mount=None,
-    force_build=True,
-    gpu="A10G",
-).env(df_args)
-
-app = App("Axolotl CI/CD", secrets=[])
-
-hf_cache_volume = modal.Volume.from_name(
-    "axolotl-ci-hf-hub-cache", create_if_missing=True
-)
-VOLUME_CONFIG = {
-    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
-}
-
-N_GPUS = int(os.environ.get("N_GPUS", 1))
-GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
-
-
-def run_cmd(cmd: str, run_folder: str):
-    import subprocess  # nosec
-
-    # Propagate errors from subprocess.
-    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
-        exit(exit_code)  # pylint: disable=consider-using-sys-exit
+from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd


@app.function(
    image=cicd_image,
    gpu=GPU_CONFIG,
-    timeout=60 * 60,
+    timeout=90 * 60,  # 90 min
    cpu=8.0,
    memory=131072,
    volumes=VOLUME_CONFIG,
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -24,9 +24,9 @@ df_template = template_env.get_template("Dockerfile.jinja")
 df_args = {
    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
-    "CUDA": os.environ.get("CUDA", "121"),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.5.1"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu124-2.5.1"),
+    "CUDA": os.environ.get("CUDA", "124"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
@@ -55,7 +55,7 @@ VOLUME_CONFIG = {
 }

 N_GPUS = int(os.environ.get("N_GPUS", 2))
-GPU_CONFIG = modal.gpu.H100(count=N_GPUS)
+GPU_CONFIG = f"H100:{N_GPUS}"


 def run_cmd(cmd: str, run_folder: str):
@@ -70,7 +70,7 @@ def run_cmd(cmd: str, run_folder: str):
    image=cicd_image,
    gpu=GPU_CONFIG,
    timeout=90 * 60,
-    cpu=8.0,
+    cpu=16.0,
    memory=131072 * N_GPUS,
    volumes=VOLUME_CONFIG,
 )
--- a/cicd/single_gpu.py
+++ b/cicd/single_gpu.py
@@ -0,0 +1,68 @@
+"""Modal app to run axolotl GPU tests"""
+
+# pylint: disable=duplicate-code
+
+import os
+import pathlib
+import tempfile
+
+import jinja2
+import modal
+import modal.experimental
+from jinja2 import select_autoescape
+from modal import App
+
+cicd_path = pathlib.Path(__file__).parent.resolve()  # directory containing this module; Dockerfile templates are loaded from here
+
+template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
+template_env = jinja2.Environment(
+    loader=template_loader, autoescape=select_autoescape()
+)
+dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")  # template file name; override via the E2E_DOCKERFILE env var
+df_template = template_env.get_template(dockerfile)
+
+df_args = {  # values rendered into the Dockerfile template and also set as env vars on the built image
+    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
+    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.5.1"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu124-2.5.1"),
+    "CUDA": os.environ.get("CUDA", "124"),
+    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
+    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
+    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
+    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
+    "HF_HOME": "/workspace/data/huggingface-cache/hub",  # fixed (not read from the environment); same path as the volume mount in VOLUME_CONFIG
+}
+
+dockerfile_contents = df_template.render(**df_args)  # fill the Jinja template with the build arguments
+
+temp_dir = tempfile.mkdtemp()  # created at import time; this module never removes it
+with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
+    f.write(dockerfile_contents)
+
+cicd_image = modal.experimental.raw_dockerfile_image(
+    pathlib.Path(temp_dir) / "Dockerfile",
+    # context_mount=None,
+    force_build=True,
+    # gpu="A10G",
+).env(df_args)  # expose the same build arguments as environment variables in the image
+
+app = App("Axolotl CI/CD", secrets=[])  # shared Modal app; imported by the other cicd modules (e2e_tests.py, cleanup.py)
+
+hf_cache_volume = modal.Volume.from_name(
+    "axolotl-ci-hf-hub-cache", create_if_missing=True
+)
+VOLUME_CONFIG = {  # mount path -> volume; the path matches HF_HOME in df_args
+    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
+}
+
+N_GPUS = int(os.environ.get("N_GPUS", 1))  # GPU count from the N_GPUS env var (default 1)
+GPU_CONFIG = f"L40S:{N_GPUS}"  # "<gpu type>:<count>" string, passed as gpu= to @app.function by importers
+
+
+def run_cmd(cmd: str, run_folder: str):  # run `cmd` with `run_folder` as the working directory; exit with its code on failure
+    import subprocess  # nosec
+
+    # Propagate errors from subprocess: a non-zero return code terminates this process with the same code. `cmd` is split on whitespace only (no shell quoting), so arguments containing spaces are not supported.
+    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
+        exit(exit_code)  # pylint: disable=consider-using-sys-exit
--- a/codecov.yml
+++ b/codecov.yml
@@ -19,7 +19,7 @@ coverage:
        if_no_uploads: error
        if_not_found: success
        if_ci_failed: error
-        only_pulls: false
+        only_pulls: true
        flags: null
        paths: null
    patch:
--- a/deepspeed_configs/zero2_torch_compile.json
+++ b/deepspeed_configs/zero2_torch_compile.json
@@ -0,0 +1,31 @@
+{
+  "compile": {
+    "disable": false,
+    "backend": "inductor"
+  },
+  "zero_optimization": {
+    "stage": 2,
+    "offload_optimizer": {
+      "device": "cpu"
+    },
+    "contiguous_gradients": true,
+    "overlap_comm": true
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "fp16": {
+    "enabled": "auto",
+    "auto_cast": false,
+    "loss_scale": 0,
+    "initial_scale_power": 32,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
+}
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -38,6 +38,6 @@ RUN git lfs install --skip-repo && \
    # The base image ships with `pydantic==1.8.2` which is not working
    pip3 install -U --no-cache-dir pydantic==1.10.10

-RUN if [ "$PYTORCH_VERSION" = "2.7.0" ] ; then \
+RUN if [ "$PYTORCH_VERSION" = "2.7.1" ] ; then \
        pip3 install flash-attn==2.7.4.post1; \
    fi
--- a/docker/Dockerfile-base-next
+++ b/docker/Dockerfile-base-next
@@ -29,7 +29,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
 WORKDIR /workspace

 RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
-    python3 -m pip install --no-cache-dir -U torch==2.7.0 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
+    python3 -m pip install --no-cache-dir -U torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"

--- a/docker/Dockerfile-uv-base
+++ b/docker/Dockerfile-uv-base
@@ -0,0 +1,40 @@
+ARG CUDA_VERSION="12.6.3"
+ARG CUDNN_VERSION=""
+ARG UBUNTU_VERSION="22.04"
+ARG MAX_JOBS=4
+
+FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
+
+ARG PYTHON_VERSION="3.11"
+ARG PYTORCH_VERSION="2.6.0"
+ARG CUDA="126"
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
+
+ENV PYTHON_VERSION=$PYTHON_VERSION
+ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
+ENV UV_TORCH_BACKEND="cu${CUDA}"
+
+RUN apt-get update \
+    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config curl && rm -rf /var/lib/apt/lists/* \
+    && git lfs install --skip-repo \
+    && curl -LsSf https://astral.sh/uv/install.sh | sh
+
+ENV PATH="/root/.local/bin:${PATH}"
+
+RUN uv python install ${PYTHON_VERSION}
+
+WORKDIR /workspace
+
+RUN uv venv --no-project --relocatable axolotl-venv
+
+ENV PATH="/workspace/axolotl-venv/bin:${PATH}"
+
+RUN uv pip install packaging setuptools wheel psutil \
+    && uv pip install torch==${PYTORCH_VERSION} \
+    && uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \
+    && uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \
+    && uv pip install awscli pydantic
+
+RUN if [ "$PYTORCH_VERSION" = "2.7.1" ] ; then \
+        uv pip install --no-build-isolation flash-attn==2.7.4.post1; \
+    fi
--- a/docs/cli.qmd
+++ b/docs/cli.qmd
@@ -209,6 +209,16 @@ axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir

 This would be necessary to use with other frameworks. If you have an adapter, merge it with the non-quantized linearized model before delinearizing.

+### quantize
+
+Quantizes a model using the quantization configuration specified in your YAML file.
+
+```bash
+axolotl quantize config.yml
+```
+
+See [Quantization](./quantize.qmd) for more details.
+

 ## Legacy CLI Usage

--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -27,6 +27,8 @@ trust_remote_code:
 tokenizer_use_fast:
 # Whether to use the legacy tokenizer setting, defaults to True
 tokenizer_legacy:
+# Whether to use mistral-common tokenizer. If set to True, it will use the mistral-common tokenizer.
+tokenizer_use_mistral_common:
 # Resize the model embeddings when new tokens are added to multiples of 32
 # This is reported to improve training speed on some models
 resize_token_embeddings_to_32x:
@@ -65,6 +67,20 @@ bnb_config_kwargs:
  bnb_4bit_quant_type: nf4
  bnb_4bit_use_double_quant: true

+# quantization aware training
+qat:
+  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
+  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8"
+  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
+  fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after
+
+# post-training quantization
+quantization:
+  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8
+  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
+  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
+  quantize_embedding: # Optional[bool] = False. Whether to quantize the embedding layer.
+

 # Whether you are training a 4-bit GPTQ quantized model
 gptq: true
@@ -98,8 +114,10 @@ plugins:
  # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

 # A list of one or more datasets to finetune the model with
+# See https://docs.axolotl.ai/docs/dataset_loading.html for a guide on loading datasets
+# See https://docs.axolotl.ai/docs/dataset-formats/ for a guide on dataset formats
 datasets:
-  # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
+  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory
  - path: vicgalle/alpaca-gpt4
    # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
@@ -157,6 +175,10 @@ datasets:
    # Key containing the messages (default: "messages")
    field_messages: messages

+    # Key containing the tools (default: "tools")
+    # Must be a list[dict] and follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
+    field_tools: tools
+
    # Key containing the system message (default: "system")
    # If the system message is not present in the dataset sample, it will be loaded from the field_system property.
    field_system: system
@@ -221,7 +243,7 @@ datasets:
 # The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
 shuffle_merged_datasets: true

-Deduplicates datasets and test_datasets with identical entries.
+# Deduplicates datasets and test_datasets with identical entries.
 dataset_exact_deduplication: true

 # A list of one or more datasets to eval the model with.
@@ -270,10 +292,25 @@ trl:

  num_generations: # Optional[int]. Number of generations to sample.
  log_completions: # Optional[bool]. Whether to log completions.
+  num_completions_to_print: # Optional[int]. Number of completions to print when log_completions is True.

  sync_ref_model: # Optional[bool]. Whether to sync the reference model.
  ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.
  ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.
+  scale_rewards: # Optional[bool]. Whether to scale rewards by their standard deviation.
+
+  temperature: # Optional[float]. Sampling temperature for the GRPO policy.
+  top_p: # Optional[float]. Top-p sampling probability for the generation policy.
+  top_k: # Optional[int]. Top-k sampling for the generation policy.
+  min_p: # Optional[float]. Minimum probability for the generation policy.
+  repetition_penalty: # Optional[float]. Penalty for tokens that appear in prompt and generated text.
+
+  num_iterations: # Optional[int]. Number of iterations per batch (μ) for GRPO.
+  epsilon: # Optional[float]. Epsilon value for clipping in the GRPO algorithm.
+  epsilon_high: # Optional[float]. Upper-bound epsilon value for clipping in the GRPO algorithm.
+  use_liger_loss: # Optional[bool]. Whether to use Liger loss for GRPO.
+  loss_type: # Optional[str]. Loss formulation to use. Supported values: grpo, bnpo, dr_grpo.
+  mask_truncated_completions: # Optional[bool]. Whether to exclude truncated completions from loss calculation.


 # reward modelling: `True` or `False`
@@ -483,6 +520,7 @@ output_dir: ./completed-model
 # setting to `auto` will enable torch compile when torch>=2.5.1
 torch_compile:  # Optional[Union[Literal["auto"], bool]]
 torch_compile_backend:  # Optional[str]
+torch_compile_mode:  # 'default' | 'reduce-overhead' | 'max-autotune'

 # Training hyperparameters

@@ -505,6 +543,7 @@ save_strategy: # Set to `"no"` to skip checkpoint saves, `"epoch"` at end of eac
 save_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps
 saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
 save_total_limit: # Checkpoints saved at a time
+save_only_model: # Save only the model weights, skipping the optimizer. Using this means you can't resume from checkpoints.
 # Maximum number of iterations to train for. It precedes num_epochs which means that
 # if both are set, num_epochs will not be guaranteed.
 # e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
@@ -528,7 +567,7 @@ profiler_steps: # enable the pytorch profiler to capture the first N steps of tr
 loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
 loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)

-# Save model as safetensors (require safetensors package)
+# Save model as safetensors (requires the safetensors package). Default: True
 save_safetensors:

 # Whether to mask out or include the human's prompt from the training labels
@@ -538,7 +577,7 @@ train_on_inputs: false
 # Note that training loss may have an oscillating pattern with this enabled.
 group_by_length: false

-# Whether to use gradient checkpointing. Available options are: true, false, "offload".
+# Whether to use gradient checkpointing. Available options are: true, false, "offload", "offload_disk".
 # https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
 gradient_checkpointing: false
 # additional kwargs to pass to the trainer for gradient checkpointing
@@ -550,7 +589,24 @@ gradient_checkpointing: false
 early_stopping_patience: 3

 # Specify a scheduler and kwargs to use with the optimizer
-lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | 'linear' | 'cosine_with_restarts' | 'polynomial' | 'constant' | 'constant_with_warmup' | 'inverse_sqrt' | 'reduce_lr_on_plateau' | 'cosine_with_min_lr' | 'warmup_stable_decay' | empty for cosine
+# Valid values are driven by the Transformers SchedulerType class, see:
+# https://github.com/huggingface/transformers/blob/5f4ecf2d9f867a1255131d2461d75793c0cf1db2/src/transformers/trainer_utils.py#L420
+# Valid values include
+# - 'linear'
+# - 'cosine' (default)
+# - 'cosine_with_restarts'
+# - 'polynomial'
+# - 'constant'
+# - 'constant_with_warmup'
+# - 'inverse_sqrt'
+# - 'reduce_lr_on_plateau'
+# - 'cosine_with_min_lr'
+# - 'warmup_stable_decay'
+
+# Additional schedulers include:
+# - 'one_cycle'
+# - 'rex'
+lr_scheduler:
 lr_scheduler_kwargs:
 cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
 cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
@@ -568,7 +624,7 @@ lr_div_factor: # Learning rate div factor
 #
 # Valid values for 'optimizer' include:
 # - adamw_torch
-# - adamw_torch_fused
+# - adamw_torch_fused (default)
 # - adamw_torch_xla
 # - adamw_torch_npu_fused
 # - adamw_apex_fused
@@ -612,6 +668,7 @@ lr_div_factor: # Learning rate div factor
 # - optimi_adamw
 # - ao_adamw_8bit
 # - ao_adamw_fp8
+# - came_pytorch
 optimizer:
 # Dictionary of arguments to pass to the optimizer
 optim_args:
@@ -631,7 +688,9 @@ weight_decay:
 # adamw hyperparams
 adam_beta1:
 adam_beta2:
+adam_beta3:  # only used for CAME Optimizer
 adam_epsilon:
+adam_epsilon2:  # only used for CAME Optimizer
 # Gradient clipping max norm
 max_grad_norm:

--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -52,7 +52,9 @@ We recommend checking the below examples for other usecases.

 ### Examples

-1. (Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
+#### Training on last message
+
+(Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.

 ```yaml
 datasets:
@@ -66,7 +68,9 @@ datasets:
 If you receive an error like "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null.", it means the tokenizer does not have a default `chat_template`. Follow the examples below instead to set a custom `chat_template`.
 :::

-2. Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.
+#### Overriding default chat template
+
+Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.

 ```yaml
 chat_template: gemma # this overwrites the tokenizer's chat_template
@@ -76,7 +80,13 @@ datasets:
    roles_to_train: ["assistant"]  # default value
 ```

-3. Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.
+::: {.callout-note}
+If you want to use the tokenizer's built-in chat template, use `chat_template: tokenizer_default` (this is the default).
+:::
+
+#### Using default chat template with fallback
+
+Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.

 ```yaml
 chat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer's chat_template
@@ -85,7 +95,9 @@ datasets:
    type: chat_template
 ```

-4. Using a custom jinja template on OpenAI messages format, training on all assistant messages.
+#### Custom Jinja template
+
+Using a custom jinja template on OpenAI messages format, training on all assistant messages.

 ```yaml
 # chat_template: jinja # `jinja` will be implied if the `chat_template_jinja` is set and this field is empty
@@ -100,7 +112,9 @@ datasets:
 Please make sure that your `tokenizer.eos_token` is same as EOS (End-of-Sequence) token in template. Otherwise, set `eos_token` under `special_tokens: `.
 :::

-5. If you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: ` which defaults to turn.
+#### Using template with different token for EOT and EOS
+
+- If you are using a template that has a different EOT (End-of-Turn) token from the EOS token, or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: `, which defaults to `turn`.

 ```yaml
 eot_tokens:
@@ -125,7 +139,7 @@ Using `eot_tokens` requires each token that exists in `chat_template` to be a si
 You can add those tokens as new tokens under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `. See [config](../config.qmd) for more details.
 :::

-6. Continuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set `train_on_eos: last`.
+- Continuing from the previous example, if you want to train on the EOT tokens of all trainable turns but only on the last EOS token, set `train_on_eos: last`.

 ```yaml
 eot_tokens:
@@ -145,7 +159,73 @@ If EOS token only appears at the end of a prompt, `train_on_eos: last` is equiva
 :::


-7. (Advanced) Using fine-grained control over tokens and turns to train in a conversation
+#### Using tool use
+
+Instead of passing `tools` via the system prompt, you can keep `tools` in a separate column and load it via `chat_template`, letting the template build the tool prompt dynamically.
+
+```json
+{
+    "tools": [
+        {
+            "type": "...",
+            "function": {
+                "name": "...",
+                "description": "...",
+                "parameters": {
+                    "type": "...",
+                    "properties": {
+                        // ...
+                    },
+                    "required": ["..."],
+                },
+            },
+        },
+    ],
+    "messages": [
+        // ...
+        {
+            "role": "assistant", // call the function via assistant
+            "tool_calls": [
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "...",
+                        "arguments": {
+                            "...": "...",
+                        }
+                    }
+                }
+            ]
+        },
+        {
+            "role": "tool",
+            "name": "...",
+            "content": "..."
+        },
+    ],
+}
+```
+
+::: {.callout-note}
+Tools need to follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
+:::
+
+```yaml
+chat_template: llama4
+datasets:
+  - path: ...
+    type: chat_template
+    # field_tools: tools # default is `tools`
+```
+
+::: {.callout-tip}
+Look into the `chat_template` you are using to see if it supports `tools` and what the expected role is for the tool answer. In the example above, the tool answer is expected to be in the `tool` or `ipython` role for the `llama4` template.
+:::
+
+
+#### Using fine-grained control over token masking
+
+(Advanced) Using fine-grained control over tokens and turns to train in a conversation

 For a data sample that looks like:

@@ -196,7 +276,9 @@ datasets:
 It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
 :::

-8. (For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
+#### Reasoning split
+
+(For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.

 ```yaml
 datasets:
--- a/docs/dataset-formats/index.qmd
+++ b/docs/dataset-formats/index.qmd
@@ -36,10 +36,6 @@ It is typically recommended to save your dataset as `.jsonl` due to its flexibil

 Axolotl supports loading from a Hugging Face hub repo or from local files.

-::: {.callout-important}
-For pre-training only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts.
-:::
-
 ### Pre-training from Hugging Face hub datasets

 As an example, to train using a Hugging Face dataset `hf_org/name`, you can pass the following config:
@@ -77,18 +73,21 @@ datasets:
    type: completion
 ```

-From local files (either example works):
+From local files:

 ```yaml
 datasets:
  - path: A.jsonl
    type: completion

-  - path: json
-    data_files: ["A.jsonl", "B.jsonl", "C.jsonl"]
+  - path: B.jsonl
    type: completion
 ```

+::: {.callout-important}
+For `completion` only, Axolotl splits texts that exceed the context length into multiple smaller prompts. If you are interested in having this for `pretraining_dataset` too, please let us know or help make a PR!
+:::
+
 ### Pre-training dataset configuration tips

 #### Setting max_steps
--- a/docs/dataset_loading.qmd
+++ b/docs/dataset_loading.qmd
@@ -54,7 +54,7 @@ datasets:

 #### Files

-Usually, to load a JSON file, you would do something like this:
+To load a JSON file, you would do something like this:

 ```python
 from datasets import load_dataset
@@ -66,20 +66,12 @@ Which translates to the following config:

 ```yaml
 datasets:
-  - path: json
-    data_files: /path/to/your/file.jsonl
-```
-
-However, to make things easier, we have added a few shortcuts for loading local dataset files.
-
-You can just point the `path` to the file or directory along with the `ds_type` to load the dataset. The below example shows for a JSON file:
-
-```yaml
-datasets:
-  - path: /path/to/your/file.jsonl
+  - path: data.json
    ds_type: json
 ```

+As the example above shows, you can simply point `path` to the file or directory and set `ds_type` to load the dataset.
+
 This works for CSV, JSON, Parquet, and Arrow files.

 ::: {.callout-tip}
--- a/docs/docker.qmd
+++ b/docs/docker.qmd
@@ -8,6 +8,10 @@ format:

 This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).

+::: {.callout-important}
+For Blackwell GPUs, please use the tags with PyTorch 2.7.1 and CUDA 12.8.
+:::
+
 ## Base

 The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.
@@ -28,11 +32,10 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}

 Tags examples:

- `main-base-py3.11-cu128-2.7.0`
- `main-base-py3.11-cu126-2.7.0`
+- `main-base-py3.11-cu128-2.7.1`
+- `main-base-py3.11-cu126-2.7.1`
 - `main-base-py3.11-cu124-2.6.0`
 - `main-base-py3.11-cu124-2.5.1`
- `main-base-py3.11-cu124-2.4.1`

 ## Main

@@ -73,12 +76,10 @@ Tags examples:
 - `main-py3.11-cu126-2.7.0`
 - `main-py3.11-cu124-2.6.0`
 - `main-py3.11-cu124-2.5.1`
- `main-py3.11-cu124-2.4.1`
 - `main-latest`
 - `main-20250303-py3.11-cu124-2.6.0`
 - `main-20250303-py3.11-cu124-2.5.1`
- `main-20250303-py3.11-cu124-2.4.1`
- `0.7.1`
+- `0.9.2`

 ## Cloud

--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -110,3 +110,17 @@ description: Frequently asked questions
 > A: If `eot_tokens: ` is not provided, the default behavior is the same as before. EOS tokens used to delimit turns are masked/unmasked depending on whether the turn is trainable.

 > Internally, `eot_tokens: tokenizer.eos_token` and `train_on_eot: train_on_eos` (which defaults to `turn`). This transition helps clarify the naming and behavior of EOT/EOS tokens.
+
+**Q: `Data processing error: CAS service error`**
+
+> A: Try disabling XET with `export HF_HUB_DISABLE_XET=1`
+
+**Q: `torch._inductor.exc.LoweringException: NoValidChoicesError: No choices to select, please consider adding ATEN into max_autotune_gemm_backends config (defined in torch/_inductor/config.py) to allow at least one choice. `**
+
+> A: Depending on the version of torch, you may need to include this in your YAML:
+
+> ```yaml
+> flex_attn_compile_kwargs:
+>   dynamic: false
+>   mode: max-autotune-no-cudagraphs
+> ```
--- a/docs/getting-started.qmd
+++ b/docs/getting-started.qmd
@@ -104,7 +104,7 @@ the `alpaca` dataset format, which has the following format:
 Please see our [Dataset Formats](dataset-formats) for more dataset formats and how to
 format them.

-2. Prepare your JSONL data in the specified format (in this case, the expected `alpaca
+2. Prepare your JSONL data in the specified format (in this case, the expected `alpaca`
 format):

 ```json
@@ -120,6 +120,12 @@ axolotl train my_training.yml

 ## Common Tasks {#sec-common-tasks}

+::: {.callout-tip}
+
+The same yaml file is used for training, inference, and merging.
+
+:::
+
 ### Testing Your Model {#sec-testing}

 After training, test your model:
@@ -128,6 +134,16 @@ After training, test your model:
 axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out"
 ```

+More details can be found in [Inference](inference.qmd).
+
+### Using a UI {#sec-ui}
+
+Launch a Gradio interface:
+
+```bash
+axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out" --gradio
+```
+
 ### Preprocessing Data {#sec-preprocessing}

 For large datasets, preprocess first:
@@ -136,14 +152,22 @@ For large datasets, preprocess first:
 axolotl preprocess my_training.yml
 ```

-### Using a UI {#sec-ui}
+Please make sure to set `dataset_prepared_path: ` in your config to specify where the prepared dataset is saved.

-Launch a Gradio interface:
+More details can be found in [Dataset Preprocessing](dataset_preprocessing.qmd).
+
+### Merging LoRA weights {#sec-merging-lora}
+
+To merge the LoRA weights back into the base model, run:

 ```bash
-axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out" --gradio
+axolotl merge-lora my_training.yml --lora-model-dir="./outputs/lora-out"
 ```

+The merged model will be saved in the `{output_dir}/merged` directory.
+
+More details can be found in [Merging LoRA weights](inference.qmd#sec-merging).
+
 ## Next Steps {#sec-next-steps}

 Now that you have the basics, you might want to:
@@ -156,6 +180,7 @@ Now that you have the basics, you might want to:
 Check our other guides for details on these topics:

 - [Configuration Guide](config.qmd) - Full configuration options
+- [Dataset Loading](dataset_loading.qmd) - Loading datasets from various sources
 - [Dataset Formats](dataset-formats) - Working with different data formats
 - [Multi-GPU Training](multi-gpu.qmd)
 - [Multi-Node Training](multi-node.qmd)
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -15,7 +15,7 @@ This guide covers all the ways you can install and set up Axolotl for your envir

 - NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
 - Python ≥3.10
- PyTorch ≥2.4.1
+- PyTorch ≥2.5.1

 ## Installation Methods {#sec-installation-methods}

@@ -25,6 +25,10 @@ Please make sure to have Pytorch installed before installing Axolotl in your loc
 Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
 :::

+::: {.callout-important}
+For Blackwell GPUs, please use PyTorch 2.7.0 and CUDA 12.8.
+:::
+
 ### PyPI Installation (Recommended) {#sec-pypi}

 ```{.bash}
@@ -37,6 +41,40 @@ installed) in order not to clobber it, and so that we set the correct version of
 dependencies that are specific to the PyTorch version or other installed
 co-dependencies.

+### uv Installation {#sec-uv}
+
+uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments.
+
+Install uv if not already installed
+```{.bash}
+curl -LsSf https://astral.sh/uv/install.sh | sh
+source $HOME/.local/bin/env
+```
+
+Choose your CUDA version to use with PyTorch; e.g. `cu124`, `cu126`, `cu128`,
+then create the venv and activate
+```{.bash}
+export UV_TORCH_BACKEND=cu126
+uv venv --no-project --relocatable
+source .venv/bin/activate
+```
+
+Install PyTorch (version 2.6.0 recommended)
+
+```{.bash}
+uv pip install packaging setuptools wheel
+uv pip install torch==2.6.0
+uv pip install awscli pydantic
+```
+
+Install axolotl from PyPI
+```{.bash}
+uv pip install --no-build-isolation axolotl[deepspeed,flash-attn]
+
+# optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO
+uv pip install --no-build-isolation axolotl[deepspeed,flash-attn,vllm]
+```
+
 ### Edge/Development Build {#sec-edge-build}

 For the latest features between releases:
@@ -72,6 +110,10 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
 ```
 :::

+::: {.callout-important}
+For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.7.0` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.7.0`.
+:::
+
 Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.

 ## Cloud Environments {#sec-cloud}
--- a/docs/lora_optims.qmd
+++ b/docs/lora_optims.qmd
@@ -84,6 +84,10 @@ lora_qkv_kernel: true
 lora_o_kernel: true
 ```

+::: {.callout-note}
+Currently, LoRA kernels are not supported for RLHF training, only SFT.
+:::
+
 ## Requirements

 - One or more NVIDIA or AMD GPUs (in order to use the Triton kernels)
--- a/docs/multi-gpu.qmd
+++ b/docs/multi-gpu.qmd
@@ -87,20 +87,7 @@ We support sequence parallelism (SP) via the
 allows one to split up sequences across GPUs, which is useful in the event that a
 single sequence causes OOM errors during model training.

-First, install `ring-flash-attn`, recommended via `pip install axolotl[ring-flash-attn]`,
-or from source with `pip install .[ring-flash-attn]`.
-
-Your Axolotl YAML config should contain the following lines:
-
-```{.yaml}
-sequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU
-flash_attention: true  # Required with sequence parallelism
-
-# Optional; strides across the key dimension. Larger values use more memory but will make training faster.
-heads_k_stride: 1
-```
-
-See our [dedicated guide](sequence_parallelism.qmd) for more details.
+See our [dedicated guide](sequence_parallelism.qmd) for more information.

 ### FSDP + QLoRA {#sec-fsdp-qlora}

--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -43,7 +43,7 @@ datasets:
 # leave the vision model and vision tower frozen
 # load_in_8bit: true
 adapter: lora
-lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 # (optional) if you want to resize images to a set size
 image_size: 512
--- a/docs/qat.qmd
+++ b/docs/qat.qmd
@@ -0,0 +1,32 @@
+---
+title: "Quantization Aware Training (QAT)"
+back-to-top-navigation: true
+toc: true
+toc-expand: 2
+toc-depth: 4
+---
+
+## Overview
+
+[Quantization Aware Training](https://pytorch.org/blog/introduction-to-quantization-on-pytorch/#quantization-aware-training) (QAT) is a technique for improving the accuracy of quantized models
+by applying "fake" quantization to the model's weights (and optionally, activations) during training. This fake
+quantization allows for the model to adjust for noise introduced by the quantization, so when the model is eventually
+quantized, the accuracy loss is minimized. We use the quantization techniques implemented in [torchao](https://github.com/pytorch/ao) to provide
+support for QAT and post-training quantization (PTQ) in axolotl.
+
+We recommend reviewing the excellent QAT tutorial in the [torchtune library](https://pytorch.org/torchtune/main/tutorials/qat_finetune.html#quantizing-the-qat-model),
+and the QAT documentation in the [torchao library](https://github.com/pytorch/ao/tree/main/torchao/quantization/qat), for more details.
+
+## Configuring QAT in Axolotl
+
+To enable QAT in axolotl, add the following to your configuration file:
+
+```yaml
+qat:
+  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
+  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8"
+  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
+  fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after
+```
+
+Once you have finished training, you must quantize your model using the same quantization configuration that you used to train it. You can use the [`quantize`](./quantize.qmd) command to do this.
--- a/docs/quantize.qmd
+++ b/docs/quantize.qmd
@@ -0,0 +1,53 @@
+---
+title: "Quantization with torchao"
+back-to-top-navigation: true
+toc: true
+toc-expand: 2
+toc-depth: 4
+---
+
+Quantization is a technique to lower the memory footprint of your model, potentially at the cost of accuracy or model performance. We support quantizing your model using the [torchao](https://github.com/pytorch/ao) library. Quantization is supported for both post-training quantization (PTQ) and quantization-aware training (QAT).
+
+
+::: {.callout-note}
+
+We do not currently support quantization techniques such as GGUF, GPTQ, or EXL2.
+
+:::
+
+## Configuring Quantization in Axolotl
+
+Quantization is configured using the `quantization` key in your configuration file.
+
+```yaml
+base_model: # The path to the model to quantize.
+quantization:
+  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8
+  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
+  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
+  quantize_embedding: # Optional[bool] = False. Whether to quantize the embedding layer.
+
+output_dir:  # The path to the output directory.
+```
+
+Once quantization is complete, your quantized model will be saved in the `{output_dir}/quantized` directory.
+
+You may also use the `quantize` command to quantize a model which has been trained with [QAT](./qat.qmd) - you can do this by using the existing QAT configuration file which
+you used to train the model:
+
+```yaml
+# qat.yml
+qat:
+  activation_dtype: int8
+  weight_dtype: int8
+  group_size: 256
+  quantize_embedding: true
+
+output_dir: # The path to the output directory used during training where the final checkpoint has been saved.
+```
+
+```bash
+axolotl quantize qat.yml
+```
+
+This ensures that an identical quantization configuration is used to quantize the model as was used to train it.
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -16,7 +16,8 @@ feedback. Various methods include, but not limited to:
 - [Identity Preference Optimization (IPO)](#ipo)
 - [Kahneman-Tversky Optimization (KTO)](#kto)
 - [Odds Ratio Preference Optimization (ORPO)](#orpo)
- Proximal Policy Optimization (PPO) (not yet supported in axolotl)
+- [Group Relative Policy Optimization (GRPO)](#grpo)
+- Proximal Policy Optimization (PPO) (not yet supported in axolotl; if you're interested in contributing, please reach out!)


 ## RLHF using Axolotl
@@ -499,7 +500,7 @@ The input format is a simple JSON input with customizable fields based on the ab
 ### GRPO

 ::: {.callout-tip}
-Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
+Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/grpo_code).
 :::

 In the latest GRPO implementation, `vLLM` is used to significantly speedup trajectory generation during training. In this example, we're using 4 GPUs - 2 for training, and 2 for vLLM:
@@ -582,7 +583,20 @@ datasets:

 To see other examples of custom reward functions, please see [TRL GRPO Docs](https://github.com/huggingface/trl/blob/main/docs/source/grpo_trainer.md#using-a-custom-reward-function).

-To see description of the configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/config/models/input/v0_4_1/trl.py).
+To see all configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/v0.9.2/src/axolotl/utils/schemas/trl.py).
+
+#### GRPO with DAPO/Dr. GRPO loss
+
+The DAPO paper, and subsequently the Dr. GRPO paper, proposed an alternative loss function for GRPO to remediate the penalty in longer responses.
+
+```yaml
+trl:
+  loss_type: dr_grpo
+  # Normalizes loss based on max completion length (default: 256)
+  max_completion_length:
+```
+
+For more information, see [GRPO docs](https://huggingface.co/docs/trl/v0.17.0/en/grpo_trainer#loss-types).

 ### SimPO

--- a/docs/sequence_parallelism.qmd
+++ b/docs/sequence_parallelism.qmd
@@ -3,8 +3,6 @@ title: Sequence Parallelism
 description: Train with long sequences split across multiple GPUs.
 ---

-# Sequence Parallelism
-
 Sequence parallelism is a technique that splits sequences across multiple GPUs,
 allowing you to train with very long sequences that wouldn't fit on a single GPU. Each
 GPU processes a different portion of the sequence, and the results are aggregated
@@ -27,7 +25,7 @@ To enable sequence parallelism, add the following to your configuration file:
 sequence_parallel_degree: 4  # Split sequences across 4 GPUs
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 heads_k_stride: 1
-# Optional; one of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to
+# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
 # "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
 ring_attn_func:
 ```
@@ -43,7 +41,7 @@ When sequence parallelism is enabled:

 1. Each sequence is divided into equal chunks across the GPUs in a sequence parallel group
 2. The data collator handles the chunking of input_ids, attention_mask, labels, and position_ids
-3. Position IDs are adjusted to maintain proper relative positions, especially for packed sequences
+3. Position IDs are adjusted to maintain proper relative positions
 4. The trainer uses special ring communication patterns for attention operations

 ## Requirements
@@ -69,9 +67,11 @@ sequence_len: 8192
 ...

 sequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU
-flash_attention: true  # Required with sequence parallelism
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 heads_k_stride: 1
+# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
+# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
+ring_attn_func:

 ...
 ```
--- a/examples/cerebras/btlm-ft.yml
+++ b/examples/cerebras/btlm-ft.yml
@@ -59,7 +59,9 @@ gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1

-attention: flash
+flash_attention: true
+sdp_attention:
+flash_optimum:

 gptq_groupsize:
 gptq_model_v1:
--- a/examples/cerebras/qlora.yml
+++ b/examples/cerebras/qlora.yml
@@ -39,7 +39,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
--- a/examples/code-llama/13b/lora.yml
+++ b/examples/code-llama/13b/lora.yml
@@ -45,8 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/13b/qlora.yml
+++ b/examples/code-llama/13b/qlora.yml
@@ -46,8 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/34b/lora.yml
+++ b/examples/code-llama/34b/lora.yml
@@ -45,8 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/34b/qlora.yml
+++ b/examples/code-llama/34b/qlora.yml
@@ -46,8 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/7b/lora.yml
+++ b/examples/code-llama/7b/lora.yml
@@ -45,8 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/7b/qlora.yml
+++ b/examples/code-llama/7b/qlora.yml
@@ -46,8 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/cohere/command-r-7b-qlora.yml
+++ b/examples/cohere/command-r-7b-qlora.yml
@@ -49,8 +49,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -112,7 +112,9 @@
    "early_stopping_patience:\n",
    "resume_from_checkpoint:\n",
    "logging_steps: 1\n",
-    "attention: sdpa\n",
+    "xformers_attention:\n",
+    "flash_attention: false\n",
+    "sdp_attention: true\n",
    "\n",
    "warmup_steps: 1\n",
    "max_steps: 25\n",
--- a/examples/dbrx/16bit-lora.yaml
+++ b/examples/dbrx/16bit-lora.yaml
@@ -52,8 +52,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
--- a/examples/dbrx/8bit-lora.yaml
+++ b/examples/dbrx/8bit-lora.yaml
@@ -55,8 +55,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
--- a/examples/dbrx/fft-ds-zero3.yaml
+++ b/examples/dbrx/fft-ds-zero3.yaml
@@ -39,8 +39,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
--- a/examples/deepseek-v2/fft-fsdp-16b.yaml
+++ b/examples/deepseek-v2/fft-fsdp-16b.yaml
@@ -35,8 +35,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 2
--- a/examples/deepseek-v2/qlora-fsdp-2_5.yaml
+++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
@@ -59,8 +59,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 2
--- a/examples/falcon/config-7b-lora.yml
+++ b/examples/falcon/config-7b-lora.yml
@@ -43,7 +43,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 40
--- a/examples/falcon/config-7b-qlora.yml
+++ b/examples/falcon/config-7b-qlora.yml
@@ -73,7 +73,8 @@ early_stopping_patience: 3
 resume_from_checkpoint:
 auto_resume_from_checkpoints: true
 logging_steps: 1
-attention: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
--- a/examples/falcon/config-7b.yml
+++ b/examples/falcon/config-7b.yml
@@ -40,7 +40,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 40
--- a/examples/gemma/qlora.yml
+++ b/examples/gemma/qlora.yml
@@ -47,8 +47,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/gemma2/qlora.yml
+++ b/examples/gemma2/qlora.yml
@@ -53,8 +53,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/gemma2/reward-model.yaml
+++ b/examples/gemma2/reward-model.yaml
@@ -43,8 +43,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/gemma3/gemma-3-1b-qlora.yml
+++ b/examples/gemma3/gemma-3-1b-qlora.yml
@@ -57,8 +57,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/gemma3/gemma-3-4b-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-qlora.yml
@@ -28,7 +28,7 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
@@ -51,7 +51,8 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 logging_steps: 1
-attention: flash
+flash_attention: true
+eager_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/gemma3/gemma-3-4b-vision-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-vision-qlora.yml
@@ -30,7 +30,7 @@ pad_to_sequence_len: false
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
@@ -53,7 +53,8 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 logging_steps: 1
-attention: flash
+flash_attention: true
+eager_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/gptj/qlora.yml
+++ b/examples/gptj/qlora.yml
@@ -36,7 +36,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
--- a/examples/jamba/qlora.yaml
+++ b/examples/jamba/qlora.yaml
@@ -47,8 +47,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
--- a/examples/jamba/qlora_deepspeed.yaml
+++ b/examples/jamba/qlora_deepspeed.yaml
@@ -46,8 +46,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
--- a/examples/jamba/qlora_fsdp_large.yaml
+++ b/examples/jamba/qlora_fsdp_large.yaml
@@ -45,8 +45,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: true
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 1
--- a/examples/jeopardy-bot/config.yml
+++ b/examples/jeopardy-bot/config.yml
@@ -37,7 +37,8 @@ bf16: auto
 tf32: true
 resume_from_checkpoint:
 logging_steps: 5
-attention: xformers
+xformers_attention: true
+flash_attention:
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -42,8 +42,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
--- a/examples/llama-2/gptq-lora.yml
+++ b/examples/llama-2/gptq-lora.yml
@@ -53,7 +53,9 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
+flash_attention:
+sdp_attention:
+flash_optimum:
 warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
--- a/examples/llama-2/lisa.yml
+++ b/examples/llama-2/lisa.yml
@@ -46,8 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
--- a/examples/llama-2/loftq.yml
+++ b/examples/llama-2/loftq.yml
@@ -45,8 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -45,8 +45,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-2/qlora-fsdp.yml
+++ b/examples/llama-2/qlora-fsdp.yml
@@ -48,8 +48,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -46,8 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-2/relora.yml
+++ b/examples/llama-2/relora.yml
@@ -48,8 +48,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3-vision/lora-11b.yaml
+++ b/examples/llama-3-vision/lora-11b.yaml
@@ -29,7 +29,7 @@ pad_to_sequence_len: false
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
@@ -50,7 +50,8 @@ tf32: true

 gradient_checkpointing: true
 logging_steps: 1
-attention: flash
+flash_attention: true
+eager_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/llama-3/3b-qat-fsdp2.yaml
+++ b/examples/llama-3/3b-qat-fsdp2.yaml
@@ -0,0 +1,79 @@
+base_model: meta-llama/Llama-3.2-3B  # base checkpoint used by this example
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+load_in_8bit: false
+load_in_4bit: false  # both load_in_* flags are off in this example
+strict: false
+
+plugins:  # Liger plugin; every liger_* switch below is enabled
+  - axolotl.integrations.liger.LigerPlugin
+
+liger_rope: true
+liger_rms_norm: true
+liger_glu_activation: true
+liger_layer_norm: true
+liger_fused_linear_cross_entropy: true
+
+datasets:
+  - path: yahma/alpaca-cleaned
+    type: alpaca
+
+output_dir: ./outputs/qat_out/
+
+sample_packing: true
+pad_to_sequence_len: true
+sequence_len: 512
+
+flex_attention: true  # this example sets flex_attention; no flash_attention key is present
+flex_attn_compile_kwargs:  # presumably forwarded to torch.compile — TODO confirm
+  dynamic: false
+  mode: max-autotune-no-cudagraphs
+
+qat:  # int8 activations, int4 weights, group size 32 (presumably quantization-aware training, per the file name)
+  activation_dtype: int8
+  weight_dtype: int4
+  group_size: 32
+
+wandb_project:  # all wandb_* keys are left empty (null) in this example
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 1
+micro_batch_size: 16
+num_epochs: 1
+optimizer: adamw_torch_fused
+
+cosine_constant_lr_ratio: 0
+cosine_min_lr_ratio: 1.0  # NOTE(review): no lr_scheduler key is set in this file; confirm the cosine_* ratios take effect
+learning_rate: 2e-5  # NOTE(review): YAML 1.1 loaders (e.g. PyYAML) read `2e-5` as a string; confirm the consumer coerces it to float
+save_only_model: true
+bf16: true
+
+resume_from_checkpoint:  # left empty (null)
+logging_steps: 1
+
+evals_per_epoch: 1  # NOTE(review): no val_set_size is set in this file; confirm an eval split exists
+saves_per_epoch: 1
+
+warmup_steps: 10
+weight_decay: 0.0
+fsdp:
+  - full_shard
+  - auto_wrap
+
+fsdp_config:
+  fsdp_version: 2  # FSDP version 2, matching the "fsdp2" in the file name
+  fsdp_offload_params: false
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer  # layer class named for wrapping under the policy above
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_sharding_strategy: FULL_SHARD
+  fsdp_reshard_after_forward: true
+  fsdp_activation_checkpointing: true
+
+special_tokens:  # pad token is set explicitly for this example
+  pad_token: <|end_of_text|>
--- a/examples/llama-3/fft-8b-liger-fsdp.yaml
+++ b/examples/llama-3/fft-8b-liger-fsdp.yaml
@@ -49,8 +49,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 2
--- a/examples/llama-3/fft-8b.yaml
+++ b/examples/llama-3/fft-8b.yaml
@@ -34,8 +34,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 2
--- a/examples/llama-3/instruct-dpo-lora-8b.yml
+++ b/examples/llama-3/instruct-dpo-lora-8b.yml
@@ -5,6 +5,10 @@ tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name

+special_tokens:
+  pad_token: <|finetune_right_pad_id|>
+  eos_token: <|eot_id|>
+
 load_in_8bit: true
 load_in_4bit: false

@@ -61,8 +65,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/instruct-lora-8b.yml
+++ b/examples/llama-3/instruct-lora-8b.yml
@@ -56,8 +56,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b-deduplicate-dpo.yml
+++ b/examples/llama-3/lora-1b-deduplicate-dpo.yml
@@ -77,8 +77,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b-deduplicate-sft.yml
+++ b/examples/llama-3/lora-1b-deduplicate-sft.yml
@@ -53,8 +53,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b-kernels.yml
+++ b/examples/llama-3/lora-1b-kernels.yml
@@ -54,8 +54,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/lora-1b-ray.yml
+++ b/examples/llama-3/lora-1b-ray.yml
@@ -48,8 +48,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/lora-1b-sample-packing-sequentially.yml
+++ b/examples/llama-3/lora-1b-sample-packing-sequentially.yml
@@ -55,8 +55,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b.yml
+++ b/examples/llama-3/lora-1b.yml
@@ -5,7 +5,7 @@ base_model: NousResearch/Llama-3.2-1B
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
-dataset_prepared_path: last_run_prepared
+
 val_set_size: 0.1
 output_dir: ./outputs/lora-out

@@ -38,6 +38,7 @@ wandb_log_model:
 gradient_accumulation_steps: 2
 micro_batch_size: 2
 num_epochs: 1
+
 optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
@@ -48,8 +49,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/lora-8b.yml
+++ b/examples/llama-3/lora-8b.yml
@@ -49,8 +49,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/qlora-1b-kto.yaml
+++ b/examples/llama-3/qlora-1b-kto.yaml
@@ -53,8 +53,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 20
 evals_per_epoch: 4
--- a/examples/llama-3/qlora-1b.yml
+++ b/examples/llama-3/qlora-1b.yml
@@ -51,8 +51,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/qlora-fsdp-405b.yaml
+++ b/examples/llama-3/qlora-fsdp-405b.yaml
@@ -39,8 +39,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: true
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/qlora-fsdp-70b.yaml
+++ b/examples/llama-3/qlora-fsdp-70b.yaml
@@ -48,8 +48,7 @@ gradient_checkpointing_kwargs:
  use_reentrant: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/qlora.yml
+++ b/examples/llama-3/qlora.yml
@@ -46,8 +46,7 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-attention: flash
-
+flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-4/README.md
+++ b/examples/llama-4/README.md
@@ -34,3 +34,5 @@ We provide a script to delinearize Llama 4 linearized models into regular Huggin
 ```bash
 axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir
 ```
+
+Note: This only works with the non-quantized linearized model. If you have an adapter, merge it with the *non-quantized linearized* model before delinearizing.
--- a/examples/llava/lora-7b.yaml
+++ b/examples/llava/lora-7b.yaml
@@ -25,7 +25,7 @@ pad_to_sequence_len: false
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
@@ -46,7 +46,8 @@ tf32: true

 gradient_checkpointing: true
 logging_steps: 1
-attention: flash
+flash_attention: true
+eager_attention:

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/magistral/README.md
+++ b/examples/magistral/README.md
@@ -0,0 +1,71 @@
+# Finetune Magistral Small with Axolotl
+
+Magistral Small is a 24B-parameter open-source model from MistralAI, available on [Hugging Face](https://huggingface.co/mistralai/Magistral-Small-2506). This guide shows how to fine-tune it with Axolotl on multi-turn conversations with proper masking.
+
+MistralAI has also released a proprietary medium-sized version called Magistral Medium.
+
+Thanks to the team at MistralAI for giving us early access to prepare for this release.
+
+## Getting started
+
+1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). Magistral support is currently only available on nightly, so you need to either install from main or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
+
+    Here is an example of how to install from main with pip:
+
+```bash
+# Ensure you have PyTorch installed (PyTorch 2.6.0 recommended)
+git clone https://github.com/axolotl-ai-cloud/axolotl.git
+cd axolotl
+
+pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
+pip3 install --no-build-isolation -e '.[flash-attn,mistral]'
+```
+
+2. Download the example config:
+
+```bash
+axolotl fetch examples
+```
+
+3. Run the finetuning example:
+
+```bash
+axolotl train examples/magistral/magistral-small-qlora.yaml
+```
+
+This config uses about 24GB VRAM.
+
+Let us know how it goes. Happy finetuning! 🚀
+
+### TIPS
+
+- For inference, the official MistralAI team recommends `top_p: 0.95` and `temperature: 0.7` with `max_tokens: 40960`.
+- You can run a full fine-tune by removing `adapter: qlora` and `load_in_4bit: true` from the config.
+- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
+- The dataset format is the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
+
+## Optimization Guides
+
+- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
+- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
+- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
+
+## Limitations
+
+At the moment, the `mistral-common` tokenizer is supported only for supervised fine-tuning, and only with `type: chat_template` datasets.
+
+The tokenizer does not work with multiprocessing in `dataset.map`, so we had to disable multiprocessing for it. In addition, we do not support overriding tokens yet.
+
+## Related Resources
+
+- [MistralAI Magistral Blog](https://mistral.ai/news/magistral/)
+- [Axolotl Docs](https://docs.axolotl.ai)
+- [Axolotl Website](https://axolotl.ai)
+- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
+- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
+
+
+## Future Work
+
+- Reach parity for preference tuning, RL, multi-modal training, etc.
+- Reach parity with other tokenizer configs, such as overriding tokens.
--- a/examples/magistral/magistral-small-fsdp-qlora.yaml
+++ b/examples/magistral/magistral-small-fsdp-qlora.yaml
@@ -0,0 +1,72 @@
+base_model: mistralai/Magistral-Small-2506  # base checkpoint used by this example
+
+# Enable to use mistral-common tokenizer
+tokenizer_use_mistral_common: true
+
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+load_in_8bit: false
+load_in_4bit: true  # 4-bit loading, paired with `adapter: qlora` below
+
+datasets:
+  - path: fozziethebeat/alpaca_messages_2k_test
+    type: chat_template
+
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.1
+output_dir: ./outputs/lora-out
+
+adapter: qlora
+lora_model_dir:  # left empty (null)
+
+sequence_len: 2048
+sample_packing: true
+eval_sample_packing: false  # packing is on for training (above) but off for eval
+pad_to_sequence_len: true
+
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true  # NOTE(review): set together with the explicit lora_target_modules list below; confirm which one takes precedence
+lora_target_modules:
+  - gate_proj
+  - down_proj
+  - up_proj
+  - q_proj
+  - v_proj
+  - k_proj
+  - o_proj
+
+wandb_project:  # all wandb_* keys are left empty (null) in this example
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 1
+optimizer: adamw_torch_fused
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing:  # left empty here; fsdp_activation_checkpointing is enabled under fsdp_config instead
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_ratio: 0.1
+evals_per_epoch: 1
+saves_per_epoch: 1
+
+fsdp:
+  - full_shard
+  - auto_wrap
+fsdp_config:
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer  # layer class named for FSDP wrapping
+  fsdp_activation_checkpointing: true
--- a/Show More
+++ b/Show More