chore: refactor normalize_attn to use mapping and loop

fix: set replit mpt model to use eager attention
fixes from PR feedback
2025-05-07 17:10:18 +07:00 · 2025-05-07 17:10:18 +07:00 · 2025-05-07 17:10:18 +07:00 · 2025-05-07 17:10:18 +07:00
401 changed files with 8481 additions and 16054 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -16,9 +16,8 @@ on:
 jobs:
  build-base:
    if: github.repository_owner == 'axolotl-ai-cloud'
-    timeout-minutes: 480
    # this job needs to be run on self-hosted GPU runners...
-    runs-on: ubuntu-latest-m
+    runs-on: axolotl-gpu-runner
    strategy:
      fail-fast: false
      matrix:
@@ -29,50 +28,42 @@ jobs:
            python_version: "3.11"
            pytorch: 2.5.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
          - cuda: "124"
            cuda_version: 12.4.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.6.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
          - cuda: "126"
            cuda_version: 12.6.3
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.6.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
          - cuda: "126"
            cuda_version: 12.6.3
            cudnn_version: ""
            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.7.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
          - cuda: "128"
            cuda_version: 12.6.3
            cudnn_version: ""
            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.7.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: nightly
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base-nightly"
-#          # "next" is for release candidates of pytorch
-#          - cuda: "128"
-#            cuda_version: 12.8.1
-#            cudnn_version: ""
-#            python_version: "3.11"
-#            pytorch: next
-#            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-#            dockerfile: "Dockerfile-base-next"
+          - cuda: "128"
+            cuda_version: 12.8.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: next
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -94,60 +85,7 @@ jobs:
        uses: docker/build-push-action@v4
        with:
          context: .
-          file: ./docker/${{ matrix.dockerfile }}
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-          labels: ${{ steps.metadata.outputs.labels }}
-          build-args: |
-            CUDA_VERSION=${{ matrix.cuda_version }}
-            CUDNN_VERSION=${{ matrix.cudnn_version }}
-            CUDA=${{ matrix.cuda }}
-            PYTHON_VERSION=${{ matrix.python_version }}
-            PYTORCH_VERSION=${{ matrix.pytorch }}
-            TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
-  build-base-uv:
-    if: github.repository_owner == 'axolotl-ai-cloud'
-    timeout-minutes: 480
-    runs-on: ubuntu-latest-m
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: "126"
-            cuda_version: 12.6.3
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.6.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.7.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Docker metadata
-        id: metadata
-        uses: docker/metadata-action@v5
-        with:
-          images: |
-            axolotlai/axolotl-base-uv
-      - name: Login to Docker Hub
-        uses: docker/login-action@v2
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Build
-        uses: docker/build-push-action@v4
-        with:
-          context: .
-          file: ./docker/${{ matrix.dockerfile }}
+          file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || matrix.pytorch == 'next' && './docker/Dockerfile-base-next' || './docker/Dockerfile-base' }}
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -9,7 +9,6 @@ on:
       - '.github/workflows/*.yml'
       - "*.[q]md"
       - "examples/**/*.y[a]?ml"
-       - ".pre-commit-config.yaml"
  workflow_dispatch:

 jobs:
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -29,12 +29,7 @@ jobs:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.7.0
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
@@ -97,12 +92,7 @@ jobs:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.7.0
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -3,7 +3,7 @@ name: docker-multigpu-tests-biweekly
 on:
  pull_request:
    paths:
-      - 'tests/e2e/multigpu/**.py'
+      - 'tests/e2e/multigpu/*.py'
      - 'requirements.txt'
      - 'setup.py'
      - 'pyproject.toml'
@@ -43,7 +43,7 @@ jobs:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.7.0
            axolotl_extras:
            num_gpus: 2
            nightly_build: "true"
@@ -59,7 +59,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==0.71.8 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
--- a/.github/workflows/precommit-autoupdate.yml
+++ b/.github/workflows/precommit-autoupdate.yml
@@ -25,6 +25,7 @@ jobs:
          pre-commit autoupdate
          if [[ -n $(git status --porcelain) ]]; then
            echo "changes=true" >> $GITHUB_OUTPUT
+            git diff .pre-commit-config.yaml > pre-commit-update.diff
          fi

      - name: Create Pull Request
@@ -38,3 +39,11 @@ jobs:
          commit-message: "chore: update pre-commit hooks"
          body: |
            Automated PR to update pre-commit hooks to their latest versions.
+
+            <details>
+            <summary>Changes:</summary>
+
+            ```diff
+            ${{ steps.update.outputs.diff }}
+            ```
+            </details>
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -18,96 +18,9 @@ jobs:
        env:
          SKIP: no-commit-to-branch

-  preload-cache:
-    name: Preload HF cache
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python_version: ["3.11"]
-        pytorch_version: ["2.6.0"]
-    timeout-minutes: 20
-
-    env:
-      AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
-
-    steps:
-      - name: Check out repository code
-        uses: actions/checkout@v4
-
-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python_version }}
-          cache: 'pip' # caching pip dependencies
-
-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
-
-      - name: Install PyTorch
-        run: |
-          pip3 install torch==${{ matrix.pytorch_version }}
-
-      - name: Install dependencies
-        run: |
-          pip3 show torch
-          pip3 install --no-build-isolation -U -e .
-          python scripts/unsloth_install.py | sh
-          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
-
-      - name: Make sure PyTorch version wasn't clobbered
-        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
-
-      - name: Ensure axolotl CLI was installed
-        run: |
-          axolotl --help
-
-      - name: Pre-Download dataset fixture
-        run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-
-      - name: Run tests
-        run: |
-          pytest -v tests/conftest.py
-
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
-        with:
-          token: ${{ secrets.CODECOV_TOKEN }}
-          files: ./coverage.xml
-          flags: unittests,pytorch-${{ matrix.pytorch_version }}
-          fail_ci_if_error: false
-
-      - name: cleanup pip cache
-        run: |
-          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
-
-      - name: Save HF cache
-        id: hf-cache
-        uses: actions/cache/save@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
-
  pytest:
    name: PyTest
    runs-on: ubuntu-latest
-    needs: [preload-cache]
    strategy:
      fail-fast: false
      max-parallel: 2
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -44,26 +44,115 @@ jobs:
        env:
          SKIP: no-commit-to-branch

-  pytest:
-    name: PyTest
+  preload-cache:
+    name: Preload HF cache
    runs-on: ubuntu-latest
-#    needs: [preload-cache]
    strategy:
      fail-fast: false
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.5.1", "2.6.0", "2.7.1"]
+        pytorch_version: ["2.6.0"]
+    timeout-minutes: 20
+
+    env:
+      AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
+
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-v2
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python_version }}
+          cache: 'pip' # caching pip dependencies
+
+      - name: upgrade pip
+        run: |
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
+
+      - name: Install PyTorch
+        run: |
+          pip3 install torch==${{ matrix.pytorch_version }}
+
+      - name: Install dependencies
+        run: |
+          pip3 show torch
+          pip3 install --no-build-isolation -U -e .
+          python scripts/unsloth_install.py | sh
+          python scripts/cutcrossentropy_install.py | sh
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+
+      - name: Make sure PyTorch version wasn't clobbered
+        run: |
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
+
+      - name: Ensure axolotl CLI was installed
+        run: |
+          axolotl --help
+
+      - name: Pre-Download dataset fixture
+        run: |
+          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
+
+      - name: Run tests
+        run: |
+          pytest -v tests/conftest.py
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          files: ./coverage.xml
+          flags: unittests,pytorch-${{ matrix.pytorch_version }}
+          fail_ci_if_error: false
+
+      - name: cleanup pip cache
+        run: |
+          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
+
+      - name: Save HF cache
+        id: hf-cache
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
+
+  pytest:
+    name: PyTest
+    runs-on: ubuntu-latest
+    needs: [preload-cache]
+    strategy:
+      fail-fast: false
+      matrix:
+        python_version: ["3.11"]
+        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
    timeout-minutes: 20

    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

-      - name: Restore Cache from S3
-        id: hf-cache-restore-s3
-        run: |
-          mkdir -p /home/runner/.cache/huggingface/hub
-          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/  --use-compress-program unzstd
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-v2

      - name: Setup Python
        uses: actions/setup-python@v5
@@ -121,22 +210,26 @@ jobs:
  pytest-sdist:
    name: PyTest from Source Dist
    runs-on: ubuntu-latest
+    needs: [preload-cache]
    strategy:
      fail-fast: false
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.5.1", "2.6.0", "2.7.1"]
+        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
    timeout-minutes: 20

    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

-      - name: Restore Cache from S3
-        id: hf-cache-restore-s3
-        run: |
-          mkdir -p /home/runner/.cache/huggingface/hub
-          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/  --use-compress-program unzstd
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-v2

      - name: Setup Python
        uses: actions/setup-python@v5
@@ -184,11 +277,10 @@ jobs:
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

  docker-e2e-tests-1st:
-    # Run this job first as a gate for running the remainder of the test matrix
    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
-    timeout-minutes: 120
+    timeout-minutes: 90
    needs: [pre-commit, pytest, pytest-sdist]

    strategy:
@@ -201,13 +293,6 @@ jobs:
            pytorch: 2.6.0
            num_gpus: 1
            axolotl_extras: vllm
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.6.0
-            num_gpus: 1
-            axolotl_extras:
-            dockerfile: "Dockerfile-uv.jinja"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -218,7 +303,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==0.71.8 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
@@ -229,7 +314,6 @@ jobs:
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
          modal run cicd.e2e_tests
@@ -238,9 +322,7 @@ jobs:
    if: github.repository_owner == 'axolotl-ai-cloud'
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
-    timeout-minutes: 120
-    # Only run the remainder of the matrix if the first e2e check passed;
-    # this is to save on wasted compute costs for known failures that get caught in the first run
+    timeout-minutes: 90
    needs: [pre-commit, pytest, docker-e2e-tests-1st]

    strategy:
@@ -253,6 +335,12 @@ jobs:
            pytorch: 2.6.0
            num_gpus: 1
            axolotl_extras: llmcompressor
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.4.1
+            num_gpus: 1
+            axolotl_extras:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
@@ -262,13 +350,7 @@ jobs:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.7.1
-            num_gpus: 1
-            axolotl_extras:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.7.0
            num_gpus: 1
            axolotl_extras:
    steps:
@@ -281,7 +363,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==0.71.8 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
@@ -292,47 +374,6 @@ jobs:
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
          modal run cicd.e2e_tests
-
-  docker-e2e-cleanup:
-    runs-on: [self-hosted, modal]
-    timeout-minutes: 90
-    needs: [docker-e2e-tests]
-
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            num_gpus: 1
-            axolotl_extras: vllm
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: Install Modal
-        run: |
-          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
-      - name: Update env vars
-        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
-          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
-          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
-          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
-          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
-          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
-          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
-      - name: Run tests job on Modal
-        run: |
-          modal run cicd.cleanup
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,15 +19,15 @@ repos:
    hooks:
      - id: isort
 -   repo: https://github.com/PyCQA/flake8
-    rev: 7.2.0
+    rev: 7.1.2
    hooks:
    - id: flake8
 -   repo: https://github.com/pylint-dev/pylint
-    rev: v3.3.7
+    rev: v3.3.6
    hooks:
    - id: pylint
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.16.0
+    rev: v1.15.0
    hooks:
    - id: mypy
      additional_dependencies:
--- a/.runpod/src/config/config.yaml
+++ b/.runpod/src/config/config.yaml
@@ -242,12 +242,16 @@
 # early_stopping_patience: 3

 # # Specify a scheduler and kwargs to use with the optimizer
-# lr_scheduler: # 'one_cycle' | empty for cosine
+# lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
 # lr_scheduler_kwargs:

 # # For one_cycle optim
 # lr_div_factor: # Learning rate div factor

+# # For log_sweep optim
+# log_sweep_min_lr:
+# log_sweep_max_lr:
+
 # # Specify optimizer
 # # Valid values are driven by the Transformers OptimizerNames class, see:
 # # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
--- a/.runpod/src/handler.py
+++ b/.runpod/src/handler.py
@@ -57,10 +57,8 @@ async def handler(job):
    logger.info("Training Complete.")

    # Cleanup
-    if "WANDB_API_KEY" in os.environ:
-        del os.environ["WANDB_API_KEY"]
-    if "HF_TOKEN" in os.environ:
-        del os.environ["HF_TOKEN"]
+    del os.environ["WANDB_API_KEY"]
+    del os.environ["HF_TOKEN"]


 runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})
--- a/README.md
+++ b/README.md
@@ -1,177 +1,152 @@
-<div align="center">
-  <a href="https://github.com/axolotl-ai-cloud/axolotl">
-    <img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/docs/logo.png" alt="Axolotl Logo" width="250" style="margin-bottom: 20px;"/>
-  </a>
-  <h1><span style="color: #4CAF50;">Axolotl: Fine-tune LLMs with Unprecedented Ease & Power!</span> 🚀</h1>
-  <p style="font-size: 1.1em; color: #555;">Your ultimate toolkit for efficient, scalable, and versatile large language model fine-tuning.</p>
+<p align="center">
+    <picture>
+        <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/887513285d98132142bf5db2a74eb5e0928787f1/image/axolotl_logo_digital_white.svg">
+        <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/887513285d98132142bf5db2a74eb5e0928787f1/image/axolotl_logo_digital_black.svg">
+        <img alt="Axolotl" src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/887513285d98132142bf5db2a74eb5e0928787f1/image/axolotl_logo_digital_black.svg" width="400" height="104" style="max-width: 100%;">
+    </picture>
+</p>

-  <p>
-    <a href="https://discord.gg/HhrNrHJPRb" target="_blank">
-      <img src="https://img.shields.io/discord/1070542385153273887?label=Discord&logo=discord&logoColor=white&color=7289DA" alt="Discord Community" style="margin: 5px;">
-    </a>
-    <a href="https://docs.axolotl.ai/" target="_blank">
-      <img src="https://img.shields.io/badge/Documentation-blue?style=flat&logo=readthedocs&logoColor=white" alt="Official Documentation" style="margin: 5px;">
-    </a>
-    <a href="https://pypi.org/project/axolotl/" target="_blank">
-      <img src="https://img.shields.io/pypi/v/axolotl?label=PyPI&logo=pypi&logoColor=white&color=blue" alt="PyPI Package" style="margin: 5px;">
-    </a>
-    <a href="https://github.com/axolotl-ai-cloud/axolotl/releases" target="_blank">
-      <img src="https://img.shields.io/github/downloads/axolotl-ai-cloud/axolotl/total?label=Downloads&color=green" alt="GitHub Downloads" style="margin: 5px;">
-    </a>
-  </p>
-  <br>
-</div>
+<p align="center">
+    <img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
+    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests">
+    <a href="https://codecov.io/gh/axolotl-ai-cloud/axolotl"><img src="https://codecov.io/gh/axolotl-ai-cloud/axolotl/branch/main/graph/badge.svg" alt="codecov"></a>
+    <a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a>
+    <br/>
+    <a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors"><img src="https://img.shields.io/github/contributors-anon/axolotl-ai-cloud/axolotl?color=yellow&style=flat-square" alt="contributors" style="height: 20px;"></a>
+    <img src="https://img.shields.io/github/stars/axolotl-ai-cloud/axolotl" alt="GitHub Repo stars">
+    <br/>
+    <a href="https://discord.com/invite/HhrNrHJPRb"><img src="https://img.shields.io/badge/discord-7289da.svg?style=flat-square&logo=discord" alt="discord" style="height: 20px;"></a>
+    <a href="https://twitter.com/axolotl_ai"><img src="https://img.shields.io/twitter/follow/axolotl_ai?style=social" alt="twitter" style="height: 20px;"></a>
+    <br/>
+    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg" alt="tests-nightly">
+    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
+</p>

---
+Axolotl is a tool designed to streamline post-training for various AI models.
+Post-training refers to any modifications or additional training performed on
+pre-trained models - including full model fine-tuning, parameter-efficient tuning (like
+LoRA and QLoRA), supervised fine-tuning (SFT), instruction tuning, and alignment
+techniques. With support for multiple model architectures and training configurations,
+Axolotl makes it easy to get started with these techniques.

-<div style="background-color: #f0f8ff; padding: 25px; border-radius: 12px; margin-bottom: 30px; border: 1px solid #d0e8ff;">
-  <h2 style="color: #0056b3; text-align: center; margin-top: 0;">🎉 Latest Innovations & Updates!</h2>
-  <ul style="list-style-type: none; padding-left: 0;">
-    <li style="margin-bottom: 10px; border-left: 4px solid #6495ED; padding-left: 10px;"><strong><span style="color: #2E8B57;">2025/06:</span> Magistral with mistral-common tokenizer support!</strong> Dive into <a href="https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral" style="color: #007bff; text-decoration: none;">examples</a> to train your own Magistral models.</li>
-    <li style="margin-bottom: 10px; border-left: 4px solid #6495ED; padding-left: 10px;"><strong><span style="color: #2E8B57;">2025/05:</span> Quantization Aware Training (QAT) support!</strong> Explore the <a href="https://docs.axolotl.ai/docs/qat.html" style="color: #007bff; text-decoration: none;">docs</a> to learn more.</li>
-    <li style="margin-bottom: 10px; border-left: 4px solid #6495ED; padding-left: 10px;"><strong><span style="color: #2E8B57;">2025/04:</span> Llama 4 support!</strong> See <a href="https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-4" style="color: #007bff; text-decoration: none;">examples</a> to train Llama 4 with Axolotl's linearized version!</li>
-    <li style="margin-bottom: 10px; border-left: 4px solid #6495ED; padding-left: 10px;"><strong><span style="color: #2E8B57;">2025/03:</span> Sequence Parallelism (SP) support!</strong> Scale your context length. Read the <a href="https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl" style="color: #007bff; text-decoration: none;">blog</a> and <a href="https://docs.axolotl.ai/docs/sequence_parallelism.html" style="color: #007bff; text-decoration: none;">docs</a>.</li>
-    <li style="margin-bottom: 10px; border-left: 4px solid #6495ED; padding-left: 10px;"><strong><span style="color: #2E8B57;">2025/03:</span> (Beta) Fine-tuning Multimodal models!</strong> Check out the <a href="https://docs.axolotl.ai/docs/multimodal.html" style="color: #007bff; text-decoration: none;">docs</a>.</li>
-    <li style="margin-bottom: 10px; border-left: 4px solid #6495ED; padding-left: 10px;"><strong><span style="color: #2E8B57;">2025/02:</span> LoRA optimizations!</strong> Reduce memory and improve speed. Jump into the <a href="https://docs.axolotl.ai/docs/lora_optims.html" style="color: #007bff; text-decoration: none;">docs</a>.</li>
-    <li style="margin-bottom: 10px; border-left: 4px solid #6495ED; padding-left: 10px;"><strong><span style="color: #2E8B57;">2025/02:</span> GRPO support!</strong> Dive into our <a href="https://huggingface.co/blog/axolotl-ai-co/training-llms-w-interpreter-feedback-wasm" style="color: #007bff; text-decoration: none;">blog</a> and <a href="https://github.com/axolotl-ai-cloud/grpo_code" style="color: #007bff; text-decoration: none;">GRPO example</a>.</li>
-    <li style="margin-bottom: 0px; border-left: 4px solid #6495ED; padding-left: 10px;"><strong><span style="color: #2E8B57;">2025/01:</span> Reward Modelling / Process Reward Modelling fine-tuning!</strong> See <a href="https://docs.axolotl.ai/docs/reward_modelling.html" style="color: #007bff; text-decoration: none;">docs</a>.</li>
-  </ul>
-</div>
+Axolotl is designed to work with YAML config files that contain everything you need to
+preprocess a dataset, train or fine-tune a model, run model inference or evaluation,
+and much more.

-<h2 style="color: #FF5733;"><span style="margin-right: 10px;">✨</span> Axolotl Overview: Your LLM Fine-tuning Powerhouse!</h2>
+Features:

-<div style="background-color: #fffacd; padding: 20px; border-radius: 10px; margin-bottom: 30px; border: 1px solid #ffd700;">
-  <p style="font-size: 1.1em; color: #333; text-align: center;">Axolotl is a powerful, flexible, and user-friendly tool designed to supercharge your post-training workflows for a wide range of cutting-edge AI models.</p>
-</div>
+- Train various Huggingface models such as llama, pythia, falcon, mpt
+- Supports fullfinetune, lora, qlora, relora, and gptq
+- Customize configurations using a simple yaml file or CLI overwrite
+- Load different dataset formats, use custom formats, or bring your own tokenized datasets
+- Integrated with [xformers](https://github.com/facebookresearch/xformers), flash attention, [liger kernel](https://github.com/linkedin/Liger-Kernel), rope scaling, and multipacking
+- Works with single GPU or multiple GPUs via FSDP or Deepspeed
+- Easily run with Docker locally or on the cloud
+- Log results and optionally checkpoints to wandb, mlflow or Comet
+- And more!

-<div style="display: flex; flex-wrap: wrap; justify-content: space-around; gap: 20px; margin-bottom: 40px;">
-  <div style="flex: 1 1 45%; background-color: #f9f9f9; padding: 20px; border-radius: 10px; border: 1px solid #eee; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
-    <h3 style="color: #4CAF50; margin-top: 0;"><span style="margin-right: 5px;">🤖</span> Broad Model Compatibility</h3>
-    <ul style="list-style-type: disc; padding-left: 20px;">
-      <li>Train a vast array of models including LLaMA, Mistral, Mixtral, Pythia, and many more.</li>
-      <li>Fully compatible with HuggingFace transformers causal language models, ensuring wide adoption.</li>
-    </ul>
-  </div>
+## 🚀 Quick Start

-  <div style="flex: 1 1 45%; background-color: #f9f9f9; padding: 20px; border-radius: 10px; border: 1px solid #eee; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
-    <h3 style="color: #4CAF50; margin-top: 0;"><span style="margin-right: 5px;">🔧</span> Diverse Training Methodologies</h3>
-    <ul style="list-style-type: disc; padding-left: 20px;">
-      <li>Full fine-tuning, LoRA, QLoRA, GPTQ, QAT.</li>
-      <li>Preference Tuning: DPO, IPO, KTO, ORPO.</li>
-      <li>Advanced RL: GRPO.</li>
-      <li>Multimodal and Reward Modelling (RM) / Process Reward Modelling (PRM).</li>
-    </ul>
-  </div>
+**Requirements**:

-  <div style="flex: 1 1 45%; background-color: #f9f9f9; padding: 20px; border-radius: 10px; border: 1px solid #eee; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
-    <h3 style="color: #4CAF50; margin-top: 0;"><span style="margin-right: 5px;">⚙️</span> Streamlined Configuration</h3>
-    <ul style="list-style-type: disc; padding-left: 20px;">
-      <li>Utilize a single, intuitive YAML file across dataset preprocess, training, evaluation, quantization, and inference.</li>
-    </ul>
-  </div>
+- NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
+- Python 3.11
+- PyTorch ≥2.4.1

-  <div style="flex: 1 1 45%; background-color: #f9f9f9; padding: 20px; border-radius: 10px; border: 1px solid #eee; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
-    <h3 style="color: #4CAF50; margin-top: 0;"><span style="margin-right: 5px;">⚡</span> Cutting-Edge Performance Optimizations</h3>
-    <ul style="list-style-type: disc; padding-left: 20px;">
-      <li><a href="https://docs.axolotl.ai/docs/multipack.html" style="color: #007bff;">Multipacking</a>, <a href="https://github.com/Dao-AILab/flash-attention" style="color: #007bff;">Flash Attention</a>, <a href="https://github.com/facebookresearch/xformers" style="color: #007bff;">Xformers</a>, <a href="https://pytorch.org/blog/flexattention/" style="color: #007bff;">Flex Attention</a>, <a href="https://github.com/linkedin/Liger-Kernel" style="color: #007bff;">Liger Kernel</a>, <a href="https://github.com/apple/ml-cross-entropy/tree/main" style="color: #007bff;">Cut Cross Entropy</a>.</li>
-      <li>Sequence Parallelism (SP), LoRA optimizations.</li>
-      <li>Multi-GPU training (FSDP1, FSDP2, DeepSpeed), Multi-node training (Torchrun, Ray), and many more!</li>
-    </ul>
-  </div>
+### Installation

-  <div style="flex: 1 1 45%; background-color: #f9f9f9; padding: 20px; border-radius: 10px; border: 1px solid #eee; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
-    <h3 style="color: #4CAF50; margin-top: 0;"><span style="margin-right: 5px;">📂</span> Flexible Data Handling</h3>
-    <ul style="list-style-type: disc; padding-left: 20px;">
-      <li>Load datasets from local paths, HuggingFace Hub, and major cloud providers (S3, Azure, GCP, OCI).</li>
-    </ul>
-  </div>
-
-  <div style="flex: 1 1 45%; background-color: #f9f9f9; padding: 20px; border-radius: 10px; border: 1px solid #eee; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
-    <h3 style="color: #4CAF50; margin-top: 0;"><span style="margin-right: 5px;">☁️</span> Cloud-Ready & Deployable</h3>
-    <ul style="list-style-type: disc; padding-left: 20px;">
-      <li>Official <a href="https://hub.docker.com/u/axolotlai" style="color: #007bff;">Docker images</a> and <a href="https://pypi.org/project/axolotl/" style="color: #007bff;">PyPI packages</a> for seamless integration on cloud platforms and local hardware.</li>
-    </ul>
-  </div>
-</div>
-
-<h2 style="color: #007bff;"><span style="margin-right: 10px;">🚀</span> Quick Start: Get Fine-tuning in Minutes!</h2>
-
-<div style="background-color: #e6f7ff; padding: 25px; border-radius: 12px; margin-bottom: 30px; border: 1px solid #cceeff;">
-  <h3 style="color: #0056b3; margin-top: 0;">Requirements:</h3>
-  <ul style="list-style-type: none; padding-left: 0;">
-    <li style="margin-bottom: 5px;"><span style="color: #333; font-weight: bold;">▶ NVIDIA GPU</span> (Ampere or newer for `bf16` and Flash Attention) or AMD GPU</li>
-    <li style="margin-bottom: 5px;"><span style="color: #333; font-weight: bold;">▶ Python 3.11</span></li>
-    <li style="margin-bottom: 5px;"><span style="color: #333; font-weight: bold;">▶ PyTorch ≥2.5.1</span></li>
-  </ul>
-
-  <h3 style="color: #0056b3;">Installation:</h3>
-  <pre><code style="background-color: #eef; padding: 15px; border-radius: 8px; display: block; overflow-x: auto;">pip3 install -U packaging==23.2 setuptools==75.8.0 wheel ninja
+```bash
+pip3 install -U packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]

 # Download example axolotl configs, deepspeed configs
 axolotl fetch examples
-axolotl fetch deepspeed_configs  # OPTIONAL</code></pre>
-  <p style="font-size: 0.9em; color: #555;">Other installation approaches are described <a href="https://docs.axolotl.ai/docs/installation.html" style="color: #007bff; text-decoration: none;">here</a>.</p>
+axolotl fetch deepspeed_configs  # OPTIONAL
+```

-  <h3 style="color: #0056b3;">Your First Fine-tune:</h3>
-  <pre><code style="background-color: #eef; padding: 15px; border-radius: 8px; display: block; overflow-x: auto;"># Fetch axolotl examples
+Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).
+
+### Your First Fine-tune
+
+```bash
+# Fetch axolotl examples
 axolotl fetch examples

 # Or, specify a custom path
 axolotl fetch examples --dest path/to/folder

 # Train a model using LoRA
-axolotl train examples/llama-3/lora-1b.yml</code></pre>
-  <p style="text-align: center; font-size: 1.1em; font-weight: bold; margin-top: 20px;">
-    That's it! Check out our <a href="https://docs.axolotl.ai/docs/getting-started.html" style="background-color: #28a745; color: white; padding: 12px 25px; border-radius: 8px; text-decoration: none; display: inline-block; transition: background-color 0.3s ease;"> Getting Started Guide ➜</a> for a more detailed walkthrough.
-  </p>
-</div>
+axolotl train examples/llama-3/lora-1b.yml
+```

-<h2 style="color: #8A2BE2;"><span style="margin-right: 10px;">📚</span> Comprehensive Documentation: Unlock Axolotl's Full Potential</h2>
+That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough.

-<div style="background-color: #f7f0ff; padding: 25px; border-radius: 12px; margin-bottom: 30px; border: 1px solid #e0caff;">
-  <p style="text-align: center; font-size: 1.1em; color: #333;">Dive deep into Axolotl's capabilities with our extensive documentation:</p>
-  <ul style="list-style-type: none; padding-left: 0; text-align: center;">
-    <li style="margin-bottom: 10px;"><a href="https://docs.axolotl.ai/docs/installation.html" style="color: #5d2b99; text-decoration: none; font-weight: bold;"> Installation Options</a> - Detailed setup instructions for different environments</li>
-    <li style="margin-bottom: 10px;"><a href="https://docs.axolotl.ai/docs/config.html" style="color: #5d2b99; text-decoration: none; font-weight: bold;"> Configuration Guide</a> - Full configuration options and examples</li>
-    <li style="margin-bottom: 10px;"><a href="https://docs.axolotl.ai/docs/dataset_loading.html" style="color: #5d2b99; text-decoration: none; font-weight: bold;"> Dataset Loading</a> - Loading datasets from various sources</li>
-    <li style="margin-bottom: 10px;"><a href="https://docs.axolotl.ai/docs/dataset-formats/" style="color: #5d2b99; text-decoration: none; font-weight: bold;"> Dataset Guide</a> - Supported formats and how to use them</li>
-    <li style="margin-bottom: 10px;"><a href="https://docs.axolotl.ai/docs/multi-gpu.html" style="color: #5d2b99; text-decoration: none; font-weight: bold;"> Multi-GPU Training</a></li>
-    <li style="margin-bottom: 10px;"><a href="https://docs.axolotl.ai/docs/multi-node.html" style="color: #5d2b99; text-decoration: none; font-weight: bold;"> Multi-Node Training</a></li>
-    <li style="margin-bottom: 10px;"><a href="https://docs.axolotl.ai/docs/multipack.html" style="color: #5d2b99; text-decoration: none; font-weight: bold;"> Multipacking</a></li>
-    <li style="margin-bottom: 10px;"><a href="https://docs.axolotl.ai/docs/api/" style="color: #5d2b99; text-decoration: none; font-weight: bold;"> API Reference</a> - Auto-generated code documentation</li>
-    <li style="margin-bottom: 0px;"><a href="https://docs.axolotl.ai/docs/faq.html" style="color: #5d2b99; text-decoration: none; font-weight: bold;">❓ FAQ</a> - Frequently asked questions</li>
-  </ul>
-</div>
+## ✨ Key Features

-<h2 style="color: #FF8C00;"><span style="margin-right: 10px;">🤝</span> Need Help? We're Here for You!</h2>
-<ul style="list-style-type: none; padding-left: 0;">
-    <li style="margin-bottom: 10px;"><span style="font-size: 1.2em; color: #7289DA;"></span> Join our vibrant <a href="https://discord.gg/HhrNrHJPRb" style="color: #7289DA; text-decoration: none; font-weight: bold;">Discord community</a> for real-time support and discussions.</li>
-    <li style="margin-bottom: 10px;"><span style="font-size: 1.2em; color: #555;"></span> Explore our <a href="https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/" style="color: #FF8C00; text-decoration: none; font-weight: bold;">Examples</a> directory for practical use cases.</li>
-    <li style="margin-bottom: 10px;"><span style="font-size: 1.2em; color: #555;"></span> Read our <a href="https://docs.axolotl.ai/docs/debugging.html" style="color: #FF8C00; text-decoration: none; font-weight: bold;">Debugging Guide</a> for troubleshooting tips.</li>
-    <li style="margin-bottom: 0px;"><span style="font-size: 1.2em; color: #007bff;">✉</span> Need dedicated support? Please contact <a href="mailto:wing@axolotl.ai" style="color: #007bff; text-decoration: none; font-weight: bold;">wing@axolotl.ai</a> for professional assistance options.</li>
-</ul>
+- **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more
+- **Training Methods**: Full fine-tuning, LoRA, QLoRA, and more
+- **Easy Configuration**: Simple YAML files to control your training setup
+- **Performance Optimizations**: Flash Attention, xformers, multi-GPU training
+- **Flexible Dataset Handling**: Use various formats and custom datasets
+- **Cloud Ready**: Run on cloud platforms or local hardware

-<h2 style="color: #FF1493;"><span style="margin-right: 10px;">🌟</span> Contribute to Axolotl!</h2>
-<p style="font-size: 1.1em;">
-  Contributions are always welcome and highly appreciated! Axolotl thrives on community support. Please see our <a href="https://github.com/axolotl-ai-cloud/axolotl/blob/main/.github/CONTRIBUTING.md" style="color: #FF1493; text-decoration: none; font-weight: bold;">Contributing Guide</a> for details on how you can help make Axolotl even better.
-</p>
+## 📚 Documentation

-<div align="center" style="margin-top: 40px; padding: 25px; background-color: #f8f8f8; border-radius: 12px; border: 1px solid #eee;">
-  <h2 style="color: #FF69B4; margin-bottom: 20px;">❤️ Our Esteemed Sponsors</h2>
-  <p style="font-size: 1.1em; color: #555;">A huge thank you to our visionary sponsors who provide the essential resources to keep Axolotl at the forefront of LLM fine-tuning:</p>
-  <a href="https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl" target="_blank" style="display: inline-block; margin: 20px;">
-    <img src="https://assets-global.website-files.com/6247c4c1d68352614b7e87ae/63b27b3b44b82d02c8163f4f_logo-dark-square.png" alt="Modal Logo" width="180" style="vertical-align: middle; border-radius: 8px; box-shadow: 0 4px 10px rgba(0,0,0,0.15);"/>
-  </a>
-  <p style="font-size: 0.9em; color: #777; margin-top: 20px;">
-    <strong>Modal:</strong> Revolutionizing cloud computing for Gen AI. Run jobs, deploy models, and fine-tune LLMs at scale with ease.
-  </p>
-  <p style="font-size: 1em; color: #555; margin-top: 30px;">
-    Interested in powering the future of Axolotl? <span style="font-weight: bold; color: #FF69B4;">Become a sponsor!</span> Contact us at <a href="mailto:wing@axolotl.ai" style="color: #007bff; text-decoration: none;">wing@axolotl.ai</a>
-  </p>
-</div>
+- [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments
+- [Configuration Guide](https://docs.axolotl.ai/docs/config.html) - Full configuration options and examples
+- [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them
+- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
+- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
+- [Multipacking](https://docs.axolotl.ai/docs/multipack.html)
+- [API Reference](https://docs.axolotl.ai/docs/api/) - Auto-generated code documentation
+- [FAQ](https://docs.axolotl.ai/docs/faq.html) - Frequently asked questions

-<h2 style="color: #6A5ACD;"><span style="margin-right: 10px;">📜</span> License</h2>
-<p style="font-size: 1.1em;">
-  This project is proudly licensed under the <span style="font-weight: bold; color: #6A5ACD;">Apache 2.0 License</span>. See the <a href="LICENSE" style="color: #007bff; text-decoration: none;">LICENSE</a> file for full details.
-</p>
+## 🤝 Getting Help
+
+- Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support
+- Check out our [Examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/) directory
+- Read our [Debugging Guide](https://docs.axolotl.ai/docs/debugging.html)
+- Need dedicated support? Please contact [✉️wing@axolotl.ai](mailto:wing@axolotl.ai) for options
+
+## 🌟 Contributing
+
+Contributions are welcome! Please see our [Contributing Guide](https://github.com/axolotl-ai-cloud/axolotl/blob/main/.github/CONTRIBUTING.md) for details.
+
+## Supported Models
+
+|             | fp16/fp32 | lora | qlora | gptq | gptq w/flash attn | flash attn | xformers attn |
+|-------------|:----------|:-----|-------|------|-------------------|------------|--------------|
+| llama       | ✅         | ✅    | ✅     | ✅             | ✅                 | ✅          | ✅            |
+| Mistral     | ✅         | ✅    | ✅     | ✅             | ✅                 | ✅          | ✅            |
+| Mixtral-MoE | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
+| Mixtral8X22 | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
+| Pythia      | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
+| cerebras    | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
+| btlm        | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
+| mpt         | ✅         | ❌    | ❓     | ❌             | ❌                 | ❌          | ❓            |
+| falcon      | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
+| gpt-j       | ✅         | ✅    | ✅     | ❌             | ❌                 | ❓          | ❓            |
+| XGen        | ✅         | ❓    | ✅     | ❓             | ❓                 | ❓          | ✅            |
+| phi         | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
+| RWKV        | ✅         | ❓    | ❓     | ❓             | ❓                 | ❓          | ❓            |
+| Qwen        | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
+| Gemma       | ✅         | ✅    | ✅     | ❓             | ❓                 | ✅          | ❓            |
+| Jamba       | ✅         | ✅    | ✅     | ❓             | ❓                 | ✅          | ❓            |
+
+✅: supported
+❌: not supported
+❓: untested
+
+## ❤️ Sponsors
+
+Thank you to our sponsors who help make Axolotl possible:
+
+- [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl) - Modal lets you run
+jobs in the cloud, by just writing a few lines of Python. Customers use Modal to deploy Gen AI models at large scale,
+fine-tune large language models, run protein folding simulations, and much more.
+
+Interested in sponsoring? Contact us at [wing@axolotl.ai](mailto:wing@axolotl.ai)
+
+## 📜 License
+
+This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details.
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -17,9 +17,7 @@ quartodoc:
        - convert
        - prompt_tokenizers
        - logging_config
-        - core.builders.base
-        - core.builders.causal
-        - core.builders.rl
+        - core.trainer_builder
        - core.training_args
        - core.chat.messages
        - core.chat.format.chatml
@@ -45,37 +43,13 @@ quartodoc:
        - cli.vllm_serve
        - cli.cloud.base
        - cli.cloud.modal_
-        - cli.quantize
    - title: Trainers
      desc: Training implementations
      contents:
        - core.trainers.base
        - core.trainers.trl
-        - core.trainers.mamba
-        - core.trainers.relora
        - core.trainers.dpo.trainer
        - core.trainers.grpo.trainer
-        - core.trainers.grpo.sampler
-        - core.trainers.utils
-    - title: Model Loading
-      desc: Functionality for loading and patching models, tokenizers, etc.
-      contents:
-        - loaders.model
-        - loaders.tokenizer
-        - loaders.processor
-        - loaders.adapter
-        - loaders.patch_manager
-        - loaders.constants
-    - title: Mixins
-      desc: Mixin classes for augmenting trainers
-      contents:
-        - core.trainers.mixins.optimizer
-        - core.trainers.mixins.rng_state_loader
-        - core.trainers.mixins.scheduler
-    - title: Context Managers
-      desc: Context managers for altering trainer behaviors
-      contents:
-        - utils.ctx_managers.sequence_parallel
    - title: Prompt Strategies
      desc: Prompt formatting strategies
      contents:
@@ -112,7 +86,7 @@ quartodoc:
        - kernels.swiglu
        - kernels.quantize
        - kernels.utils
-    - title: Monkey Patches
+    - title: MonkeyPatches
      desc: Runtime patches for model optimizations
      contents:
        - monkeypatch.llama_attn_hijack_flash
@@ -129,16 +103,17 @@ quartodoc:
        - monkeypatch.trainer_fsdp_optim
        - monkeypatch.transformers_fa_utils
        - monkeypatch.unsloth_
+        - monkeypatch.attention.mllama
        - monkeypatch.data.batch_dataset_fetcher
        - monkeypatch.mixtral
-        - monkeypatch.gradient_checkpointing.offload_cpu
-        - monkeypatch.gradient_checkpointing.offload_disk
    - title: Utils
      desc: Utility functions
      contents:
+        - utils.models
        - utils.tokenization
        - utils.chat_templates
        - utils.lora
+        - utils.lora_embeddings
        - utils.model_shard_quant
        - utils.bench
        - utils.freeze
@@ -149,7 +124,7 @@ quartodoc:
        - utils.optimizers.adopt
        - utils.data.pretraining
        - utils.data.sft
-        - utils.quantization
+        - utils.gradient_checkpointing.unsloth
    - title: Schemas
      desc: Pydantic data models for Axolotl config
      contents:
@@ -199,14 +174,12 @@ quartodoc:
        - utils.callbacks.lisa
        - utils.callbacks.mlflow_
        - utils.callbacks.comet_
-        - utils.callbacks.qat
+
 website:
  title: "Axolotl"
  description: "We make fine-tuning accessible, scalable, and fun"
  favicon: favicon.jpg

-  google-analytics: "G-9KYCVJBNMQ"
-
  navbar:
    logo: image/axolotl_logo_digital_white.svg
    title: false
@@ -259,8 +232,6 @@ website:
            - docs/lr_groups.qmd
            - docs/lora_optims.qmd
            - docs/dataset_loading.qmd
-            - docs/qat.qmd
-            - docs/quantize.qmd

        - section: "Core Concepts"
          contents:
--- a/cicd/Dockerfile-uv.jinja
+++ b/cicd/Dockerfile-uv.jinja
@@ -1,52 +0,0 @@
-FROM axolotlai/axolotl-base-uv:{{ BASE_TAG }}
-
-ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
-ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
-ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
-ENV CUDA="{{ CUDA }}"
-ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
-ENV GITHUB_REF="{{ GITHUB_REF }}"
-ENV GITHUB_SHA="{{ GITHUB_SHA }}"
-ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
-ENV HF_HOME="{{ HF_HOME }}"
-
-RUN apt-get update && \
-    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
-
-WORKDIR /workspace
-
-RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
-
-WORKDIR /workspace/axolotl
-
-RUN git fetch origin +$GITHUB_REF && \
-    git checkout FETCH_HEAD
-
-# If AXOLOTL_EXTRAS is set, append it in brackets
-RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
-        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
-        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
-        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
-        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
-        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
-    fi
-
-RUN uv pip install packaging==23.2 setuptools==75.8.0
-RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
-    else \
-        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
-    fi
-
-RUN python scripts/unsloth_install.py --uv | sh
-RUN python scripts/cutcrossentropy_install.py --uv | sh
-
-# So we can test the Docker image
-RUN uv pip install -r requirements-dev.txt -r requirements-tests.txt
-
-# fix so that git fetch/pull from remote works
-RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
-    git config --get remote.origin.fetch
-
-# helper for huggingface-login cli
-RUN git config --global credential.helper store
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -18,7 +18,7 @@ pytest -v --durations=10 \
  --cov-append

 # Run patched tests excluding lora kernels with coverage append
-pytest --full-trace -vvv --durations=10 \
+pytest -v --durations=10 \
  --ignore=tests/e2e/patched/lora_kernels \
  /workspace/axolotl/tests/e2e/patched \
  --cov=axolotl \
--- a/cicd/cleanup.py
+++ b/cicd/cleanup.py
@@ -1,19 +0,0 @@
-"""Modal app to run axolotl GPU cleanup"""
-
-from .single_gpu import VOLUME_CONFIG, app, cicd_image, run_cmd
-
-
-@app.function(
-    image=cicd_image,
-    timeout=60 * 60,
-    cpu=8.0,
-    memory=131072,
-    volumes=VOLUME_CONFIG,
-)
-def cleanup():
-    run_cmd("./cicd/cleanup.sh", "/workspace/axolotl")
-
-
-@app.local_entrypoint()
-def main():
-    cleanup.remote()
--- a/cicd/cleanup.sh
+++ b/cicd/cleanup.sh
@@ -1,6 +0,0 @@
-#!/bin/bash
-set -e
-
-# cleanup old cache files for datasets processing and intermediate mappings
-find /workspace/data/huggingface-cache/hub/datasets -name "cache-*" -type f -mtime +1 -exec rm {} \;
-find /workspace/data/huggingface-cache/hub/datasets -name "*.lock" -type f -mtime +1 -exec rm {} \;
--- a/cicd/e2e_tests.py
+++ b/cicd/e2e_tests.py
@@ -1,12 +1,75 @@
 """Modal app to run axolotl GPU tests"""

-from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd
+# pylint: disable=duplicate-code
+
+import os
+import pathlib
+import tempfile
+
+import jinja2
+import modal
+from jinja2 import select_autoescape
+from modal import App, Image
+
+cicd_path = pathlib.Path(__file__).parent.resolve()
+
+template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
+template_env = jinja2.Environment(
+    loader=template_loader, autoescape=select_autoescape()
+)
+df_template = template_env.get_template("Dockerfile.jinja")
+
+df_args = {
+    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
+    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
+    "CUDA": os.environ.get("CUDA", "121"),
+    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
+    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
+    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
+    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
+    "HF_HOME": "/workspace/data/huggingface-cache/hub",
+}
+
+dockerfile_contents = df_template.render(**df_args)
+
+temp_dir = tempfile.mkdtemp()
+with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
+    f.write(dockerfile_contents)
+
+cicd_image = Image.from_dockerfile(
+    pathlib.Path(temp_dir) / "Dockerfile",
+    context_mount=None,
+    force_build=True,
+    gpu="A10G",
+).env(df_args)
+
+app = App("Axolotl CI/CD", secrets=[])
+
+hf_cache_volume = modal.Volume.from_name(
+    "axolotl-ci-hf-hub-cache", create_if_missing=True
+)
+VOLUME_CONFIG = {
+    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
+}
+
+N_GPUS = int(os.environ.get("N_GPUS", 1))
+GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
+
+
+def run_cmd(cmd: str, run_folder: str):
+    import subprocess  # nosec
+
+    # Propagate errors from subprocess.
+    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
+        exit(exit_code)  # pylint: disable=consider-using-sys-exit


@app.function(
    image=cicd_image,
    gpu=GPU_CONFIG,
-    timeout=90 * 60,  # 90 min
+    timeout=60 * 60,
    cpu=8.0,
    memory=131072,
    volumes=VOLUME_CONFIG,
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -24,9 +24,9 @@ df_template = template_env.get_template("Dockerfile.jinja")
 df_args = {
    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.5.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu124-2.5.1"),
-    "CUDA": os.environ.get("CUDA", "124"),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
+    "CUDA": os.environ.get("CUDA", "121"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
@@ -55,7 +55,7 @@ VOLUME_CONFIG = {
 }

 N_GPUS = int(os.environ.get("N_GPUS", 2))
-GPU_CONFIG = f"H100:{N_GPUS}"
+GPU_CONFIG = modal.gpu.H100(count=N_GPUS)


 def run_cmd(cmd: str, run_folder: str):
@@ -70,7 +70,7 @@ def run_cmd(cmd: str, run_folder: str):
    image=cicd_image,
    gpu=GPU_CONFIG,
    timeout=90 * 60,
-    cpu=16.0,
+    cpu=8.0,
    memory=131072 * N_GPUS,
    volumes=VOLUME_CONFIG,
 )
--- a/cicd/single_gpu.py
+++ b/cicd/single_gpu.py
@@ -1,68 +0,0 @@
-"""Modal app to run axolotl GPU tests"""
-
-# pylint: disable=duplicate-code
-
-import os
-import pathlib
-import tempfile
-
-import jinja2
-import modal
-import modal.experimental
-from jinja2 import select_autoescape
-from modal import App
-
-cicd_path = pathlib.Path(__file__).parent.resolve()
-
-template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
-template_env = jinja2.Environment(
-    loader=template_loader, autoescape=select_autoescape()
-)
-dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
-df_template = template_env.get_template(dockerfile)
-
-df_args = {
-    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
-    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.5.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu124-2.5.1"),
-    "CUDA": os.environ.get("CUDA", "124"),
-    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
-    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
-    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
-    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
-    "HF_HOME": "/workspace/data/huggingface-cache/hub",
-}
-
-dockerfile_contents = df_template.render(**df_args)
-
-temp_dir = tempfile.mkdtemp()
-with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
-    f.write(dockerfile_contents)
-
-cicd_image = modal.experimental.raw_dockerfile_image(
-    pathlib.Path(temp_dir) / "Dockerfile",
-    # context_mount=None,
-    force_build=True,
-    # gpu="A10G",
-).env(df_args)
-
-app = App("Axolotl CI/CD", secrets=[])
-
-hf_cache_volume = modal.Volume.from_name(
-    "axolotl-ci-hf-hub-cache", create_if_missing=True
-)
-VOLUME_CONFIG = {
-    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
-}
-
-N_GPUS = int(os.environ.get("N_GPUS", 1))
-GPU_CONFIG = f"L40S:{N_GPUS}"
-
-
-def run_cmd(cmd: str, run_folder: str):
-    import subprocess  # nosec
-
-    # Propagate errors from subprocess.
-    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
-        exit(exit_code)  # pylint: disable=consider-using-sys-exit
--- a/codecov.yml
+++ b/codecov.yml
@@ -19,7 +19,7 @@ coverage:
        if_no_uploads: error
        if_not_found: success
        if_ci_failed: error
-        only_pulls: true
+        only_pulls: false
        flags: null
        paths: null
    patch:
--- a/deepspeed_configs/zero2_torch_compile.json
+++ b/deepspeed_configs/zero2_torch_compile.json
@@ -1,31 +0,0 @@
-{
-  "compile": {
-    "disable": false,
-    "backend": "inductor"
-  },
-  "zero_optimization": {
-    "stage": 2,
-    "offload_optimizer": {
-      "device": "cpu"
-    },
-    "contiguous_gradients": true,
-    "overlap_comm": true
-  },
-  "bf16": {
-    "enabled": "auto"
-  },
-  "fp16": {
-    "enabled": "auto",
-    "auto_cast": false,
-    "loss_scale": 0,
-    "initial_scale_power": 32,
-    "loss_scale_window": 1000,
-    "hysteresis": 2,
-    "min_loss_scale": 1
-  },
-  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
-  "train_batch_size": "auto",
-  "train_micro_batch_size_per_gpu": "auto",
-  "wall_clock_breakdown": false
-}
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -38,6 +38,6 @@ RUN git lfs install --skip-repo && \
    # The base image ships with `pydantic==1.8.2` which is not working
    pip3 install -U --no-cache-dir pydantic==1.10.10

-RUN if [ "$PYTORCH_VERSION" = "2.7.1" ] ; then \
+RUN if [ "$PYTORCH_VERSION" = "2.7.0" ] ; then \
        pip3 install flash-attn==2.7.4.post1; \
    fi
--- a/docker/Dockerfile-base-next
+++ b/docker/Dockerfile-base-next
@@ -29,7 +29,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
 WORKDIR /workspace

 RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
-    python3 -m pip install --no-cache-dir -U torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
+    python3 -m pip install --no-cache-dir -U torch==2.7.0 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"

--- a/docker/Dockerfile-uv-base
+++ b/docker/Dockerfile-uv-base
@@ -1,40 +0,0 @@
-ARG CUDA_VERSION="12.6.3"
-ARG CUDNN_VERSION=""
-ARG UBUNTU_VERSION="22.04"
-ARG MAX_JOBS=4
-
-FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
-
-ARG PYTHON_VERSION="3.11"
-ARG PYTORCH_VERSION="2.6.0"
-ARG CUDA="126"
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
-
-ENV PYTHON_VERSION=$PYTHON_VERSION
-ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
-ENV UV_TORCH_BACKEND="cu${CUDA}"
-
-RUN apt-get update \
-    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config curl && rm -rf /var/lib/apt/lists/* \
-    && git lfs install --skip-repo \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh
-
-ENV PATH="/root/.local/bin:${PATH}"
-
-RUN uv python install ${PYTHON_VERSION}
-
-WORKDIR /workspace
-
-RUN uv venv --no-project --relocatable axolotl-venv
-
-ENV PATH="/workspace/axolotl-venv/bin:${PATH}"
-
-RUN uv pip install packaging setuptools wheel psutil \
-    && uv pip install torch==${PYTORCH_VERSION} \
-    && uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \
-    && uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \
-    && uv pip install awscli pydantic
-
-RUN if [ "$PYTORCH_VERSION" = "2.7.1" ] ; then \
-        uv pip install --no-build-isolation flash-attn==2.7.4.post1; \
-    fi
--- a/docs/cli.qmd
+++ b/docs/cli.qmd
@@ -209,16 +209,6 @@ axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir

 This would be necessary to use with other frameworks. If you have an adapter, merge it with the non-quantized linearized model before delinearizing.

-### quantize
-
-Quantizes a model using the quantization configuration specified in your YAML file.
-
-```bash
-axolotl quantize config.yml
-```
-
-See [Quantization](./quantize.qmd) for more details.
-

 ## Legacy CLI Usage

--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -27,8 +27,6 @@ trust_remote_code:
 tokenizer_use_fast:
 # Whether to use the legacy tokenizer setting, defaults to True
 tokenizer_legacy:
-# Whether to use mistral-common tokenizer. If set to True, it will use the mistral-common tokenizer.
-tokenizer_use_mistral_common:
 # Resize the model embeddings when new tokens are added to multiples of 32
 # This is reported to improve training speed on some models
 resize_token_embeddings_to_32x:
@@ -67,20 +65,6 @@ bnb_config_kwargs:
  bnb_4bit_quant_type: nf4
  bnb_4bit_use_double_quant: true

-# quantization aware training
-qat:
-  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
-  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8"
-  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
-  fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after
-
-# post-training quantization
-quantization:
-  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8
-  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
-  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
-  quantize_embedding: # Optional[bool] = False. Whether to quantize the embedding layer.
-

 # Whether you are training a 4-bit GPTQ quantized model
 gptq: true
@@ -114,10 +98,8 @@ plugins:
  # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

 # A list of one or more datasets to finetune the model with
-# See https://docs.axolotl.ai/docs/dataset_loading.html for guide on loading datasets
-# See https://docs.axolotl.ai/docs/dataset-formats/ for guide on dataset formats
 datasets:
-  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory
+  # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
  - path: vicgalle/alpaca-gpt4
    # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
@@ -175,10 +157,6 @@ datasets:
    # Key containing the messages (default: "messages")
    field_messages: messages

-    # Key containing the tools (default: "tools")
-    # Must be a list[dict] and follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
-    field_tools: tools
-
    # Key containing the system message (default: "system")
    # If the system message is not present in the dataset sample, it will be loaded from the field_system property.
    field_system: system
@@ -243,7 +221,7 @@ datasets:
 # The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
 shuffle_merged_datasets: true

-# Deduplicates datasets and test_datasets with identical entries.
+Deduplicates datasets and test_datasets with identical entries.
 dataset_exact_deduplication: true

 # A list of one or more datasets to eval the model with.
@@ -292,25 +270,10 @@ trl:

  num_generations: # Optional[int]. Number of generations to sample.
  log_completions: # Optional[bool]. Whether to log completions.
-  num_completions_to_print: # Optional[int]. Number of completions to print when log_completions is True.

  sync_ref_model: # Optional[bool]. Whether to sync the reference model.
  ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.
  ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.
-  scale_rewards: # Optional[bool]. Whether to scale rewards by their standard deviation.
-
-  temperature: # Optional[float]. Sampling temperature for the GRPO policy.
-  top_p: # Optional[float]. Top-p sampling probability for the generation policy.
-  top_k: # Optional[int]. Top-k sampling for the generation policy.
-  min_p: # Optional[float]. Minimum probability for the generation policy.
-  repetition_penalty: # Optional[float]. Penalty for tokens that appear in prompt and generated text.
-
-  num_iterations: # Optional[int]. Number of iterations per batch (μ) for GRPO.
-  epsilon: # Optional[float]. Epsilon value for clipping in the GRPO algorithm.
-  epsilon_high: # Optional[float]. Upper-bound epsilon value for clipping in the GRPO algorithm.
-  use_liger_loss: # Optional[bool]. Whether to use Liger loss for GRPO.
-  loss_type: # Optional[str]. Loss formulation to use. Supported values: grpo, bnpo, dr_grpo.
-  mask_truncated_completions: # Optional[bool]. Whether to exclude truncated completions from loss calculation.


 # reward modelling: `True` or `False`
@@ -520,7 +483,6 @@ output_dir: ./completed-model
 # setting to `auto` will enable torch compile when torch>=2.5.1
 torch_compile:  # Optional[Union[Literal["auto"], bool]]
 torch_compile_backend:  # Optional[str]
-torch_compile_mode:  # 'default' | 'reduce-overhead' | 'max-autotune'

 # Training hyperparameters

@@ -543,7 +505,6 @@ save_strategy: # Set to `"no"` to skip checkpoint saves, `"epoch"` at end of eac
 save_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps
 saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
 save_total_limit: # Checkpoints saved at a time
-save_only_model: # Save only the model weights, skipping the optimizer. Using this means you can't resume from checkpoints.
 # Maximum number of iterations to train for. It precedes num_epochs which means that
 # if both are set, num_epochs will not be guaranteed.
 # e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
@@ -567,7 +528,7 @@ profiler_steps: # enable the pytorch profiler to capture the first N steps of tr
 loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
 loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)

-# Save model as safetensors (require safetensors package). Default True
+# Save model as safetensors (require safetensors package)
 save_safetensors:

 # Whether to mask out or include the human's prompt from the training labels
@@ -577,7 +538,7 @@ train_on_inputs: false
 # Note that training loss may have an oscillating pattern with this enabled.
 group_by_length: false

-# Whether to use gradient checkpointing. Available options are: true, false, "offload", "offload_disk".
+# Whether to use gradient checkpointing. Available options are: true, false, "offload".
 # https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
 gradient_checkpointing: false
 # additional kwargs to pass to the trainer for gradient checkpointing
@@ -589,24 +550,7 @@ gradient_checkpointing: false
 early_stopping_patience: 3

 # Specify a scheduler and kwargs to use with the optimizer
-# Valid values are driven by the Transformers SchedulerType class, see:
-# https://github.com/huggingface/transformers/blob/5f4ecf2d9f867a1255131d2461d75793c0cf1db2/src/transformers/trainer_utils.py#L420
-# Valid values include
-# - 'linear'
-# - 'cosine' (default)
-# - 'cosine_with_restarts'
-# - 'polynomial'
-# - 'constant'
-# - 'constant_with_warmup'
-# - 'inverse_sqrt'
-# - 'reduce_lr_on_plateau'
-# - 'cosine_with_min_lr'
-# - 'warmup_stable_decay'
-
-# Additional schedulers include:
-# - 'one_cycle'
-# - 'rex'
-lr_scheduler:
+lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | 'linear' | 'cosine_with_restarts' | 'polynomial' | 'constant' | 'constant_with_warmup' | 'inverse_sqrt' | 'reduce_lr_on_plateau' | 'cosine_with_min_lr' | 'warmup_stable_decay' | empty for cosine
 lr_scheduler_kwargs:
 cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
 cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
@@ -624,7 +568,7 @@ lr_div_factor: # Learning rate div factor
 #
 # Valid values for 'optimizer' include:
 # - adamw_torch
-# - adamw_torch_fused (default)
+# - adamw_torch_fused
 # - adamw_torch_xla
 # - adamw_torch_npu_fused
 # - adamw_apex_fused
@@ -668,7 +612,6 @@ lr_div_factor: # Learning rate div factor
 # - optimi_adamw
 # - ao_adamw_8bit
 # - ao_adamw_fp8
-# - came_pytorch
 optimizer:
 # Dictionary of arguments to pass to the optimizer
 optim_args:
@@ -688,9 +631,7 @@ weight_decay:
 # adamw hyperparams
 adam_beta1:
 adam_beta2:
-adam_beta3:  # only used for CAME Optimizer
 adam_epsilon:
-adam_epsilon2:  # only used for CAME Optimizer
 # Gradient clipping max norm
 max_grad_norm:

--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -52,9 +52,7 @@ We recommend checking the below examples for other usecases.

 ### Examples

-#### Training on last message
-
-(Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
+1. (Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.

 ```yaml
 datasets:
@@ -68,9 +66,7 @@ datasets:
 If you receive an error like "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null.", it means the tokenizer does not have a default `chat_template`. Follow the examples below instead to set a custom `chat_template`.
 :::

-#### Overriding default chat template
-
-Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.
+2. Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.

 ```yaml
 chat_template: gemma # this overwrites the tokenizer's chat_template
@@ -80,13 +76,7 @@ datasets:
    roles_to_train: ["assistant"]  # default value
 ```

-::: {.callout-note}
-If you want to use built-in chat_template, use `chat_template: tokenizer_default` (this is set by default).
-:::
-
-#### Using default chat template with fallback
-
-Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.
+3. Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.

 ```yaml
 chat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer's chat_template
@@ -95,9 +85,7 @@ datasets:
    type: chat_template
 ```

-#### Custom Jinja template
-
-Using a custom jinja template on OpenAI messages format, training on all assistant messages.
+4. Using a custom jinja template on OpenAI messages format, training on all assistant messages.

 ```yaml
 # chat_template: jinja # `jinja` will be implied if the `chat_template_jinja` is set and this field is empty
@@ -112,9 +100,7 @@ datasets:
 Please make sure that your `tokenizer.eos_token` is same as EOS (End-of-Sequence) token in template. Otherwise, set `eos_token` under `special_tokens: `.
 :::

-#### Using template with different token for EOT and EOS
-
- If you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: ` which defaults to turn.
+5. If you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: ` which defaults to turn.

 ```yaml
 eot_tokens:
@@ -139,7 +125,7 @@ Using `eot_tokens` requires each token that exists in `chat_template` to be a si
 You can add those tokens as new tokens under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `. See [config](../config.qmd) for more details.
 :::

- Continuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set `train_on_eos: last`.
+6. Continuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set `train_on_eos: last`.

 ```yaml
 eot_tokens:
@@ -159,73 +145,7 @@ If EOS token only appears at the end of a prompt, `train_on_eos: last` is equiva
 :::


-#### Using tool use
-
-Instead of passing `tools` via the system prompt, an alternative method would be to have the `tools` in a separate column and loaded via `chat_template` to let the template dynamically build it.
-
-```json
-{
-    "tools": [
-        {
-            "type": "...",
-            "function": {
-                "name": "...",
-                "description": "...",
-                "parameters": {
-                    "type": "...",
-                    "properties": {
-                        // ...
-                    },
-                    "required": ["..."],
-                },
-            },
-        },
-    ],
-    "messages": [
-        // ...
-        {
-            "role": "assistant", // call the function via assistant
-            "tool_calls": [
-                {
-                    "type": "function",
-                    "function": {
-                        "name": "...",
-                        "arguments": {
-                            "...": "...",
-                        }
-                    }
-                }
-            ]
-        },
-        {
-            "role": "tool",
-            "name": "...",
-            "content": "..."
-        },
-    ],
-}
-```
-
-::: {.callout-note}
-Tools need to follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
-:::
-
-```yaml
-chat_template: llama4
-datasets:
-  - path: ...
-    type: chat_template
-    # field_tools: tools # default is `tools`
-```
-
-::: {.callout-tip}
-Look into the `chat_template` you are using to see if it supports `tools` and what the expected role is for the tool answer. In the example above, the tool answer is expected to be in the `tool` or `ipython` role for `llama4` template.
-:::
-
-
-#### Using fine-grained control over token masking
-
-(Advanced) Using fine-grained control over tokens and turns to train in a conversation
+7. (Advanced) Using fine-grained control over tokens and turns to train in a conversation

 For a data sample that looks like:

@@ -276,9 +196,7 @@ datasets:
 It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
 :::

-#### Reasoning split
-
-(For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
+8. (For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.

 ```yaml
 datasets:
--- a/docs/dataset-formats/index.qmd
+++ b/docs/dataset-formats/index.qmd
@@ -36,6 +36,10 @@ It is typically recommended to save your dataset as `.jsonl` due to its flexibil

 Axolotl supports loading from a Hugging Face hub repo or from local files.

+::: {.callout-important}
+For pre-training only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts.
+:::
+
 ### Pre-training from Hugging Face hub datasets

 As an example, to train using a Hugging Face dataset `hf_org/name`, you can pass the following config:
@@ -73,21 +77,18 @@ datasets:
    type: completion
 ```

-From local files:
+From local files (either example works):

 ```yaml
 datasets:
  - path: A.jsonl
    type: completion

-  - path: B.jsonl
+  - path: json
+    data_files: ["A.jsonl", "B.jsonl", "C.jsonl"]
    type: completion
 ```

-::: {.callout-important}
-For `completion` only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts. If you are interested in having this for `pretraining_dataset` too, please let us know or help make a PR!
-:::
-
 ### Pre-training dataset configuration tips

 #### Setting max_steps
--- a/docs/dataset_loading.qmd
+++ b/docs/dataset_loading.qmd
@@ -54,7 +54,7 @@ datasets:

 #### Files

-To load a JSON file, you would do something like this:
+Usually, to load a JSON file, you would do something like this:

 ```python
 from datasets import load_dataset
@@ -66,11 +66,19 @@ Which translates to the following config:

 ```yaml
 datasets:
-  - path: data.json
-    ds_type: json
+  - path: json
+    data_files: /path/to/your/file.jsonl
 ```

-In the example above, it can be seen that we can just point the `path` to the file or directory along with the `ds_type` to load the dataset.
+However, to make things easier, we have added a few shortcuts for loading local dataset files.
+
+You can just point the `path` to the file or directory along with the `ds_type` to load the dataset. The below example shows for a JSON file:
+
+```yaml
+datasets:
+  - path: /path/to/your/file.jsonl
+    ds_type: json
+```

 This works for CSV, JSON, Parquet, and Arrow files.

--- a/docs/docker.qmd
+++ b/docs/docker.qmd
@@ -8,10 +8,6 @@ format:

 This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).

-::: {.callout-important}
-For Blackwell GPUs, please use the tags with Pytorch 2.7.1 and CUDA 12.8.
-:::
-
 ## Base

 The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.
@@ -32,10 +28,11 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}

 Tags examples:

- `main-base-py3.11-cu128-2.7.1`
- `main-base-py3.11-cu126-2.7.1`
+- `main-base-py3.11-cu128-2.7.0`
+- `main-base-py3.11-cu126-2.7.0`
 - `main-base-py3.11-cu124-2.6.0`
 - `main-base-py3.11-cu124-2.5.1`
+- `main-base-py3.11-cu124-2.4.1`

 ## Main

@@ -76,10 +73,12 @@ Tags examples:
 - `main-py3.11-cu126-2.7.0`
 - `main-py3.11-cu124-2.6.0`
 - `main-py3.11-cu124-2.5.1`
+- `main-py3.11-cu124-2.4.1`
 - `main-latest`
 - `main-20250303-py3.11-cu124-2.6.0`
 - `main-20250303-py3.11-cu124-2.5.1`
- `0.9.2`
+- `main-20250303-py3.11-cu124-2.4.1`
+- `0.7.1`

 ## Cloud

--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -110,17 +110,3 @@ description: Frequently asked questions
 > A: If `eot_tokens: ` is not provided, the default behavior is the same as before. EOS tokens used to delimit turns are masked/unmasked depending on whether the turn is trainable.

 > Internally, `eot_tokens: tokenizer.eos_token` and `train_on_eot: train_on_eos` (which defaults to `turn`). This transition helps clarify the naming and behavior of EOT/EOS tokens.
-
-**Q: `Data processing error: CAS service error`**
-
-> A: Try disabling XET with `export HF_HUB_DISABLE_XET=1`
-
-**Q: `torch._inductor.exc.LoweringException: NoValidChoicesError: No choices to select, please consider adding ATEN into max_autotune_gemm_backends config (defined in torch/_inductor/config.py) to allow at least one choice. `**
-
-> A: Depending on the version of torch, you may need to include this in your YAML:
-
-> ```yaml
-> flex_attn_compile_kwargs:
->   dynamic: false
->   mode: max-autotune-no-cudagraphs
-> ```
--- a/docs/getting-started.qmd
+++ b/docs/getting-started.qmd
@@ -104,7 +104,7 @@ the `alpaca` dataset format, which has the following format:
 Please see our [Dataset Formats](dataset-formats) for more dataset formats and how to
 format them.

-2. Prepare your JSONL data in the specified format (in this case, the expected `alpaca`
+2. Prepare your JSONL data in the specified format (in this case, the expected `alpaca
 format):

 ```json
@@ -120,12 +120,6 @@ axolotl train my_training.yml

 ## Common Tasks {#sec-common-tasks}

-::: {.callout-tip}
-
-The same yaml file is used for training, inference, and merging.
-
-:::
-
 ### Testing Your Model {#sec-testing}

 After training, test your model:
@@ -134,16 +128,6 @@ After training, test your model:
 axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out"
 ```

-More details can be found in [Inference](inference.qmd).
-
-### Using a UI {#sec-ui}
-
-Launch a Gradio interface:
-
-```bash
-axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out" --gradio
-```
-
 ### Preprocessing Data {#sec-preprocessing}

 For large datasets, preprocess first:
@@ -152,22 +136,14 @@ For large datasets, preprocess first:
 axolotl preprocess my_training.yml
 ```

-Please make sure to set `dataset_prepared_path: ` in your config to set the path to save the prepared dataset.
+### Using a UI {#sec-ui}

-More details can be found in [Dataset Preprocessing](dataset_preprocessing.qmd).
-
-### Merging LoRA weights {#sec-merging-lora}
-
-To merge the LoRA weights back into the base model, run:
+Launch a Gradio interface:

 ```bash
-axolotl merge-lora my_training.yml --lora-model-dir="./outputs/lora-out"
+axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out" --gradio
 ```

-The merged model will be saved in the `{output_dir}/merged` directory.
-
-More details can be found in [Merging LoRA weights](inference.qmd#sec-merging).
-
 ## Next Steps {#sec-next-steps}

 Now that you have the basics, you might want to:
@@ -180,7 +156,6 @@ Now that you have the basics, you might want to:
 Check our other guides for details on these topics:

 - [Configuration Guide](config.qmd) - Full configuration options
- [Dataset Loading](dataset_loading.qmd) - Loading datasets from various sources
 - [Dataset Formats](dataset-formats) - Working with different data formats
 - [Multi-GPU Training](multi-gpu.qmd)
 - [Multi-Node Training](multi-node.qmd)
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -15,7 +15,7 @@ This guide covers all the ways you can install and set up Axolotl for your envir

 - NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
 - Python ≥3.10
- PyTorch ≥2.5.1
+- PyTorch ≥2.4.1

 ## Installation Methods {#sec-installation-methods}

@@ -25,10 +25,6 @@ Please make sure to have Pytorch installed before installing Axolotl in your loc
 Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
 :::

-::: {.callout-important}
-For Blackwell GPUs, please use Pytorch 2.7.0 and CUDA 12.8.
-:::
-
 ### PyPI Installation (Recommended) {#sec-pypi}

 ```{.bash}
@@ -41,40 +37,6 @@ installed) in order not to clobber it, and so that we set the correct version of
 dependencies that are specific to the PyTorch version or other installed
 co-dependencies.

-### uv Installation {#sec-uv}
-
-uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments.
-
-Install uv if not already installed
-```{.bash}
-curl -LsSf https://astral.sh/uv/install.sh | sh
-source $HOME/.local/bin/env
-```
-
-Choose your CUDA version to use with PyTorch; e.g. `cu124`, `cu126`, `cu128`,
-then create the venv and activate
-```{.bash}
-export UV_TORCH_BACKEND=cu126
-uv venv --no-project --relocatable
-source .venv/bin/activate
-```
-
-Install PyTorch
- PyTorch 2.6.0 recommended
-```{.bash}
-uv pip install packaging setuptools wheel
-uv pip install torch==2.6.0
-uv pip install awscli pydantic
-```
-
-Install axolotl from PyPi
-```{.bash}
-uv pip install --no-build-isolation axolotl[deepspeed,flash-attn]
-
-# optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO
-uv pip install --no-build-isolation axolotl[deepspeed,flash-attn,vllm]
-```
-
 ### Edge/Development Build {#sec-edge-build}

 For the latest features between releases:
@@ -110,10 +72,6 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
 ```
 :::

-::: {.callout-important}
-For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.7.0` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.7.0`.
-:::
-
 Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.

 ## Cloud Environments {#sec-cloud}
--- a/docs/lora_optims.qmd
+++ b/docs/lora_optims.qmd
@@ -84,10 +84,6 @@ lora_qkv_kernel: true
 lora_o_kernel: true
 ```

-::: {.callout-note}
-Currently, LoRA kernels are not supported for RLHF training, only SFT.
-:::
-
 ## Requirements

 - One or more NVIDIA or AMD GPUs (in order to use the Triton kernels)
--- a/docs/multi-gpu.qmd
+++ b/docs/multi-gpu.qmd
@@ -87,7 +87,20 @@ We support sequence parallelism (SP) via the
 allows one to split up sequences across GPUs, which is useful in the event that a
 single sequence causes OOM errors during model training.

-See our [dedicated guide](sequence_parallelism.qmd) for more information.
+First, install `ring-flash-attn`, recommended via `pip install axolotl[ring-flash-attn]`,
+or from source with `pip install .[ring-flash-attn]`.
+
+Your Axolotl YAML config should contain the following lines:
+
+```{.yaml}
+sequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU
+flash_attention: true  # Required with sequence parallelism
+
+# Optional; strides across the key dimension. Larger values use more memory but will make training faster.
+heads_k_stride: 1
+```
+
+See our [dedicated guide](sequence_parallelism.qmd) for more details.

 ### FSDP + QLoRA {#sec-fsdp-qlora}

--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -43,7 +43,7 @@ datasets:
 # leave the vision model and vision tower frozen
 # load_in_8bit: true
 adapter: lora
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 # (optional) if you want to resize images to a set size
 image_size: 512
--- a/docs/qat.qmd
+++ b/docs/qat.qmd
@@ -1,32 +0,0 @@
---
-title: "Quantization Aware Training (QAT)"
-back-to-top-navigation: true
-toc: true
-toc-expand: 2
-toc-depth: 4
---
-
-## Overview
-
-[Quantization Aware Training](https://pytorch.org/blog/introduction-to-quantization-on-pytorch/#quantization-aware-training) (QAT) is a technique for improving the accuracy of models which are quantized
-by applying "fake" quantizations to the model's weights (and optionally, activations) during training. This fake
-quantization allows for the model to adjust for noise introduced by the quantization, so when the model is eventually
-quantized, the accuracy loss is minimized. We use the quantization techniques implemented in [torchao](https://github.com/pytorch/ao) to provide
-support for QAT and post-training quantization (PTQ) in axolotl.
-
-We recommend reviewing the excellent QAT tutorial in the [torchtune library](https://pytorch.org/torchtune/main/tutorials/qat_finetune.html#quantizing-the-qat-model),
-and the QAT documentation in the [torchao library](https://github.com/pytorch/ao/tree/main/torchao/quantization/qat), for more details.
-
-## Configuring QAT in Axolotl
-
-To enable QAT in axolotl, add the following to your configuration file:
-
-```yaml
-qat:
-  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
-  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8"
-  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
-  fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after
-```
-
-Once you have finished training, you must quantize your model by using the same quantization configuration which you used to train the model with. You can use the [`quantize`](./quantize.qmd) command to do this.
--- a/docs/quantize.qmd
+++ b/docs/quantize.qmd
@@ -1,53 +0,0 @@
---
-title: "Quantization with torchao"
-back-to-top-navigation: true
-toc: true
-toc-expand: 2
-toc-depth: 4
---
-
-Quantization is a technique to lower the memory footprint of your model, potentially at the cost of accuracy or model performance. We support quantizing your model using the [torchao](https://github.com/pytorch/ao) library. Quantization is supported for both post-training quantization (PTQ) and quantization-aware training (QAT).
-
-
-::: {.callout-note}
-
-We do not currently support quantization techniques such as GGUF/GPTQ,EXL2 at the moment.
-
-:::
-
-## Configuring Quantization in Axolotl
-
-Quantization is configured using the `quantization` key in your configuration file.
-
-```yaml
-base_model: # The path to the model to quantize.
-quantization:
-  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8
-  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
-  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
-  quantize_embedding: # Optional[bool] = False. Whether to quantize the embedding layer.
-
-output_dir:  # The path to the output directory.
-```
-
-Once quantization is complete, your quantized model will be saved in the `{output_dir}/quantized` directory.
-
-You may also use the `quantize` command to quantize a model which has been trained with [QAT](./qat.md) - you can do this by using the existing QAT configuration file which
-you used to train the model:
-
-```yaml
-# qat.yml
-qat:
-  activation_dtype: int8
-  weight_dtype: int8
-  group_size: 256
-  quantize_embedding: true
-
-output_dir: # The path to the output directory used during training where the final checkpoint has been saved.
-```
-
-```bash
-axolotl quantize qat.yml
-```
-
-This ensures that an identical quantization configuration is used to quantize the model as was used to train it.
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -16,8 +16,7 @@ feedback. Various methods include, but not limited to:
 - [Identity Preference Optimization (IPO)](#ipo)
 - [Kahneman-Tversky Optimization (KTO)](#kto)
 - [Odds Ratio Preference Optimization (ORPO)](#orpo)
- [Group Relative Policy Optimization (GRPO)](#grpo)
- Proximal Policy Optimization (PPO) (not yet supported in axolotl, if you're interested in contributing, please reach out!)
+- Proximal Policy Optimization (PPO) (not yet supported in axolotl)


 ## RLHF using Axolotl
@@ -500,7 +499,7 @@ The input format is a simple JSON input with customizable fields based on the ab
 ### GRPO

 ::: {.callout-tip}
-Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/grpo_code).
+Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
 :::

 In the latest GRPO implementation, `vLLM` is used to significantly speedup trajectory generation during training. In this example, we're using 4 GPUs - 2 for training, and 2 for vLLM:
@@ -583,20 +582,7 @@ datasets:

 To see other examples of custom reward functions, please see [TRL GRPO Docs](https://github.com/huggingface/trl/blob/main/docs/source/grpo_trainer.md#using-a-custom-reward-function).

-To see all configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/v0.9.2/src/axolotl/utils/schemas/trl.py).
-
-#### GRPO with DAPO/Dr. GRPO loss
-
-The DAPO paper and subsequently Dr. GRPO paper proposed an alternative loss function for GRPO to remediate the penalty in longer responses.
-
-```yaml
-trl:
-  loss_type: dr_grpo
-  # Normalizes loss based on max completion length (default: 256)
-  max_completion_length:
-```
-
-For more information, see [GRPO docs](https://huggingface.co/docs/trl/v0.17.0/en/grpo_trainer#loss-types).
+To see description of the configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/config/models/input/v0_4_1/trl.py).

 ### SimPO

--- a/docs/sequence_parallelism.qmd
+++ b/docs/sequence_parallelism.qmd
@@ -3,6 +3,8 @@ title: Sequence Parallelism
 description: Train with long sequences split across multiple GPUs.
 ---

+# Sequence Parallelism
+
 Sequence parallelism is a technique that splits sequences across multiple GPUs,
 allowing you to train with very long sequences that wouldn't fit on a single GPU. Each
 GPU processes a different portion of the sequence, and the results are aggregated
@@ -25,7 +27,7 @@ To enable sequence parallelism, add the following to your configuration file:
 sequence_parallel_degree: 4  # Split sequences across 4 GPUs
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 heads_k_stride: 1
-# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
+# Optional; one of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to
 # "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
 ring_attn_func:
 ```
@@ -41,7 +43,7 @@ When sequence parallelism is enabled:

 1. Each sequence is divided into equal chunks across the GPUs in a sequence parallel group
 2. The data collator handles the chunking of input_ids, attention_mask, labels, and position_ids
-3. Position IDs are adjusted to maintain proper relative positions
+3. Position IDs are adjusted to maintain proper relative positions, especially for packed sequences
 4. The trainer uses special ring communication patterns for attention operations

 ## Requirements
@@ -67,11 +69,9 @@ sequence_len: 8192
 ...

 sequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU
+flash_attention: true  # Required with sequence parallelism
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 heads_k_stride: 1
-# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
-# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
-ring_attn_func:

 ...
 ```
--- a/examples/cerebras/btlm-ft.yml
+++ b/examples/cerebras/btlm-ft.yml
@@ -59,9 +59,7 @@ gradient_checkpointing: false
 resume_from_checkpoint:
 logging_steps: 1

-flash_attention: true
-sdp_attention:
-flash_optimum:
+attention: flash

 gptq_groupsize:
 gptq_model_v1:
--- a/examples/cerebras/qlora.yml
+++ b/examples/cerebras/qlora.yml
@@ -39,8 +39,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-xformers_attention: true
-flash_attention:
+attention: xformers
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
--- a/examples/code-llama/13b/lora.yml
+++ b/examples/code-llama/13b/lora.yml
@@ -45,7 +45,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/13b/qlora.yml
+++ b/examples/code-llama/13b/qlora.yml
@@ -46,7 +46,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/34b/lora.yml
+++ b/examples/code-llama/34b/lora.yml
@@ -45,7 +45,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/34b/qlora.yml
+++ b/examples/code-llama/34b/qlora.yml
@@ -46,7 +46,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/7b/lora.yml
+++ b/examples/code-llama/7b/lora.yml
@@ -45,7 +45,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/7b/qlora.yml
+++ b/examples/code-llama/7b/qlora.yml
@@ -46,7 +46,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/cohere/command-r-7b-qlora.yml
+++ b/examples/cohere/command-r-7b-qlora.yml
@@ -49,7 +49,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -112,9 +112,7 @@
    "early_stopping_patience:\n",
    "resume_from_checkpoint:\n",
    "logging_steps: 1\n",
-    "xformers_attention:\n",
-    "flash_attention: false\n",
-    "sdp_attention: true\n",
+    "attention: sdpa\n",
    "\n",
    "warmup_steps: 1\n",
    "max_steps: 25\n",
--- a/examples/dbrx/16bit-lora.yaml
+++ b/examples/dbrx/16bit-lora.yaml
@@ -52,7 +52,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch:
--- a/examples/dbrx/8bit-lora.yaml
+++ b/examples/dbrx/8bit-lora.yaml
@@ -55,7 +55,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch:
--- a/examples/dbrx/fft-ds-zero3.yaml
+++ b/examples/dbrx/fft-ds-zero3.yaml
@@ -39,7 +39,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch:
--- a/examples/deepseek-v2/fft-fsdp-16b.yaml
+++ b/examples/deepseek-v2/fft-fsdp-16b.yaml
@@ -35,7 +35,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 100
 evals_per_epoch: 2
--- a/examples/deepseek-v2/qlora-fsdp-2_5.yaml
+++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
@@ -59,7 +59,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 100
 evals_per_epoch: 2
--- a/examples/falcon/config-7b-lora.yml
+++ b/examples/falcon/config-7b-lora.yml
@@ -43,8 +43,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-xformers_attention: true
-flash_attention:
+attention: xformers
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 40
--- a/examples/falcon/config-7b-qlora.yml
+++ b/examples/falcon/config-7b-qlora.yml
@@ -73,8 +73,7 @@ early_stopping_patience: 3
 resume_from_checkpoint:
 auto_resume_from_checkpoints: true
 logging_steps: 1
-xformers_attention: true
-flash_attention:
+attention: xformers
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
--- a/examples/falcon/config-7b.yml
+++ b/examples/falcon/config-7b.yml
@@ -40,8 +40,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-xformers_attention: true
-flash_attention:
+attention: xformers
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 40
--- a/examples/gemma/qlora.yml
+++ b/examples/gemma/qlora.yml
@@ -47,7 +47,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_ratio: 0.1
 evals_per_epoch: 4
--- a/examples/gemma2/qlora.yml
+++ b/examples/gemma2/qlora.yml
@@ -53,7 +53,8 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/gemma2/reward-model.yaml
+++ b/examples/gemma2/reward-model.yaml
@@ -43,7 +43,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/gemma3/gemma-3-1b-qlora.yml
+++ b/examples/gemma3/gemma-3-1b-qlora.yml
@@ -57,7 +57,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_ratio: 0.1
 evals_per_epoch:
--- a/examples/gemma3/gemma-3-4b-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-qlora.yml
@@ -28,7 +28,7 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
@@ -51,8 +51,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 logging_steps: 1
-flash_attention: true
-eager_attention:
+attention: flash

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/gemma3/gemma-3-4b-vision-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-vision-qlora.yml
@@ -30,7 +30,7 @@ pad_to_sequence_len: false
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
@@ -53,8 +53,7 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 logging_steps: 1
-flash_attention: true
-eager_attention:
+attention: flash

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/gptj/qlora.yml
+++ b/examples/gptj/qlora.yml
@@ -36,8 +36,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-xformers_attention: true
-flash_attention:
+attention: xformers
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 10
--- a/examples/jamba/qlora.yaml
+++ b/examples/jamba/qlora.yaml
@@ -47,7 +47,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch:
--- a/examples/jamba/qlora_deepspeed.yaml
+++ b/examples/jamba/qlora_deepspeed.yaml
@@ -46,7 +46,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch:
--- a/examples/jamba/qlora_fsdp_large.yaml
+++ b/examples/jamba/qlora_fsdp_large.yaml
@@ -45,7 +45,8 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: true
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 1
--- a/examples/jeopardy-bot/config.yml
+++ b/examples/jeopardy-bot/config.yml
@@ -37,8 +37,7 @@ bf16: auto
 tf32: true
 resume_from_checkpoint:
 logging_steps: 5
-xformers_attention: true
-flash_attention:
+attention: xformers
 gptq_groupsize:
 gptq_model_v1:
 warmup_steps: 20
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -42,7 +42,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
--- a/examples/llama-2/gptq-lora.yml
+++ b/examples/llama-2/gptq-lora.yml
@@ -53,9 +53,7 @@ tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention:
-sdp_attention:
-flash_optimum:
+attention: flash
 warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
--- a/examples/llama-2/lisa.yml
+++ b/examples/llama-2/lisa.yml
@@ -46,7 +46,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
--- a/examples/llama-2/loftq.yml
+++ b/examples/llama-2/loftq.yml
@@ -45,7 +45,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -45,7 +45,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-2/qlora-fsdp.yml
+++ b/examples/llama-2/qlora-fsdp.yml
@@ -48,7 +48,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -46,7 +46,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-2/relora.yml
+++ b/examples/llama-2/relora.yml
@@ -48,7 +48,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3-vision/lora-11b.yaml
+++ b/examples/llama-3-vision/lora-11b.yaml
@@ -29,7 +29,7 @@ pad_to_sequence_len: false
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
@@ -50,8 +50,7 @@ tf32: true

 gradient_checkpointing: true
 logging_steps: 1
-flash_attention: true
-eager_attention:
+attention: flash

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/llama-3/3b-qat-fsdp2.yaml
+++ b/examples/llama-3/3b-qat-fsdp2.yaml
@@ -1,79 +0,0 @@
-base_model: meta-llama/Llama-3.2-3B
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: false
-strict: false
-
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-
-liger_rope: true
-liger_rms_norm: true
-liger_glu_activation: true
-liger_layer_norm: true
-liger_fused_linear_cross_entropy: true
-
-datasets:
-  - path: yahma/alpaca-cleaned
-    type: alpaca
-
-output_dir: ./outputs/qat_out/
-
-sample_packing: true
-pad_to_sequence_len: true
-sequence_len: 512
-
-flex_attention: true
-flex_attn_compile_kwargs:
-  dynamic: false
-  mode: max-autotune-no-cudagraphs
-
-qat:
-  activation_dtype: int8
-  weight_dtype: int4
-  group_size: 32
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 1
-micro_batch_size: 16
-num_epochs: 1
-optimizer: adamw_torch_fused
-
-cosine_constant_lr_ratio: 0
-cosine_min_lr_ratio: 1.0
-learning_rate: 2e-5
-save_only_model: true
-bf16: true
-
-resume_from_checkpoint:
-logging_steps: 1
-
-evals_per_epoch: 1
-saves_per_epoch: 1
-
-warmup_steps: 10
-weight_decay: 0.0
-fsdp:
-  - full_shard
-  - auto_wrap
-
-fsdp_config:
-  fsdp_version: 2
-  fsdp_offload_params: false
-  fsdp_cpu_ram_efficient_loading: true
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
-  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_sharding_strategy: FULL_SHARD
-  fsdp_reshard_after_forward: true
-  fsdp_activation_checkpointing: true
-
-special_tokens:
-  pad_token: <|end_of_text|>
--- a/examples/llama-3/fft-8b-liger-fsdp.yaml
+++ b/examples/llama-3/fft-8b-liger-fsdp.yaml
@@ -49,7 +49,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 100
 evals_per_epoch: 2
--- a/examples/llama-3/fft-8b.yaml
+++ b/examples/llama-3/fft-8b.yaml
@@ -34,7 +34,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 100
 evals_per_epoch: 2
--- a/examples/llama-3/instruct-dpo-lora-8b.yml
+++ b/examples/llama-3/instruct-dpo-lora-8b.yml
@@ -5,10 +5,6 @@ tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name

-special_tokens:
-  pad_token: <|finetune_right_pad_id|>
-  eos_token: <|eot_id|>
-
 load_in_8bit: true
 load_in_4bit: false

@@ -65,7 +61,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/instruct-lora-8b.yml
+++ b/examples/llama-3/instruct-lora-8b.yml
@@ -56,7 +56,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b-deduplicate-dpo.yml
+++ b/examples/llama-3/lora-1b-deduplicate-dpo.yml
@@ -77,7 +77,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b-deduplicate-sft.yml
+++ b/examples/llama-3/lora-1b-deduplicate-sft.yml
@@ -53,7 +53,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b-kernels.yml
+++ b/examples/llama-3/lora-1b-kernels.yml
@@ -54,7 +54,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/lora-1b-ray.yml
+++ b/examples/llama-3/lora-1b-ray.yml
@@ -48,7 +48,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/lora-1b-sample-packing-sequentially.yml
+++ b/examples/llama-3/lora-1b-sample-packing-sequentially.yml
@@ -55,7 +55,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/lora-1b.yml
+++ b/examples/llama-3/lora-1b.yml
@@ -5,7 +5,7 @@ base_model: NousResearch/Llama-3.2-1B
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
-
+dataset_prepared_path: last_run_prepared
 val_set_size: 0.1
 output_dir: ./outputs/lora-out

@@ -38,7 +38,6 @@ wandb_log_model:
 gradient_accumulation_steps: 2
 micro_batch_size: 2
 num_epochs: 1
-
 optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
@@ -49,7 +48,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/lora-8b.yml
+++ b/examples/llama-3/lora-8b.yml
@@ -49,7 +49,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/qlora-1b-kto.yaml
+++ b/examples/llama-3/qlora-1b-kto.yaml
@@ -53,7 +53,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 20
 evals_per_epoch: 4
--- a/examples/llama-3/qlora-1b.yml
+++ b/examples/llama-3/qlora-1b.yml
@@ -51,7 +51,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/llama-3/qlora-fsdp-405b.yaml
+++ b/examples/llama-3/qlora-fsdp-405b.yaml
@@ -39,7 +39,8 @@ gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: true
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/qlora-fsdp-70b.yaml
+++ b/examples/llama-3/qlora-fsdp-70b.yaml
@@ -48,7 +48,8 @@ gradient_checkpointing_kwargs:
  use_reentrant: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-3/qlora.yml
+++ b/examples/llama-3/qlora.yml
@@ -46,7 +46,8 @@ tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-flash_attention: true
+attention: flash
+

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/llama-4/README.md
+++ b/examples/llama-4/README.md
@@ -34,5 +34,3 @@ We provide a script to delinearize Llama 4 linearized models into regular Huggin
 ```bash
 axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir
 ```
-
-Note: This only works with the non-quantized linearized model. If you have an adapter, merge it with the *non-quantized linearized* model before delinearizing.
--- a/examples/llava/lora-7b.yaml
+++ b/examples/llava/lora-7b.yaml
@@ -25,7 +25,7 @@ pad_to_sequence_len: false
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
@@ -46,8 +46,7 @@ tf32: true

 gradient_checkpointing: true
 logging_steps: 1
-flash_attention: true
-eager_attention:
+attention: flash

 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/magistral/README.md
+++ b/examples/magistral/README.md
@@ -1,71 +0,0 @@
-# Finetune Magistral Small with Axolotl
-
-Magistral Small is a 24B parameter opensource model from MistralAI found on [HuggingFace](https://huggingface.co/mistralai/Magistral-Small-2506). This guide shows how to fine-tune it with Axolotl with multi-turn conversations with proper masking.
-
-MistralAI has also released a proprietary medium-sized version called Magistral Medium.
-
-Thanks to the team at MistralAI for giving us early access to prepare for this release.
-
-## Getting started
-
-1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as Magistral is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
-
-    Here is an example of how to install from main for pip:
-
-```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 recommended)
-git clone https://github.com/axolotl-ai-cloud/axolotl.git
-cd axolotl
-
-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn,mistral]'
-```
-
-2. Download the example config:
-
-```bash
-axolotl fetch examples
-```
-
-3. Run the finetuning example:
-
-```bash
-axolotl train examples/magistral/magistral-small-qlora.yaml
-```
-
-This config uses about 24GB VRAM.
-
-Let us know how it goes. Happy finetuning! 🚀
-
-### TIPS
-
- For inference, the official MistralAI team recommends `top_p: 0.95` and `temperature: 0.7` with `max_tokens: 40960`.
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format is the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
-
-## Optimization Guides
-
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
-
-## Limitations
-
-We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only.
-
-The tokenizer does not work with `dataset.map` with multiprocessing, so we had to disable it. In addition, we do not support overriding tokens yet.
-
-## Related Resources
-
- [MistralAI Magistral Blog](https://mistral.ai/news/magistral/)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
-
-
-## Future Work
-
- Add parity to Preference Tuning, RL, Multi-modal, etc.
- Add parity to other tokenizer configs like overriding tokens.
--- a/examples/magistral/magistral-small-fsdp-qlora.yaml
+++ b/examples/magistral/magistral-small-fsdp-qlora.yaml
@@ -1,72 +0,0 @@
-base_model: mistralai/Magistral-Small-2506
-
-# Enable to use mistral-common tokenizer
-tokenizer_use_mistral_common: true
-
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: true
-
-datasets:
-  - path: fozziethebeat/alpaca_messages_2k_test
-    type: chat_template
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.1
-output_dir: ./outputs/lora-out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 2048
-sample_packing: true
-eval_sample_packing: false
-pad_to_sequence_len: true
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_target_modules:
-  - gate_proj
-  - down_proj
-  - up_proj
-  - q_proj
-  - v_proj
-  - k_proj
-  - o_proj
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_torch_fused
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing:
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
-
-fsdp:
-  - full_shard
-  - auto_wrap
-fsdp_config:
-  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer
-  fsdp_activation_checkpointing: true
--- a/examples/magistral/magistral-small-qlora.yaml
+++ b/examples/magistral/magistral-small-qlora.yaml
@@ -1,63 +0,0 @@
-base_model: mistralai/Magistral-Small-2506
-
-# Enable to use mistral-common tokenizer
-tokenizer_use_mistral_common: true
-
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: true
-
-datasets:
-  - path: fozziethebeat/alpaca_messages_2k_test
-    type: chat_template
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.1
-output_dir: ./outputs/lora-out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 2048
-sample_packing: true
-pad_to_sequence_len: true
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_target_modules:
-  - gate_proj
-  - down_proj
-  - up_proj
-  - q_proj
-  - v_proj
-  - k_proj
-  - o_proj
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
NanoCode012	ef883b6960	chore: refactor normalize_attn to use mapping and loop	2025-05-07 17:10:18 +07:00
NanoCode012	d0c4930dd5	fix: set replit mpt model to use eager attention	2025-05-07 17:10:18 +07:00
Wing Lian	6ee7cb30fa	fixes from PR feedback	2025-05-07 17:10:18 +07:00
Wing Lian	ba47adc24b	replace attention in the yaml config with an enum	2025-05-07 17:10:18 +07:00