fix attribute error

update more tests + better hqq validation
check if self.cfg.quantization exists when directly setting load_in_4bit
2025-04-21 22:29:24 -04:00 · 2025-04-21 22:17:08 -04:00 · 2025-04-21 21:42:23 -04:00 · 2025-04-21 21:32:01 -04:00 · 2025-04-21 21:22:44 -04:00 · 2025-04-21 17:17:41 -04:00
360 changed files with 8503 additions and 23155 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -16,63 +16,48 @@ on:
 jobs:
  build-base:
    if: github.repository_owner == 'axolotl-ai-cloud'
-    timeout-minutes: 480
    # this job needs to be run on self-hosted GPU runners...
-    runs-on: ubuntu-latest-m
+    runs-on: axolotl-gpu-runner
    strategy:
      fail-fast: false
      matrix:
        include:
+          - cuda: "124"
+            cuda_version: 12.4.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.4.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
          - cuda: "124"
            cuda_version: 12.4.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.5.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
          - cuda: "124"
            cuda_version: 12.4.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.6.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
          - cuda: "126"
            cuda_version: 12.6.3
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.6.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
-          - cuda: "126"
-            cuda_version: 12.6.3
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.7.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
-          - cuda: "128"
-            cuda_version: 12.6.3
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.7.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: nightly
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base-nightly"
-#          # "next" is for release candidates of pytorch
-#          - cuda: "128"
-#            cuda_version: 12.8.1
-#            cudnn_version: ""
-#            python_version: "3.11"
-#            pytorch: next
-#            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-#            dockerfile: "Dockerfile-base-next"
+          - cuda: "128"
+            cuda_version: 12.8.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: next
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -94,60 +79,7 @@ jobs:
        uses: docker/build-push-action@v4
        with:
          context: .
-          file: ./docker/${{ matrix.dockerfile }}
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-          labels: ${{ steps.metadata.outputs.labels }}
-          build-args: |
-            CUDA_VERSION=${{ matrix.cuda_version }}
-            CUDNN_VERSION=${{ matrix.cudnn_version }}
-            CUDA=${{ matrix.cuda }}
-            PYTHON_VERSION=${{ matrix.python_version }}
-            PYTORCH_VERSION=${{ matrix.pytorch }}
-            TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
-  build-base-uv:
-    if: github.repository_owner == 'axolotl-ai-cloud'
-    timeout-minutes: 480
-    runs-on: ubuntu-latest-m
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: "126"
-            cuda_version: 12.6.3
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.6.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.7.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Docker metadata
-        id: metadata
-        uses: docker/metadata-action@v5
-        with:
-          images: |
-            axolotlai/axolotl-base-uv
-      - name: Login to Docker Hub
-        uses: docker/login-action@v2
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Build
-        uses: docker/build-push-action@v4
-        with:
-          context: .
-          file: ./docker/${{ matrix.dockerfile }}
+          file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || matrix.pytorch == 'next' && './docker/Dockerfile-base-next' || './docker/Dockerfile-base' }}
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -9,7 +9,6 @@ on:
       - '.github/workflows/*.yml'
       - "*.[q]md"
       - "examples/**/*.y[a]?ml"
-       - ".pre-commit-config.yaml"
  workflow_dispatch:

 jobs:
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -18,24 +18,19 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.5.1
+            pytorch: 2.4.1
            axolotl_extras:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.5.1
+            axolotl_extras: vllm
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.6.0
            axolotl_extras: vllm
            is_latest: true
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
@@ -67,7 +62,6 @@ jobs:
            CUDA=${{ matrix.cuda }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
-            AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}
          file: ./docker/Dockerfile
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
@@ -83,6 +77,11 @@ jobs:
    strategy:
      matrix:
        include:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.4.1
+            axolotl_extras:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
@@ -94,16 +93,6 @@ jobs:
            pytorch: 2.6.0
            axolotl_extras:
            is_latest: true
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
@@ -149,7 +138,7 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.6.0
+            pytorch: 2.4.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -3,13 +3,11 @@ name: docker-multigpu-tests-biweekly
 on:
  pull_request:
    paths:
-      - 'tests/e2e/multigpu/**.py'
+      - 'tests/e2e/multigpu/*.py'
      - 'requirements.txt'
      - 'setup.py'
      - 'pyproject.toml'
      - '.github/workflows/multi-gpu-e2e.yml'
-      - 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
-      - 'src/axolotl/utils/distributed.py'
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * 1,4'  # Runs at 00:00 UTC every monday & thursday
@@ -36,15 +34,15 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.5.1
-            axolotl_extras:
+            pytorch: 2.4.1
+            axolotl_extras:  # no vllm support for 2.4.1
            num_gpus: 2
            nightly_build: "true"
-          - cuda: 126
-            cuda_version: 12.6.3
+          - cuda: 124
+            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras:
+            pytorch: 2.5.1
+            axolotl_extras: vllm
            num_gpus: 2
            nightly_build: "true"
    runs-on: [self-hosted, modal]
@@ -59,7 +57,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==0.71.8 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
@@ -69,7 +67,6 @@ jobs:
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
-          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
          modal run cicd.multigpu
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -12,6 +12,11 @@ jobs:
      fail-fast: false
      matrix:
        include:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.4.1
+            axolotl_extras:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
@@ -65,6 +70,11 @@ jobs:
    strategy:
      matrix:
        include:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.4.1
+            axolotl_extras:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
--- a/.github/workflows/precommit-autoupdate.yml
+++ b/.github/workflows/precommit-autoupdate.yml
@@ -25,6 +25,7 @@ jobs:
          pre-commit autoupdate
          if [[ -n $(git status --porcelain) ]]; then
            echo "changes=true" >> $GITHUB_OUTPUT
+            git diff .pre-commit-config.yaml > pre-commit-update.diff
          fi

      - name: Create Pull Request
@@ -38,3 +39,11 @@ jobs:
          commit-message: "chore: update pre-commit hooks"
          body: |
            Automated PR to update pre-commit hooks to their latest versions.
+
+            <details>
+            <summary>Changes:</summary>
+
+            ```diff
+            ${{ steps.update.outputs.diff }}
+            ```
+            </details>
--- a/.github/workflows/preview-docs.yml
+++ b/.github/workflows/preview-docs.yml
@@ -1,61 +0,0 @@
-name: Preview
-on:
-  workflow_dispatch:
-  pull_request:
-    types: [opened, synchronize, reopened]
-
-    # Run the workflow only when one of these files changes
-    paths:
-      - '**/*.md'      # any Markdown file
-      - '**/*.qmd'     # any Quarto file
-      - '_quarto.yaml'
-
-permissions:
-  checks: write
-  contents: write
-  deployments: write
-  issues: write
-  discussions: write
-  pages: write
-  pull-requests: write
-  statuses: write
-
-jobs:
-  preview:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Check out repository
-        uses: actions/checkout@v4
-
-      - name: Set up Quarto
-        uses: quarto-dev/quarto-actions/setup@v2
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Install dependencies
-        run: |
-          python3 -m pip install jupyter quartodoc
-          python3 -m pip install -e . --no-deps
-
-      - name: Build autodoc
-        run: quartodoc build
-
-      - name: Quarto render
-        run: quarto render
-
-      - name: Netlify Publish
-        uses: nwtgck/actions-netlify@v3.0
-        with:
-          publish-dir: './_site'
-          enable-pull-request-comment: true
-          enable-github-deployment: true
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          deploy-message: "Deployed On Netlify"
-          github-deployment-environment: 'preview'
-          github-deployment-description: 'Preview Deployment'
-        env:
-          NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
-          NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -18,102 +18,15 @@ jobs:
        env:
          SKIP: no-commit-to-branch

-  preload-cache:
-    name: Preload HF cache
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python_version: ["3.11"]
-        pytorch_version: ["2.6.0"]
-    timeout-minutes: 20
-
-    env:
-      AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
-
-    steps:
-      - name: Check out repository code
-        uses: actions/checkout@v4
-
-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python_version }}
-          cache: 'pip' # caching pip dependencies
-
-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
-
-      - name: Install PyTorch
-        run: |
-          pip3 install torch==${{ matrix.pytorch_version }}
-
-      - name: Install dependencies
-        run: |
-          pip3 show torch
-          pip3 install --no-build-isolation -U -e .
-          python scripts/unsloth_install.py | sh
-          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
-
-      - name: Make sure PyTorch version wasn't clobbered
-        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
-
-      - name: Ensure axolotl CLI was installed
-        run: |
-          axolotl --help
-
-      - name: Pre-Download dataset fixture
-        run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-
-      - name: Run tests
-        run: |
-          pytest -v tests/conftest.py
-
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
-        with:
-          token: ${{ secrets.CODECOV_TOKEN }}
-          files: ./coverage.xml
-          flags: unittests,pytorch-${{ matrix.pytorch_version }}
-          fail_ci_if_error: false
-
-      - name: cleanup pip cache
-        run: |
-          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
-
-      - name: Save HF cache
-        id: hf-cache
-        uses: actions/cache/save@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
-
  pytest:
    name: PyTest
    runs-on: ubuntu-latest
-    needs: [preload-cache]
    strategy:
      fail-fast: false
      max-parallel: 2
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
+        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
    timeout-minutes: 20

    steps:
@@ -193,6 +106,13 @@ jobs:
      fail-fast: false
      matrix:
        include:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.4.1
+            num_gpus: 1
+            axolotl_extras:
+            nightly_build: "true"
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
@@ -227,7 +147,6 @@ jobs:
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
-          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
          modal run cicd.e2e_tests
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -27,9 +27,6 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

-env:
-  TRANSFORMERS_IS_CI: "yes"
-
 jobs:
  pre-commit:
    name: pre-commit
@@ -47,23 +44,26 @@ jobs:
  pytest:
    name: PyTest
    runs-on: ubuntu-latest
-#    needs: [preload-cache]
    strategy:
      fail-fast: false
+      max-parallel: 2
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.5.1", "2.6.0", "2.7.1"]
+        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
    timeout-minutes: 20

    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

-      - name: Restore Cache from S3
-        id: hf-cache-restore-s3
-        run: |
-          mkdir -p /home/runner/.cache/huggingface/hub
-          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/  --use-compress-program unzstd
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-v2

      - name: Setup Python
        uses: actions/setup-python@v5
@@ -109,7 +109,6 @@ jobs:
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v5
        with:
-          token: ${{ secrets.CODECOV_TOKEN }}
          files: ./coverage.xml
          flags: unittests,pytorch-${{ matrix.pytorch_version }}
          fail_ci_if_error: false
@@ -118,25 +117,38 @@ jobs:
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

+      - name: Save HF cache
+        id: hf-cache
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
+
  pytest-sdist:
    name: PyTest from Source Dist
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
+      max-parallel: 1
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.5.1", "2.6.0", "2.7.1"]
+        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
    timeout-minutes: 20

    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

-      - name: Restore Cache from S3
-        id: hf-cache-restore-s3
-        run: |
-          mkdir -p /home/runner/.cache/huggingface/hub
-          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/  --use-compress-program unzstd
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-v2

      - name: Setup Python
        uses: actions/setup-python@v5
@@ -183,12 +195,20 @@ jobs:
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

+      - name: Save HF cache
+        id: hf-cache
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
+
  docker-e2e-tests-1st:
-    # Run this job first as a gate for running the remainder of the test matrix
    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
-    timeout-minutes: 120
+    timeout-minutes: 90
    needs: [pre-commit, pytest, pytest-sdist]

    strategy:
@@ -201,13 +221,6 @@ jobs:
            pytorch: 2.6.0
            num_gpus: 1
            axolotl_extras: vllm
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.6.0
-            num_gpus: 1
-            axolotl_extras:
-            dockerfile: "Dockerfile-uv.jinja"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -218,7 +231,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==0.71.8 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
@@ -228,8 +241,6 @@ jobs:
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
          modal run cicd.e2e_tests
@@ -238,9 +249,7 @@ jobs:
    if: github.repository_owner == 'axolotl-ai-cloud'
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
-    timeout-minutes: 120
-    # Only run the remainder of the matrix if the first e2e check passed;
-    # this is to save on wasted compute costs for known failures that get caught in the first run
+    timeout-minutes: 90
    needs: [pre-commit, pytest, docker-e2e-tests-1st]

    strategy:
@@ -250,67 +259,14 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.6.0
+            pytorch: 2.4.1
            num_gpus: 1
-            axolotl_extras: llmcompressor
+            axolotl_extras:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.5.1
            num_gpus: 1
-            axolotl_extras:
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.1
-            num_gpus: 1
-            axolotl_extras:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.7.1
-            num_gpus: 1
-            axolotl_extras:
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: Install Modal
-        run: |
-          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
-      - name: Update env vars
-        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
-          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
-          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
-          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
-          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
-          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
-          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
-      - name: Run tests job on Modal
-        run: |
-          modal run cicd.e2e_tests
-
-  docker-e2e-cleanup:
-    runs-on: [self-hosted, modal]
-    timeout-minutes: 90
-    needs: [docker-e2e-tests]
-
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            num_gpus: 1
            axolotl_extras: vllm
    steps:
      - name: Checkout
@@ -322,7 +278,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==0.71.8 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
@@ -332,7 +288,6 @@ jobs:
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
-          modal run cicd.cleanup
+          modal run cicd.e2e_tests
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,15 +19,15 @@ repos:
    hooks:
      - id: isort
 -   repo: https://github.com/PyCQA/flake8
-    rev: 7.2.0
+    rev: 7.1.2
    hooks:
    - id: flake8
 -   repo: https://github.com/pylint-dev/pylint
-    rev: v3.3.7
+    rev: v3.3.6
    hooks:
    - id: pylint
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.16.0
+    rev: v1.15.0
    hooks:
    - id: mypy
      additional_dependencies:
--- a/.runpod/.gitignore
+++ b/.runpod/.gitignore
@@ -1,161 +0,0 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-cover/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-
-# poetry
-#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-
-# pdm
-#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-#   in version control.
-#   https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
-# PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
-pod/scripts/config.yaml
--- a/.runpod/Dockerfile
+++ b/.runpod/Dockerfile
@@ -1,18 +0,0 @@
-FROM axolotlai/axolotl-cloud:main-py3.11-cu124-2.6.0
-
-COPY .runpod/requirements.txt /requirements.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install --upgrade pip && \
-    python3 -m pip install --upgrade -r /requirements.txt
-
-# Environment settings
-ARG BASE_VOLUME="/runpod-volume"
-ENV BASE_VOLUME=$BASE_VOLUME
-ENV HF_DATASETS_CACHE="${BASE_VOLUME}/huggingface-cache/datasets"
-ENV HUGGINGFACE_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
-ENV TRANSFORMERS_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
-
-COPY .runpod/src /src
-
-WORKDIR /src
-CMD ["python3", "/src/handler.py"]
--- a/.runpod/README.md
+++ b/.runpod/README.md
@@ -1,335 +0,0 @@
-<h1>LLM Post-Training - Full fine-tune, LoRA, QLoRA, etc. Llama/Mistral/Gemma and more</h1>
-
-# Configuration Options
-
-This document outlines all available configuration options for training models. The configuration can be provided as a JSON request.
-
-## Usage
-
-You can use these configuration options:
-
-1. As a JSON request body:
-
-```json
-{
-  "input": {
-    "user_id": "user",
-    "model_id": "model-name",
-    "run_id": "run-id",
-    "credentials": {
-      "wandb_api_key": "", # add your Weights & Biases key. TODO: you will be able to set this in environment variables.
-      "hf_token": "", # add your HF token. TODO: you will be able to set this in environment variables.
-    },
-    "args": {
-      "base_model": "NousResearch/Llama-3.2-1B",
-      // ... other options
-    }
-  }
-}
-```
-
-## Configuration Options
-
-### Model Configuration
-
-| Option              | Description                                                                                   | Default              |
-| ------------------- | --------------------------------------------------------------------------------------------- | -------------------- |
-| `base_model`        | Path to the base model (local or HuggingFace)                                                 | Required             |
-| `base_model_config` | Configuration path for the base model                                                         | Same as base_model   |
-| `revision_of_model` | Specific model revision from HuggingFace hub                                                  | Latest               |
-| `tokenizer_config`  | Custom tokenizer configuration path                                                           | Optional             |
-| `model_type`        | Type of model to load                                                                         | AutoModelForCausalLM |
-| `tokenizer_type`    | Type of tokenizer to use                                                                      | AutoTokenizer        |
-| `hub_model_id`      | Repository ID where the model will be pushed on Hugging Face Hub (format: username/repo-name) | Optional             |
-
-## Model Family Identification
-
-| Option                     | Default | Description                    |
-| -------------------------- | ------- | ------------------------------ |
-| `is_falcon_derived_model`  | `false` | Whether model is Falcon-based  |
-| `is_llama_derived_model`   | `false` | Whether model is LLaMA-based   |
-| `is_qwen_derived_model`    | `false` | Whether model is Qwen-based    |
-| `is_mistral_derived_model` | `false` | Whether model is Mistral-based |
-
-## Model Configuration Overrides
-
-| Option                                          | Default    | Description                        |
-| ----------------------------------------------- | ---------- | ---------------------------------- |
-| `overrides_of_model_config.rope_scaling.type`   | `"linear"` | RoPE scaling type (linear/dynamic) |
-| `overrides_of_model_config.rope_scaling.factor` | `1.0`      | RoPE scaling factor                |
-
-### Model Loading Options
-
-| Option         | Description                   | Default |
-| -------------- | ----------------------------- | ------- |
-| `load_in_8bit` | Load model in 8-bit precision | false   |
-| `load_in_4bit` | Load model in 4-bit precision | false   |
-| `bf16`         | Use bfloat16 precision        | false   |
-| `fp16`         | Use float16 precision         | false   |
-| `tf32`         | Use tensor float 32 precision | false   |
-
-## Memory and Device Settings
-
-| Option             | Default   | Description             |
-| ------------------ | --------- | ----------------------- |
-| `gpu_memory_limit` | `"20GiB"` | GPU memory limit        |
-| `lora_on_cpu`      | `false`   | Load LoRA on CPU        |
-| `device_map`       | `"auto"`  | Device mapping strategy |
-| `max_memory`       | `null`    | Max memory per device   |
-
-## Training Hyperparameters
-
-| Option                        | Default   | Description                 |
-| ----------------------------- | --------- | --------------------------- |
-| `gradient_accumulation_steps` | `1`       | Gradient accumulation steps |
-| `micro_batch_size`            | `2`       | Batch size per GPU          |
-| `eval_batch_size`             | `null`    | Evaluation batch size       |
-| `num_epochs`                  | `4`       | Number of training epochs   |
-| `warmup_steps`                | `100`     | Warmup steps                |
-| `warmup_ratio`                | `0.05`    | Warmup ratio                |
-| `learning_rate`               | `0.00003` | Learning rate               |
-| `lr_quadratic_warmup`         | `false`   | Quadratic warmup            |
-| `logging_steps`               | `null`    | Logging frequency           |
-| `eval_steps`                  | `null`    | Evaluation frequency        |
-| `evals_per_epoch`             | `null`    | Evaluations per epoch       |
-| `save_strategy`               | `"epoch"` | Checkpoint saving strategy  |
-| `save_steps`                  | `null`    | Saving frequency            |
-| `saves_per_epoch`             | `null`    | Saves per epoch             |
-| `save_total_limit`            | `null`    | Maximum checkpoints to keep |
-| `max_steps`                   | `null`    | Maximum training steps      |
-
-### Dataset Configuration
-
-```yaml
-datasets:
-  - path: vicgalle/alpaca-gpt4 # HuggingFace dataset or TODO: You will be able to add the local path.
-    type: alpaca # Format type (alpaca, gpteacher, oasst, etc.)
-    ds_type: json # Dataset type
-    data_files: path/to/data # Source data files
-    train_on_split: train # Dataset split to use
-```
-
-## Chat Template Settings
-
-| Option                   | Default                          | Description            |
-| ------------------------ | -------------------------------- | ---------------------- |
-| `chat_template`          | `"tokenizer_default"`            | Chat template type     |
-| `chat_template_jinja`    | `null`                           | Custom Jinja template  |
-| `default_system_message` | `"You are a helpful assistant."` | Default system message |
-
-## Dataset Processing
-
-| Option                        | Default                    | Description                       |
-| ----------------------------- | -------------------------- | --------------------------------- |
-| `dataset_prepared_path`       | `"data/last_run_prepared"` | Path for prepared dataset         |
-| `push_dataset_to_hub`         | `""`                       | Push dataset to HF hub            |
-| `dataset_processes`           | `4`                        | Number of preprocessing processes |
-| `dataset_keep_in_memory`      | `false`                    | Keep dataset in memory            |
-| `shuffle_merged_datasets`     | `true`                     | Shuffle merged datasets           |
-| `dataset_exact_deduplication` | `true`                     | Deduplicate datasets              |
-
-## LoRA Configuration
-
-| Option                     | Default                | Description                    |
-| -------------------------- | ---------------------- | ------------------------------ |
-| `adapter`                  | `"lora"`               | Adapter type (lora/qlora)      |
-| `lora_model_dir`           | `""`                   | Directory with pretrained LoRA |
-| `lora_r`                   | `8`                    | LoRA attention dimension       |
-| `lora_alpha`               | `16`                   | LoRA alpha parameter           |
-| `lora_dropout`             | `0.05`                 | LoRA dropout                   |
-| `lora_target_modules`      | `["q_proj", "v_proj"]` | Modules to apply LoRA          |
-| `lora_target_linear`       | `false`                | Target all linear modules      |
-| `peft_layers_to_transform` | `[]`                   | Layers to transform            |
-| `lora_modules_to_save`     | `[]`                   | Modules to save                |
-| `lora_fan_in_fan_out`      | `false`                | Fan in/out structure           |
-
-## Optimization Settings
-
-| Option                    | Default | Description                |
-| ------------------------- | ------- | -------------------------- |
-| `train_on_inputs`         | `false` | Train on input prompts     |
-| `group_by_length`         | `false` | Group by sequence length   |
-| `gradient_checkpointing`  | `false` | Use gradient checkpointing |
-| `early_stopping_patience` | `3`     | Early stopping patience    |
-
-## Learning Rate Scheduling
-
-| Option                     | Default    | Description          |
-| -------------------------- | ---------- | -------------------- |
-| `lr_scheduler`             | `"cosine"` | Scheduler type       |
-| `lr_scheduler_kwargs`      | `{}`       | Scheduler parameters |
-| `cosine_min_lr_ratio`      | `null`     | Minimum LR ratio     |
-| `cosine_constant_lr_ratio` | `null`     | Constant LR ratio    |
-| `lr_div_factor`            | `null`     | LR division factor   |
-
-## Optimizer Settings
-
-| Option                 | Default      | Description         |
-| ---------------------- | ------------ | ------------------- |
-| `optimizer`            | `"adamw_hf"` | Optimizer choice    |
-| `optim_args`           | `{}`         | Optimizer arguments |
-| `optim_target_modules` | `[]`         | Target modules      |
-| `weight_decay`         | `null`       | Weight decay        |
-| `adam_beta1`           | `null`       | Adam beta1          |
-| `adam_beta2`           | `null`       | Adam beta2          |
-| `adam_epsilon`         | `null`       | Adam epsilon        |
-| `max_grad_norm`        | `null`       | Gradient clipping   |
-
-## Attention Implementations
-
-| Option                     | Default | Description                   |
-| -------------------------- | ------- | ----------------------------- |
-| `flash_optimum`            | `false` | Use better transformers       |
-| `xformers_attention`       | `false` | Use xformers                  |
-| `flash_attention`          | `false` | Use flash attention           |
-| `flash_attn_cross_entropy` | `false` | Flash attention cross entropy |
-| `flash_attn_rms_norm`      | `false` | Flash attention RMS norm      |
-| `flash_attn_fuse_qkv`      | `false` | Fuse QKV operations           |
-| `flash_attn_fuse_mlp`      | `false` | Fuse MLP operations           |
-| `sdp_attention`            | `false` | Use scaled dot product        |
-| `s2_attention`             | `false` | Use shifted sparse attention  |
-
-## Tokenizer Modifications
-
-| Option           | Default | Description                  |
-| ---------------- | ------- | ---------------------------- |
-| `special_tokens` | -       | Special tokens to add/modify |
-| `tokens`         | `[]`    | Additional tokens            |
-
-## Distributed Training
-
-| Option                  | Default | Description           |
-| ----------------------- | ------- | --------------------- |
-| `fsdp`                  | `null`  | FSDP configuration    |
-| `fsdp_config`           | `null`  | FSDP config options   |
-| `deepspeed`             | `null`  | Deepspeed config path |
-| `ddp_timeout`           | `null`  | DDP timeout           |
-| `ddp_bucket_cap_mb`     | `null`  | DDP bucket capacity   |
-| `ddp_broadcast_buffers` | `null`  | DDP broadcast buffers |
-
-<details>
-<summary><h3>Example Configuration Request:</h3></summary>
-
-Here's a complete example for fine-tuning a LLaMA model using LoRA:
-
-```json
-{
-  "input": {
-    "user_id": "user",
-    "model_id": "llama-test",
-    "run_id": "test-run",
-    "credentials": {
-      "wandb_api_key": "",
-      "hf_token": ""
-    },
-    "args": {
-      "base_model": "NousResearch/Llama-3.2-1B",
-      "load_in_8bit": false,
-      "load_in_4bit": false,
-      "strict": false,
-      "datasets": [
-        {
-          "path": "teknium/GPT4-LLM-Cleaned",
-          "type": "alpaca"
-        }
-      ],
-      "dataset_prepared_path": "last_run_prepared",
-      "val_set_size": 0.1,
-      "output_dir": "./outputs/lora-out",
-      "adapter": "lora",
-      "sequence_len": 2048,
-      "sample_packing": true,
-      "eval_sample_packing": true,
-      "pad_to_sequence_len": true,
-      "lora_r": 16,
-      "lora_alpha": 32,
-      "lora_dropout": 0.05,
-      "lora_target_modules": [
-        "gate_proj",
-        "down_proj",
-        "up_proj",
-        "q_proj",
-        "v_proj",
-        "k_proj",
-        "o_proj"
-      ],
-      "gradient_accumulation_steps": 2,
-      "micro_batch_size": 2,
-      "num_epochs": 1,
-      "optimizer": "adamw_8bit",
-      "lr_scheduler": "cosine",
-      "learning_rate": 0.0002,
-      "train_on_inputs": false,
-      "group_by_length": false,
-      "bf16": "auto",
-      "tf32": false,
-      "gradient_checkpointing": true,
-      "logging_steps": 1,
-      "flash_attention": true,
-      "loss_watchdog_threshold": 5,
-      "loss_watchdog_patience": 3,
-      "warmup_steps": 10,
-      "evals_per_epoch": 4,
-      "saves_per_epoch": 1,
-      "weight_decay": 0,
-      "hub_model_id": "runpod/llama-fr-lora",
-      "wandb_name": "test-run-1",
-      "wandb_project": "test-run-1",
-      "wandb_entity": "axo-test",
-      "special_tokens": {
-        "pad_token": "<|end_of_text|>"
-      }
-    }
-  }
-}
-```
-
-</details>
-
-### Advanced Features
-
-#### Wandb Integration
-
-- `wandb_project`: Project name for Weights & Biases
-- `wandb_entity`: Team name in W&B
-- `wandb_watch`: Monitor model with W&B
-- `wandb_name`: Name of the W&B run
-- `wandb_run_id`: ID for the W&B run
-
-#### Performance Optimization
-
-- `sample_packing`: Enable efficient sequence packing
-- `eval_sample_packing`: Use sequence packing during evaluation
-- `torch_compile`: Enable PyTorch 2.0 compilation
-- `flash_attention`: Use Flash Attention implementation
-- `xformers_attention`: Use xFormers attention implementation
-
-### Available Optimizers
-
-The following optimizers are supported:
-
-- `adamw_hf`: HuggingFace's AdamW implementation
-- `adamw_torch`: PyTorch's AdamW
-- `adamw_torch_fused`: Fused AdamW implementation
-- `adamw_torch_xla`: XLA-optimized AdamW
-- `adamw_apex_fused`: NVIDIA Apex fused AdamW
-- `adafactor`: Adafactor optimizer
-- `adamw_anyprecision`: Anyprecision AdamW
-- `adamw_bnb_8bit`: 8-bit AdamW from bitsandbytes
-- `lion_8bit`: 8-bit Lion optimizer
-- `lion_32bit`: 32-bit Lion optimizer
-- `sgd`: Stochastic Gradient Descent
-- `adagrad`: Adagrad optimizer
-
-## Notes
-
-- Set `load_in_8bit: true` or `load_in_4bit: true` for memory-efficient training
-- Enable `flash_attention: true` for faster training on modern GPUs
-- Use `gradient_checkpointing: true` to reduce memory usage
-- Adjust `micro_batch_size` and `gradient_accumulation_steps` based on your GPU memory
-
-For more detailed information, please refer to the [documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html).
-
-### Errors:
-
-- If you face any issues with Flash Attention 2, delete your worker and restart.
--- a/.runpod/hub.json
+++ b/.runpod/hub.json
@@ -1,93 +0,0 @@
-{
-  "title": "Axolotl Fine-Tuning",
-  "description": "Serverless fine-tuning of open-source LLMs with Axolotl. Supports LoRA, QLoRA, DPO, and more using Hugging Face models and datasets.",
-  "type": "serverless",
-  "category": "language",
-  "iconUrl": "https://avatars.githubusercontent.com/u/167502477",
-  "config": {
-    "runsOn": "GPU",
-    "containerDiskInGb": 200,
-    "gpuCount": 1,
-    "allowedCudaVersions": [
-      "12.8",
-      "12.7",
-      "12.6",
-      "12.5",
-      "12.4"
-    ],
-    "presets": [],
-    "env": [
-      {
-        "key": "TOKENIZER",
-        "input": {
-          "name": "Tokenizer",
-          "type": "string",
-          "description": "Name or path of the Hugging Face tokenizer to use.",
-          "default": "",
-          "advanced": true
-        }
-      },
-      {
-        "key": "MAX_NUM_SEQS",
-        "input": {
-          "name": "Max Num Seqs",
-          "type": "number",
-          "description": "Maximum number of sequences per iteration.",
-          "default": 256,
-          "advanced": true
-        }
-      },
-      {
-        "key": "DISABLE_LOG_STATS",
-        "input": {
-          "name": "Disable Log Stats",
-          "type": "boolean",
-          "description": "Disable logging statistics.",
-          "default": false,
-          "trueValue": "true",
-          "falseValue": "false"
-        }
-      },
-      {
-        "key": "LOAD_FORMAT",
-        "input": {
-          "name": "Load Format",
-          "type": "string",
-          "description": "The format of the model weights to load.",
-          "default": "auto",
-          "options": [
-            {
-              "label": "auto",
-              "value": "auto"
-            },
-            {
-              "label": "pt",
-              "value": "pt"
-            },
-            {
-              "label": "safetensors",
-              "value": "safetensors"
-            },
-            {
-              "label": "npcache",
-              "value": "npcache"
-            },
-            {
-              "label": "dummy",
-              "value": "dummy"
-            },
-            {
-              "label": "tensorizer",
-              "value": "tensorizer"
-            },
-            {
-              "label": "bitsandbytes",
-              "value": "bitsandbytes"
-            }
-          ],
-          "advanced": true
-        }
-      }
-    ]
-  }
-}
--- a/.runpod/requirements.txt
+++ b/.runpod/requirements.txt
@@ -1,7 +0,0 @@
-# Required Python packages get listed here, one per line.
-# Recommended to lock the version number to avoid unexpected changes.
-
-# You can also install packages from a git repository, e.g.:
-# git+https://github.com/runpod/runpod-python.git
-# To learn more, see https://pip.pypa.io/en/stable/reference/requirements-file-format/
-runpod~=1.7.0
--- a/.runpod/src/config/config.yaml
+++ b/.runpod/src/config/config.yaml
@@ -1,573 +0,0 @@
-# # This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
-# # This can also be a relative path to a model on disk
-# base_model: ./llama-7b-hf
-# # You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
-# base_model_ignore_patterns:
-# # If the base_model repo on hf hub doesn't include configuration .json files,
-# # You can set that here, or leave this empty to default to base_model
-# base_model_config: ./llama-7b-hf
-# # You can specify to choose a specific model revision from huggingface hub
-# model_revision:
-# # Optional tokenizer configuration override in case you want to use a different tokenizer
-# # than the one defined in the base model
-# tokenizer_config:
-# # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
-# model_type: AutoModelForCausalLM
-# # Corresponding tokenizer for the model AutoTokenizer is a good choice
-# tokenizer_type: AutoTokenizer
-# # Trust remote code for untrusted source
-# trust_remote_code:
-# # use_fast option for tokenizer loading from_pretrained, default to True
-# tokenizer_use_fast:
-# # Whether to use the legacy tokenizer setting, defaults to True
-# tokenizer_legacy:
-# # Resize the model embeddings when new tokens are added to multiples of 32
-# # This is reported to improve training speed on some models
-# resize_token_embeddings_to_32x:
-
-# # Used to identify which model family the model is based on
-# is_falcon_derived_model:
-# is_llama_derived_model:
-# # Please note that if you set this to true, `padding_side` will be set to "left" by default
-# is_mistral_derived_model:
-# is_qwen_derived_model:
-
-# # optional overrides to the base model configuration
-# model_config:
-#   # RoPE Scaling https://github.com/huggingface/transformers/pull/24653
-#   rope_scaling:
-#     type: # linear | dynamic
-#     factor: # float
-
-
-# # Whether you are training a 4-bit GPTQ quantized model
-# gptq: true
-# gptq_groupsize: 128 # group size
-# gptq_model_v1: false # v1 or v2
-
-# # This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
-# load_in_8bit: true
-# # Use bitsandbytes 4 bit
-# load_in_4bit:
-
-# # Use CUDA bf16
-# bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
-# # Use CUDA fp16
-# fp16: true
-# # Use CUDA tf32
-# tf32: true # require >=ampere
-
-# # No AMP (automatic mixed precision)
-# bfloat16: true # require >=ampere
-# float16: true
-
-# # A list of one or more datasets to finetune the model with
-# datasets:
-#   # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
-#   - path: vicgalle/alpaca-gpt4
-#   # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
-#     type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
-#     ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
-#     data_files: # Optional[str] path to source data files
-#     shards: # Optional[int] number of shards to split data into
-#     name: # Optional[str] name of dataset configuration to load
-#     train_on_split: train # Optional[str] name of dataset split to load from
-
-#     # Optional[str] fastchat conversation type, only used with type: sharegpt
-#     conversation:  # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
-#     field_human: # Optional[str]. Human key to use for conversation.
-#     field_model: # Optional[str]. Assistant key to use for conversation.
-
-#   # Custom user prompt
-#   - path: repo
-#     type:
-#       # The below are defaults. only set what's needed.
-#       system_prompt: ""
-#       system_format: "{system}"
-#       field_system: system
-#       field_instruction: instruction
-#       field_input: input
-#       field_output: output
-
-#       # Customizable to be single line or multi-line
-#       # 'format' can include {input}
-#       format: |-
-#         User: {instruction} {input}
-#         Assistant:
-#       # 'no_input_format' cannot include {input}
-#       no_input_format: "{instruction} "
-
-#       # For `completion` datasets only, uses the provided field instead of `text` column
-#       field:
-
-# # Axolotl attempts to save the dataset as an arrow after packing the data together so
-# # subsequent training attempts load faster, relative path
-# dataset_prepared_path: data/last_run_prepared
-# # Push prepared dataset to hub
-# push_dataset_to_hub: # repo path
-# # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
-# # if not set.
-# dataset_processes: # defaults to os.cpu_count() if not set
-# # push checkpoints to hub
-# hub_model_id: # repo path to push finetuned model
-# # how to push checkpoints to hub
-# # https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
-# hub_strategy:
-# # Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
-# # Required to be true when used in combination with `push_dataset_to_hub`
-# hf_use_auth_token: # boolean
-# # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
-# val_set_size: 0.04
-# # Num shards for whole dataset
-# dataset_shard_num:
-# # Index of shard to use for whole dataset
-# dataset_shard_idx:
-
-# # The maximum length of an input to train with, this should typically be less than 2048
-# # as most models have a token/context limit of 2048
-# sequence_len: 2048
-# # Pad inputs so each step uses constant sized buffers
-# # This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
-# pad_to_sequence_len:
-# # Max sequence length to concatenate training samples together up to
-# # Inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
-# # FutureWarning: This will soon be DEPRECATED
-# max_packed_sequence_len: 1024
-# # Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
-# sample_packing:
-# # Set to 'false' if getting errors during eval with sample_packing on.
-# eval_sample_packing:
-# # You can set these packing optimizations AFTER starting a training at least once.
-# # The trainer will provide recommended values for these values.
-# sample_packing_eff_est:
-# total_num_tokens:
-
-# # If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
-# adapter: lora
-# # If you already have a lora model trained that you want to load, put that here.
-# # This means after training, if you want to test the model, you should set this to the value of `lora_out_dir`.
-# lora_model_dir:
-
-# # LoRA hyperparameters
-# # For more details about the following options, see:
-# # https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2
-# lora_r: 8
-# lora_alpha: 16
-# lora_dropout: 0.05
-# lora_target_modules:
-#   - q_proj
-#   - v_proj
-# #  - k_proj
-# #  - o_proj
-# #  - gate_proj
-# #  - down_proj
-# #  - up_proj
-# lora_target_linear: # If true, will target all linear layers
-
-# # If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
-# # For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
-# # `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
-# # https://github.com/huggingface/peft/issues/334#issuecomment-1561727994
-# lora_modules_to_save:
-# #  - embed_tokens
-# #  - lm_head
-
-# # Once you complete training, the model will be saved to the following directory.
-# # If you merge the adapter to the base model, a subdirectory `merged` will be created under this directory.
-# # Make sure `lora_model_dir` points to this directory if you want to use the trained model.
-# lora_out_dir:
-# lora_fan_in_fan_out: false
-
-# # ReLoRA configuration
-# # Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
-# relora_steps: # Number of steps per ReLoRA restart
-# relora_warmup_steps: # Number of per-restart warmup steps
-# relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings
-
-# # wandb configuration if you're using it
-# wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
-# wandb_project: # Your wandb project name
-# wandb_entity: # A wandb Team name if using a Team
-# wandb_watch:
-# wandb_run_id: # Set the name of your wandb run
-# wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training
-
-# # Where to save the full-finetuned model to
-# output_dir: ./completed-model
-
-# # Whether to use torch.compile and which backend to use
-# torch_compile:  # bool
-# torch_compile_backend:  # Optional[str]
-
-# # Training hyperparameters
-
-# # If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.
-# gradient_accumulation_steps: 1
-# # The number of samples to include in each batch. This is the number of samples sent to each GPU.
-# micro_batch_size: 2
-# eval_batch_size:
-# num_epochs: 4
-# warmup_steps: 100  # cannot use with warmup_ratio
-# warmup_ratio: 0.05  # cannot use with warmup_steps
-# learning_rate: 0.00003
-# lr_quadratic_warmup:
-# logging_steps:
-# save_strategy: # Set to `no` to skip checkpoint saves
-# save_steps: # Leave empty to save at each epoch
-# eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
-# save_total_limit: # Checkpoints saved at a time
-# # Maximum number of iterations to train for. It precedes num_epochs which means that
-# # if both are set, num_epochs will not be guaranteed.
-# # e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
-# max_steps:
-
-# eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
-# eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
-
-# # Save model as safetensors (require safetensors package)
-# save_safetensors:
-
-# # Whether to mask out or include the human's prompt from the training labels
-# train_on_inputs: false
-# # Group similarly sized data to minimize padding.
-# # May be slower to start, as it must download and sort the entire dataset.
-# # Note that training loss may have an oscillating pattern with this enabled.
-# group_by_length: false
-
-# # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
-# gradient_checkpointing: false
-
-# # Stop training after this many evaluation losses have increased in a row
-# # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
-# early_stopping_patience: 3
-
-# # Specify a scheduler and kwargs to use with the optimizer
-# lr_scheduler: # 'one_cycle' | empty for cosine
-# lr_scheduler_kwargs:
-
-# # For one_cycle optim
-# lr_div_factor: # Learning rate div factor
-
-# # Specify optimizer
-# # Valid values are driven by the Transformers OptimizerNames class, see:
-# # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
-# #
-# # Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
-# # torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
-# # in the examples/ for your model and fine-tuning use case.
-# #
-# # Valid values for 'optimizer' include:
-# # - adamw_hf
-# # - adamw_torch
-# # - adamw_torch_fused
-# # - adamw_torch_xla
-# # - adamw_apex_fused
-# # - adafactor
-# # - adamw_anyprecision
-# # - sgd
-# # - adagrad
-# # - adamw_bnb_8bit
-# # - lion_8bit
-# # - lion_32bit
-# # - paged_adamw_32bit
-# # - paged_adamw_8bit
-# # - paged_lion_32bit
-# # - paged_lion_8bit
-# optimizer:
-# # Specify weight decay
-# weight_decay:
-# # adamw hyperparams
-# adam_beta1:
-# adam_beta2:
-# adam_epsilon:
-# # Gradient clipping max norm
-# max_grad_norm:
-
-# # Augmentation techniques
-# # NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings
-# # currently only supported on Llama and Mistral
-# noisy_embedding_alpha:
-
-# # Whether to use BetterTransformer
-# flash_optimum:
-# # Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
-# xformers_attention:
-# # Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
-# flash_attention:
-# flash_attn_cross_entropy:  # Whether to use flash-attention cross entropy implementation - advanced use only
-# flash_attn_rms_norm:  # Whether to use flash-attention rms norm implementation - advanced use only
-# flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
-# flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
-# # Whether to use scaled-dot-product attention
-# # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
-# sdp_attention:
-# # Landmark attention (only llama)
-# landmark_attention:
-# # xpos RoPE see https://github.com/kaiokendev/cutoff-len-is-context-len/blob/main/util/xpos_rope_llama_monkey_patch.py
-# # LLaMA only
-# xpos_rope:
-
-# # Resume from a specific checkpoint dir
-# resume_from_checkpoint:
-# # If resume_from_checkpoint isn't set and you simply want it to start where it left off.
-# # Be careful with this being turned on between different models.
-# auto_resume_from_checkpoints: false
-
-# # Don't mess with this, it's here for accelerate and torchrun
-# local_rank:
-
-# # Add or change special tokens.
-# # If you add tokens here, you don't need to add them to the `tokens` list.
-# special_tokens:
-#   # bos_token: "<s>"
-#   # eos_token: "</s>"
-#   # unk_token: "<unk>"
-
-# # Add extra tokens.
-# tokens:
-
-# # FSDP
-# fsdp:
-# fsdp_config:
-
-# # Deepspeed config path. e.g., deepspeed/zero3.json
-# deepspeed:
-
-# # Advanced DDP Arguments
-# ddp_timeout:
-# ddp_bucket_cap_mb:
-# ddp_broadcast_buffers:
-
-# # Path to torch distx for optim 'adamw_anyprecision'
-# torchdistx_path:
-
-# # Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
-# pretraining_dataset:
-
-# # Debug mode
-# debug:
-
-# # Seed
-# seed:
-
-# # Allow overwrite yml config using from cli
-# strict:
-
-
-
-base_model: ${BASE_MODEL}
-base_model_ignore_patterns: ${BASE_MODEL_IGNORE_PATTERNS}
-base_model_config: ${BASE_MODEL_CONFIG}
-revision_of_model: ${REVISION_OF_MODEL}
-tokenizer_config: ${TOKENIZER_CONFIG}
-model_type: ${MODEL_TYPE}
-tokenizer_type: ${TOKENIZER_TYPE}
-trust_remote_code: ${TRUST_REMOTE_CODE}
-tokenizer_use_fast: ${TOKENIZER_USE_FAST}
-tokenizer_legacy: ${TOKENIZER_LEGACY}
-resize_token_embeddings_to_32x: ${RESIZE_TOKEN_EMBEDDINGS_TO_32X}
-
-is_falcon_derived_model: ${IS_FALCON_DERIVED_MODEL}
-is_llama_derived_model: ${IS_LLAMA_DERIVED_MODEL}
-is_qwen_derived_model: ${IS_QWEN_DERIVED_MODEL}
-is_mistral_derived_model: ${IS_MISTRAL_DERIVED_MODEL}
-
-overrides_of_model_config:
-  rope_scaling:
-    type: ${ROPE_SCALING_TYPE}
-    factor: ${ROPE_SCALING_FACTOR}
-
-bnb_config_kwargs:
-  llm_int8_has_fp16_weight: ${BNB_LLM_INT8_HAS_FP16_WEIGHT}
-  bnb_4bit_quant_type: ${BNB_4BIT_QUANT_TYPE}
-  bnb_4bit_use_double_quant: ${BNB_4BIT_USE_DOUBLE_QUANT}
-
-gptq: ${GPTQ}
-load_in_8bit: ${LOAD_IN_8BIT}
-load_in_4bit: ${LOAD_IN_4BIT}
-bf16: ${BF16}
-fp16: ${FP16}
-tf32: ${TF32}
-bfloat16: ${BFLOAT16}
-float16: ${FLOAT16}
-
-gpu_memory_limit: ${GPU_MEMORY_LIMIT}
-lora_on_cpu: ${LORA_ON_CPU}
-
-datasets:
-  - path: ${DATASET_PATH}
-    type: ${DATASET_TYPE}
-    ds_type: ${DATASET_DS_TYPE}
-    data_files: ${DATASET_DATA_FILES}
-    shards: ${DATASET_SHARDS}
-    name: ${DATASET_NAME}
-    train_on_split: ${DATASET_TRAIN_ON_SPLIT}
-    revision: ${DATASET_REVISION}
-    trust_remote_code: ${DATASET_TRUST_REMOTE_CODE}
-
-rl: ${RL}
-dpo_use_weighting: ${DPO_USE_WEIGHTING}
-
-chat_template: ${CHAT_TEMPLATE}
-chat_template_jinja: ${CHAT_TEMPLATE_JINJA}
-default_system_message: ${DEFAULT_SYSTEM_MESSAGE}
-dataset_prepared_path: ${DATASET_PREPARED_PATH}
-push_dataset_to_hub: ${PUSH_DATASET_TO_HUB}
-dataset_processes: ${DATASET_PROCESSES}
-dataset_keep_in_memory: ${DATASET_KEEP_IN_MEMORY}
-hub_model_id: ${HUB_MODEL_ID}
-hub_strategy: ${HUB_STRATEGY}
-hf_use_auth_token: ${HF_USE_AUTH_TOKEN}
-val_set_size: ${VAL_SET_SIZE}
-dataset_shard_num: ${DATASET_SHARD_NUM}
-dataset_shard_idx: ${DATASET_SHARD_IDX}
-
-sequence_len: ${SEQUENCE_LEN}
-pad_to_sequence_len: ${PAD_TO_SEQUENCE_LEN}
-sample_packing: ${SAMPLE_PACKING}
-eval_sample_packing: ${EVAL_SAMPLE_PACKING}
-sample_packing_eff_est: ${SAMPLE_PACKING_EFF_EST}
-total_num_tokens: ${TOTAL_NUM_TOKENS}
-sample_packing_group_size: ${SAMPLE_PACKING_GROUP_SIZE}
-sample_packing_bin_size: ${SAMPLE_PACKING_BIN_SIZE}
-
-batch_flattening: ${BATCH_FLATTENING}
-device_map: ${DEVICE_MAP}
-max_memory: ${MAX_MEMORY}
-
-adapter: ${ADAPTER}
-lora_model_dir: ${LORA_MODEL_DIR}
-
-lora_r: ${LORA_R}
-lora_alpha: ${LORA_ALPHA}
-lora_dropout: ${LORA_DROPOUT}
-lora_target_modules:
-  - ${LORA_TARGET_MODULES}
-lora_target_linear: ${LORA_TARGET_LINEAR}
-peft_layers_to_transform: ${PEFT_LAYERS_TO_TRANSFORM}
-lora_modules_to_save: ${LORA_MODULES_TO_SAVE}
-lora_fan_in_fan_out: ${LORA_FAN_IN_FAN_OUT}
-
-loraplus_lr_ratio: ${LORAPLUS_LR_RATIO}
-loraplus_lr_embedding: ${LORAPLUS_LR_EMBEDDING}
-
-peft:
-  loftq_config:
-    loftq_bits: ${LOFTQ_BITS}
-
-relora_steps: ${RELORA_STEPS}
-relora_warmup_steps: ${RELORA_WARMUP_STEPS}
-relora_anneal_steps: ${RELORA_ANNEAL_STEPS}
-relora_prune_ratio: ${RELORA_PRUNE_RATIO}
-relora_cpu_offload: ${RELORA_CPU_OFFLOAD}
-
-wandb_mode: ${WANDB_MODE}
-wandb_project: ${WANDB_PROJECT}
-wandb_entity: ${WANDB_ENTITY}
-wandb_watch: ${WANDB_WATCH}
-wandb_name: ${WANDB_NAME}
-wandb_run_id: ${WANDB_RUN_ID}
-wandb_log_model: ${WANDB_LOG_MODEL}
-
-mlflow_tracking_uri: ${MLFLOW_TRACKING_URI}
-mlflow_experiment_name: ${MLFLOW_EXPERIMENT_NAME}
-mlflow_run_name: ${MLFLOW_RUN_NAME}
-hf_mlflow_log_artifacts: ${HF_MLFLOW_LOG_ARTIFACTS}
-
-use_comet: ${USE_COMET}
-comet_api_key: ${COMET_API_KEY}
-comet_workspace: ${COMET_WORKSPACE}
-comet_project_name: ${COMET_PROJECT_NAME}
-comet_experiment_key: ${COMET_EXPERIMENT_KEY}
-comet_mode: ${COMET_MODE}
-comet_online: ${COMET_ONLINE}
-comet_experiment_config: ${COMET_EXPERIMENT_CONFIG}
-
-output_dir: ${OUTPUT_DIR}
-
-torch_compile: ${TORCH_COMPILE}
-torch_compile_backend: ${TORCH_COMPILE_BACKEND}
-
-gradient_accumulation_steps: ${GRADIENT_ACCUMULATION_STEPS}
-micro_batch_size: ${MICRO_BATCH_SIZE}
-eval_batch_size: ${EVAL_BATCH_SIZE}
-num_epochs: ${NUM_EPOCHS}
-warmup_steps: ${WARMUP_STEPS}
-warmup_ratio: ${WARMUP_RATIO}
-learning_rate: ${LEARNING_RATE}
-lr_quadratic_warmup: ${LR_QUADRATIC_WARMUP}
-logging_steps: ${LOGGING_STEPS}
-eval_steps: ${EVAL_STEPS}
-evals_per_epoch: ${EVALS_PER_EPOCH}
-save_strategy: ${SAVE_STRATEGY}
-save_steps: ${SAVE_STEPS}
-saves_per_epoch: ${SAVES_PER_EPOCH}
-save_total_limit: ${SAVE_TOTAL_LIMIT}
-max_steps: ${MAX_STEPS}
-
-eval_table_size: ${EVAL_TABLE_SIZE}
-eval_max_new_tokens: ${EVAL_MAX_NEW_TOKENS}
-eval_causal_lm_metrics: ${EVAL_CAUSAL_LM_METRICS}
-
-profiler_steps: ${PROFILER_STEPS}
-loss_watchdog_threshold: ${LOSS_WATCHDOG_THRESHOLD}
-loss_watchdog_patience: ${LOSS_WATCHDOG_PATIENCE}
-
-save_safetensors: ${SAVE_SAFETENSORS}
-train_on_inputs: ${TRAIN_ON_INPUTS}
-group_by_length: ${GROUP_BY_LENGTH}
-gradient_checkpointing: ${GRADIENT_CHECKPOINTING}
-early_stopping_patience: ${EARLY_STOPPING_PATIENCE}
-
-lr_scheduler: ${LR_SCHEDULER}
-lr_scheduler_kwargs: ${LR_SCHEDULER_KWARGS}
-cosine_min_lr_ratio: ${COSINE_MIN_LR_RATIO}
-cosine_constant_lr_ratio: ${COSINE_CONSTANT_LR_RATIO}
-lr_div_factor: ${LR_DIV_FACTOR}
-
-optimizer: ${OPTIMIZER}
-optim_args: ${OPTIM_ARGS}
-optim_target_modules: ${OPTIM_TARGET_MODULES}
-weight_decay: ${WEIGHT_DECAY}
-adam_beta1: ${ADAM_BETA1}
-adam_beta2: ${ADAM_BETA2}
-adam_epsilon: ${ADAM_EPSILON}
-max_grad_norm: ${MAX_GRAD_NORM}
-
-neftune_noise_alpha: ${NEFTUNE_NOISE_ALPHA}
-
-flash_optimum: ${FLASH_OPTIMUM}
-xformers_attention: ${XFORMERS_ATTENTION}
-flash_attention: ${FLASH_ATTENTION}
-flash_attn_cross_entropy: ${FLASH_ATTN_CROSS_ENTROPY}
-flash_attn_rms_norm: ${FLASH_ATTN_RMS_NORM}
-flash_attn_fuse_qkv: ${FLASH_ATTN_FUSE_QKV}
-flash_attn_fuse_mlp: ${FLASH_ATTN_FUSE_MLP}
-sdp_attention: ${SDP_ATTENTION}
-s2_attention: ${S2_ATTENTION}
-resume_from_checkpoint: ${RESUME_FROM_CHECKPOINT}
-auto_resume_from_checkpoints: ${AUTO_RESUME_FROM_CHECKPOINTS}
-
-local_rank: ${LOCAL_RANK}
-
-special_tokens:
-  bos_token: ${SPECIAL_TOKEN_BOS}
-  eos_token: ${SPECIAL_TOKEN_EOS}
-  unk_token: ${SPECIAL_TOKEN_UNK}
-  pad_token: ${SPECIAL_TOKEN_PAD}
-
-tokens: ${TOKENS}
-
-fsdp: ${FSDP}
-fsdp_config: ${FSDP_CONFIG}
-deepspeed: ${DEEPSPEED}
-
-ddp_timeout: ${DDP_TIMEOUT}
-ddp_bucket_cap_mb: ${DDP_BUCKET_CAP_MB}
-ddp_broadcast_buffers: ${DDP_BROADCAST_BUFFERS}
-
-torchdistx_path: ${TORCHDISTX_PATH}
-pretraining_dataset: ${PRETRAINING_DATASET}
-debug: ${DEBUG}
-seed: ${SEED}
-strict: ${STRICT}
--- a/.runpod/src/handler.py
+++ b/.runpod/src/handler.py
@@ -1,66 +0,0 @@
-"""
-Runpod serverless entrypoint handler
-"""
-
-import os
-
-import runpod
-import yaml
-from huggingface_hub._login import login
-from train import train
-from utils import get_output_dir
-
-BASE_VOLUME = os.environ.get("BASE_VOLUME", "/runpod-volume")
-if not os.path.exists(BASE_VOLUME):
-    os.makedirs(BASE_VOLUME)
-
-logger = runpod.RunPodLogger()
-
-
-async def handler(job):
-    runpod_job_id = job["id"]
-    inputs = job["input"]
-    run_id = inputs.get("run_id", "default_run_id")
-    args = inputs.get("args", {})
-
-    # Set output directory
-    output_dir = os.path.join(BASE_VOLUME, get_output_dir(run_id))
-    args["output_dir"] = output_dir
-
-    # First save args to a temporary config file
-    config_path = "/workspace/test_config.yaml"
-
-    # Add run_name and job_id to args before saving
-    args["run_name"] = run_id
-    args["runpod_job_id"] = runpod_job_id
-
-    yaml_data = yaml.dump(args, default_flow_style=False)
-    with open(config_path, "w", encoding="utf-8") as file:
-        file.write(yaml_data)
-
-    # Handle credentials
-    credentials = inputs.get("credentials", {})
-
-    if "wandb_api_key" in credentials:
-        os.environ["WANDB_API_KEY"] = credentials["wandb_api_key"]
-    if "hf_token" in credentials:
-        os.environ["HF_TOKEN"] = credentials["hf_token"]
-
-    if os.environ.get("HF_TOKEN"):
-        login(token=os.environ["HF_TOKEN"])
-    else:
-        logger.info("No HF_TOKEN provided. Skipping login.")
-
-    logger.info("Starting Training.")
-    async for result in train(config_path):  # Pass the config path instead of args
-        logger.info(result)
-    logger.info("Training Complete.")
-
-    # Cleanup
-    if "WANDB_API_KEY" in os.environ:
-        del os.environ["WANDB_API_KEY"]
-    if "HF_TOKEN" in os.environ:
-        del os.environ["HF_TOKEN"]
-
-
-runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})
--- a/.runpod/src/test_input.json
+++ b/.runpod/src/test_input.json
@@ -1,61 +0,0 @@
-{
-  "input": {
-    "user_id": "user",
-    "model_id": "llama-test",
-    "run_id": "llama-test",
-    "credentials": {
-      "wandb_api_key": "",
-      "hf_token": ""
-    },
-    "args": {
-      "base_model": "NousResearch/Meta-Llama-3-8B",
-      "model_type": "LlamaForCausalLM",
-      "tokenizer_type": "AutoTokenizer",
-      "load_in_8bit": true,
-      "load_in_4bit": false,
-      "strict": false,
-      "datasets": [
-        {
-          "path": "mhenrichsen/alpaca_2k_test",
-          "type": "alpaca"
-        }
-      ],
-      "val_set_size": 0.05,
-      "output_dir": "./outputs/lora-out",
-      "sequence_len": 4096,
-      "sample_packing": true,
-      "eval_sample_packing": false,
-      "pad_to_sequence_len": true,
-      "adapter": "lora",
-      "lora_r": 32,
-      "lora_alpha": 16,
-      "lora_dropout": 0.05,
-      "lora_target_linear": true,
-      "lora_modules_to_save": [
-        "embed_tokens",
-        "lm_head"
-      ],
-      "gradient_accumulation_steps": 4,
-      "micro_batch_size": 2,
-      "num_epochs": 1,
-      "optimizer": "adamw_bnb_8bit",
-      "lr_scheduler": "cosine",
-      "learning_rate": 0.0002,
-      "train_on_inputs": false,
-      "group_by_length": false,
-      "bf16": "auto",
-      "tf32": false,
-      "gradient_checkpointing": true,
-      "logging_steps": 1,
-      "flash_attention": true,
-      "warmup_steps": 1,
-      "evals_per_epoch": 1,
-      "eval_max_new_tokens": 128,
-      "saves_per_epoch": 1,
-      "weight_decay": 0.0,
-      "special_tokens": {
-        "pad_token": "<|end_of_text|>"
-      }
-    }
-  }
-}
--- a/.runpod/src/train.py
+++ b/.runpod/src/train.py
@@ -1,45 +0,0 @@
-"""
-Runpod train entrypoint
-"""
-
-import asyncio
-
-
-async def train(config_path: str, gpu_id: str = "0", preprocess: bool = True):
-    """
-    Run preprocessing (if enabled) and training with the given config file
-    :param config_path: Path to the YAML config file
-    :param gpu_id: GPU ID to use (default: "0")
-    :param preprocess: Whether to run preprocessing (default: True)
-
-    """
-    # First check if preprocessing is needed
-    if preprocess:
-        # Preprocess command
-        preprocess_cmd = (
-            f"CUDA_VISIBLE_DEVICES={gpu_id} axolotl preprocess {config_path}"
-        )
-        process = await asyncio.create_subprocess_shell(
-            preprocess_cmd,
-            stdout=asyncio.subprocess.PIPE,
-            stderr=asyncio.subprocess.STDOUT,
-        )
-
-        if process.stdout is not None:
-            async for line in process.stdout:
-                yield f"Preprocessing: {line.decode().strip()}"
-        await process.wait()
-        yield "Preprocessing completed."
-    else:
-        yield "Skipping preprocessing step."
-
-    # Training command
-    train_cmd = f"axolotl train {config_path}"
-    process = await asyncio.create_subprocess_shell(
-        train_cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT
-    )
-
-    if process.stdout is not None:
-        async for line in process.stdout:
-            yield f"Training: {line.decode().strip()}"
-    await process.wait()
--- a/.runpod/src/utils.py
+++ b/.runpod/src/utils.py
@@ -1,89 +0,0 @@
-"""
-Runpod launcher utils
-"""
-
-import os
-
-import yaml
-
-
-def get_output_dir(run_id):
-    path = f"fine-tuning/{run_id}"
-    return path
-
-
-def make_valid_config(input_args):
-    """
-    Creates and saves updated config file, returns the path to the new config
-    :param input_args: dict of input args
-    :return: str, path to the updated config file
-    """
-    # Load default config
-    with open("config/config.yaml", "r", encoding="utf-8") as fin:
-        all_args = yaml.safe_load(fin)
-
-    if not input_args:
-        print("No args provided, using defaults")
-    else:
-        all_args.update(input_args)
-
-    # Create updated config path
-    updated_config_path = "config/updated_config.yaml"
-
-    # Save updated config to new file
-    with open(updated_config_path, "w", encoding="utf-8") as f:
-        yaml.dump(all_args, f)
-
-    return updated_config_path
-
-
-def set_config_env_vars(args: dict):
-    """
-    Convert API arguments into environment variables.
-    Handles nested dictionaries, lists, and special values.
-
-    Args:
-        args (dict): The arguments dictionary from the API request
-    """
-
-    def process_value(value):
-        """Convert Python values to string format for environment variables"""
-        if value is None:
-            return ""
-        if isinstance(value, bool):
-            return str(value).lower()
-        if isinstance(value, (list, dict)):
-            return str(value)
-        return str(value)
-
-    def set_env_vars(data, prefix=""):
-        """Recursively set environment variables from nested dictionary"""
-        for key, value in data.items():
-            env_key = prefix + key.upper()
-
-            # Handle special cases
-            if isinstance(value, dict):
-                # For nested dictionaries (like special_tokens)
-                set_env_vars(value, f"{env_key}_")
-            elif isinstance(value, list):
-                # Handle list of dictionaries (like datasets)
-                if value and isinstance(value[0], dict):
-                    for i, item in enumerate(value):
-                        set_env_vars(item, f"{env_key}_{i}_")
-                else:
-                    # For simple lists (like lora_target_modules)
-                    os.environ[env_key] = process_value(value)
-            else:
-                # Handle all other cases
-                os.environ[env_key] = process_value(value)
-
-    # Clear any existing related environment variables
-    # This prevents old values from persisting
-    for key in list(os.environ.keys()):
-        if key.startswith(
-            ("BASE_MODEL", "MODEL_TYPE", "TOKENIZER_TYPE", "DATASET", "LORA_", "WANDB_")
-        ):
-            del os.environ[key]
-
-    # Set new environment variables
-    set_env_vars(args)
--- a/.runpod/test-input.json
+++ b/.runpod/test-input.json
@@ -1,86 +0,0 @@
-{
-  "input": {
-    "name": "quick_smoke_test_sft",
-    "user_id": "user",
-    "model_id": "llama-test",
-    "run_id": "llama-test",
-    "credentials": {
-      "wandb_api_key": "",
-      "hf_token": ""
-    },
-    "args": {
-      "base_model": "HuggingFaceTB/SmolLM2-135M",
-      "model_type": "AutoModelForCausalLM",
-      "tokenizer_type": "AutoTokenizer",
-      "load_in_4bit": true,
-      "strict": false,
-      "datasets": [
-        {
-          "path": "mhenrichsen/alpaca_2k_test",
-          "type": "alpaca",
-          "split": "train[:10%]"
-        }
-      ],
-      "val_set_size": 0.02,
-      "output_dir": "./outputs/lora-out",
-      "sequence_len": 4096,
-      "sample_packing": true,
-      "eval_sample_packing": false,
-      "pad_to_sequence_len": true,
-      "adapter": "qlora",
-      "lora_r": 32,
-      "lora_alpha": 64,
-      "lora_dropout": 0.05,
-      "lora_target_linear": true,
-      "lora_modules_to_save": [
-        "embed_tokens",
-        "lm_head"
-      ],
-      "gradient_accumulation_steps": 2,
-      "micro_batch_size": 1,
-      "num_epochs": 1,
-      "optimizer": "adamw_torch_fused",
-      "lr_scheduler": "cosine",
-      "learning_rate": 0.0002,
-      "train_on_inputs": false,
-      "group_by_length": false,
-      "bf16": "auto",
-      "tf32": true,
-      "gradient_checkpointing": true,
-      "logging_steps": 1,
-      "flash_attention": true,
-      "warmup_steps": 1,
-      "evals_per_epoch": 1,
-      "eval_max_new_tokens": 128,
-      "saves_per_epoch": 1,
-      "weight_decay": 0.0,
-      "special_tokens": {
-        "pad_token": "<|endoftext|>"
-      },
-      "max_steps": 20
-    },
-    "timeout": 100000
-  },
-  "config": {
-    "gpuTypeId": "NVIDIA GeForce RTX 4090",
-    "gpuCount": 1,
-    "containerDiskInGb": 200,
-    "env": [
-      {
-        "key": "TOKENIZER",
-        "value": ""
-      },
-      {
-        "key": "DISABLE_LOG_STATS",
-        "value": "true"
-      }
-    ],
-    "allowedCudaVersions": [
-      "12.8",
-      "12.7",
-      "12.6",
-      "12.5",
-      "12.4"
-    ]
-  }
-}
--- a/.runpod/tests.json
+++ b/.runpod/tests.json
@@ -1,90 +0,0 @@
-{
-  "tests": [
-    {
-      "name": "quick_smoke_test_sft",
-      "input": {
-        "user_id": "user",
-        "model_id": "llama-test",
-        "run_id": "llama-test",
-        "credentials": {
-          "wandb_api_key": "",
-          "hf_token": ""
-        },
-        "args": {
-          "base_model": "HuggingFaceTB/SmolLM2-135M",
-          "model_type": "AutoModelForCausalLM",
-          "tokenizer_type": "AutoTokenizer",
-          "load_in_4bit": true,
-          "strict": false,
-          "datasets": [
-            {
-              "path": "mhenrichsen/alpaca_2k_test",
-              "type": "alpaca",
-              "split": "train[:10%]"
-            }
-          ],
-          "val_set_size": 0.02,
-          "output_dir": "./outputs/lora-out",
-          "sequence_len": 4096,
-          "sample_packing": true,
-          "eval_sample_packing": false,
-          "pad_to_sequence_len": true,
-          "adapter": "qlora",
-          "lora_r": 32,
-          "lora_alpha": 64,
-          "lora_dropout": 0.05,
-          "lora_target_linear": true,
-          "lora_modules_to_save": [
-            "embed_tokens",
-            "lm_head"
-          ],
-          "gradient_accumulation_steps": 2,
-          "micro_batch_size": 1,
-          "num_epochs": 1,
-          "optimizer": "adamw_torch_fused",
-          "lr_scheduler": "cosine",
-          "learning_rate": 0.0002,
-          "train_on_inputs": false,
-          "group_by_length": false,
-          "bf16": "auto",
-          "tf32": true,
-          "gradient_checkpointing": true,
-          "logging_steps": 1,
-          "flash_attention": true,
-          "warmup_steps": 1,
-          "evals_per_epoch": 1,
-          "eval_max_new_tokens": 128,
-          "saves_per_epoch": 1,
-          "weight_decay": 0.0,
-          "special_tokens": {
-            "pad_token": "<|endoftext|>"
-          },
-          "max_steps": 20
-        }
-      },
-      "timeout": 100000
-    }
-  ],
-  "config": {
-    "gpuTypeId": "NVIDIA GeForce RTX 4090",
-    "gpuCount": 1,
-    "containerDiskInGb": 200,
-    "env": [
-      {
-        "key": "TOKENIZER",
-        "value": ""
-      },
-      {
-        "key": "DISABLE_LOG_STATS",
-        "value": "true"
-      }
-    ],
-    "allowedCudaVersions": [
-      "12.8",
-      "12.7",
-      "12.6",
-      "12.5",
-      "12.4"
-    ]
-  }
-}
--- a/README.md
+++ b/README.md
@@ -22,32 +22,28 @@
    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
 </p>

-
-## 🎉 Latest Updates
-
- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral) to start training your own Magistral models with Axolotl!
- 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
- 2025/04: Llama 4 support has been added in Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-4) to start training your own Llama 4 models with Axolotl's linearized version!
- 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning.
- 2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the [docs](https://docs.axolotl.ai/docs/multimodal.html) to fine-tune your own!
- 2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the [docs](https://docs.axolotl.ai/docs/lora_optims.html) to give it a try.
- 2025/02: Axolotl has added GRPO support. Dive into our [blog](https://huggingface.co/blog/axolotl-ai-co/training-llms-w-interpreter-feedback-wasm) and [GRPO example](https://github.com/axolotl-ai-cloud/grpo_code) and have some fun!
- 2025/01: Axolotl has added Reward Modelling / Process Reward Modelling fine-tuning support. See [docs](https://docs.axolotl.ai/docs/reward_modelling.html).
-
-## ✨ Overview
-
 Axolotl is a tool designed to streamline post-training for various AI models.
+Post-training refers to any modifications or additional training performed on
+pre-trained models - including full model fine-tuning, parameter-efficient tuning (like
+LoRA and QLoRA), supervised fine-tuning (SFT), instruction tuning, and alignment
+techniques. With support for multiple model architectures and training configurations,
+Axolotl makes it easy to get started with these techniques.
+
+Axolotl is designed to work with YAML config files that contain everything you need to
+preprocess a dataset, train or fine-tune a model, run model inference or evaluation,
+and much more.

 Features:

- **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more. We are compatible with HuggingFace transformers causal language models.
- **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO), Multimodal, and Reward Modelling (RM) / Process Reward Modelling (PRM).
- **Easy Configuration**: Re-use a single YAML file between dataset preprocess, training, evaluation, quantization, and inference.
- **Performance Optimizations**: [Multipacking](https://docs.axolotl.ai/docs/multipack.html), [Flash Attention](https://github.com/Dao-AILab/flash-attention), [Xformers](https://github.com/facebookresearch/xformers), [Flex Attention](https://pytorch.org/blog/flexattention/), [Liger Kernel](https://github.com/linkedin/Liger-Kernel), [Cut Cross Entropy](https://github.com/apple/ml-cross-entropy/tree/main), Sequence Parallelism (SP), LoRA optimizations, Multi-GPU training (FSDP1, FSDP2, DeepSpeed), Multi-node training (Torchrun, Ray), and many more!
- **Flexible Dataset Handling**: Load from local, HuggingFace, and cloud (S3, Azure, GCP, OCI) datasets.
- **Cloud Ready**: We ship [Docker images](https://hub.docker.com/u/axolotlai) and also [PyPI packages](https://pypi.org/project/axolotl/) for use on cloud platforms and local hardware.
-
-
+- Train various Huggingface models such as llama, pythia, falcon, mpt
+- Supports fullfinetune, lora, qlora, relora, and gptq
+- Customize configurations using a simple yaml file or CLI overwrite
+- Load different dataset formats, use custom formats, or bring your own tokenized datasets
+- Integrated with [xformers](https://github.com/facebookresearch/xformers), flash attention, [liger kernel](https://github.com/linkedin/Liger-Kernel), rope scaling, and multipacking
+- Works with single GPU or multiple GPUs via FSDP or Deepspeed
+- Easily run with Docker locally or on the cloud
+- Log results and optionally checkpoints to wandb, mlflow or Comet
+- And more!

 ## 🚀 Quick Start

@@ -55,7 +51,7 @@ Features:

 - NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
 - Python 3.11
- PyTorch ≥2.5.1
+- PyTorch ≥2.4.1

 ### Installation

@@ -85,12 +81,19 @@ axolotl train examples/llama-3/lora-1b.yml

 That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough.

+## ✨ Key Features
+
+- **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more
+- **Training Methods**: Full fine-tuning, LoRA, QLoRA, and more
+- **Easy Configuration**: Simple YAML files to control your training setup
+- **Performance Optimizations**: Flash Attention, xformers, multi-GPU training
+- **Flexible Dataset Handling**: Use various formats and custom datasets
+- **Cloud Ready**: Run on cloud platforms or local hardware

 ## 📚 Documentation

 - [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments
 - [Configuration Guide](https://docs.axolotl.ai/docs/config.html) - Full configuration options and examples
- [Dataset Loading](https://docs.axolotl.ai/docs/dataset_loading.html) - Loading datasets from various sources
 - [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them
 - [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
 - [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
@@ -109,6 +112,31 @@ That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/ge

 Contributions are welcome! Please see our [Contributing Guide](https://github.com/axolotl-ai-cloud/axolotl/blob/main/.github/CONTRIBUTING.md) for details.

+## Supported Models
+
+|             | fp16/fp32 | lora | qlora | gptq | gptq w/flash attn | flash attn | xformers attn |
+|-------------|:----------|:-----|-------|------|-------------------|------------|--------------|
+| llama       | ✅         | ✅    | ✅     | ✅             | ✅                 | ✅          | ✅            |
+| Mistral     | ✅         | ✅    | ✅     | ✅             | ✅                 | ✅          | ✅            |
+| Mixtral-MoE | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
+| Mixtral8X22 | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
+| Pythia      | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
+| cerebras    | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
+| btlm        | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
+| mpt         | ✅         | ❌    | ❓     | ❌             | ❌                 | ❌          | ❓            |
+| falcon      | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
+| gpt-j       | ✅         | ✅    | ✅     | ❌             | ❌                 | ❓          | ❓            |
+| XGen        | ✅         | ❓    | ✅     | ❓             | ❓                 | ❓          | ✅            |
+| phi         | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
+| RWKV        | ✅         | ❓    | ❓     | ❓             | ❓                 | ❓          | ❓            |
+| Qwen        | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
+| Gemma       | ✅         | ✅    | ✅     | ❓             | ❓                 | ✅          | ❓            |
+| Jamba       | ✅         | ✅    | ✅     | ❓             | ❓                 | ✅          | ❓            |
+
+✅: supported
+❌: not supported
+❓: untested
+
 ## ❤️ Sponsors

 Thank you to our sponsors who help make Axolotl possible:
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -17,9 +17,7 @@ quartodoc:
        - convert
        - prompt_tokenizers
        - logging_config
-        - core.builders.base
-        - core.builders.causal
-        - core.builders.rl
+        - core.trainer_builder
        - core.training_args
        - core.chat.messages
        - core.chat.format.chatml
@@ -45,37 +43,13 @@ quartodoc:
        - cli.vllm_serve
        - cli.cloud.base
        - cli.cloud.modal_
-        - cli.quantize
    - title: Trainers
      desc: Training implementations
      contents:
        - core.trainers.base
        - core.trainers.trl
-        - core.trainers.mamba
-        - core.trainers.relora
        - core.trainers.dpo.trainer
        - core.trainers.grpo.trainer
-        - core.trainers.grpo.sampler
-        - core.trainers.utils
-    - title: Model Loading
-      desc: Functionality for loading and patching models, tokenizers, etc.
-      contents:
-        - loaders.model
-        - loaders.tokenizer
-        - loaders.processor
-        - loaders.adapter
-        - loaders.patch_manager
-        - loaders.constants
-    - title: Mixins
-      desc: Mixin classes for augmenting trainers
-      contents:
-        - core.trainers.mixins.optimizer
-        - core.trainers.mixins.rng_state_loader
-        - core.trainers.mixins.scheduler
-    - title: Context Managers
-      desc: Context managers for altering trainer behaviors
-      contents:
-        - utils.ctx_managers.sequence_parallel
    - title: Prompt Strategies
      desc: Prompt formatting strategies
      contents:
@@ -112,7 +86,7 @@ quartodoc:
        - kernels.swiglu
        - kernels.quantize
        - kernels.utils
-    - title: Monkey Patches
+    - title: MonkeyPatches
      desc: Runtime patches for model optimizations
      contents:
        - monkeypatch.llama_attn_hijack_flash
@@ -129,16 +103,17 @@ quartodoc:
        - monkeypatch.trainer_fsdp_optim
        - monkeypatch.transformers_fa_utils
        - monkeypatch.unsloth_
+        - monkeypatch.attention.mllama
        - monkeypatch.data.batch_dataset_fetcher
        - monkeypatch.mixtral
-        - monkeypatch.gradient_checkpointing.offload_cpu
-        - monkeypatch.gradient_checkpointing.offload_disk
    - title: Utils
      desc: Utility functions
      contents:
+        - utils.models
        - utils.tokenization
        - utils.chat_templates
        - utils.lora
+        - utils.lora_embeddings
        - utils.model_shard_quant
        - utils.bench
        - utils.freeze
@@ -149,7 +124,7 @@ quartodoc:
        - utils.optimizers.adopt
        - utils.data.pretraining
        - utils.data.sft
-        - utils.quantization
+        - utils.gradient_checkpointing.unsloth
    - title: Schemas
      desc: Pydantic data models for Axolotl config
      contents:
@@ -199,14 +174,12 @@ quartodoc:
        - utils.callbacks.lisa
        - utils.callbacks.mlflow_
        - utils.callbacks.comet_
-        - utils.callbacks.qat
+
 website:
  title: "Axolotl"
  description: "We make fine-tuning accessible, scalable, and fun"
  favicon: favicon.jpg

-  google-analytics: "G-9KYCVJBNMQ"
-
  navbar:
    logo: image/axolotl_logo_digital_white.svg
    title: false
@@ -259,8 +232,6 @@ website:
            - docs/lr_groups.qmd
            - docs/lora_optims.qmd
            - docs/dataset_loading.qmd
-            - docs/qat.qmd
-            - docs/quantize.qmd

        - section: "Core Concepts"
          contents:
--- a/cicd/Dockerfile-uv.jinja
+++ b/cicd/Dockerfile-uv.jinja
@@ -1,52 +0,0 @@
-FROM axolotlai/axolotl-base-uv:{{ BASE_TAG }}
-
-ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
-ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
-ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
-ENV CUDA="{{ CUDA }}"
-ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
-ENV GITHUB_REF="{{ GITHUB_REF }}"
-ENV GITHUB_SHA="{{ GITHUB_SHA }}"
-ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
-ENV HF_HOME="{{ HF_HOME }}"
-
-RUN apt-get update && \
-    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
-
-WORKDIR /workspace
-
-RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
-
-WORKDIR /workspace/axolotl
-
-RUN git fetch origin +$GITHUB_REF && \
-    git checkout FETCH_HEAD
-
-# If AXOLOTL_EXTRAS is set, append it in brackets
-RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
-        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
-        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
-        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
-        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
-        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
-    fi
-
-RUN uv pip install packaging==23.2 setuptools==75.8.0
-RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
-    else \
-        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
-    fi
-
-RUN python scripts/unsloth_install.py --uv | sh
-RUN python scripts/cutcrossentropy_install.py --uv | sh
-
-# So we can test the Docker image
-RUN uv pip install -r requirements-dev.txt -r requirements-tests.txt
-
-# fix so that git fetch/pull from remote works
-RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
-    git config --get remote.origin.fetch
-
-# helper for huggingface-login cli
-RUN git config --global credential.helper store
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -9,7 +9,8 @@ pytest -v --durations=10 -n8 \
  --ignore=tests/patched/ \
  --ignore=tests/cli \
  /workspace/axolotl/tests/ \
-  --cov=axolotl
+  --cov=axolotl \
+  --cov-report=xml:coverage.xml

 # Run lora kernels tests with coverage append
 pytest -v --durations=10 \
@@ -18,7 +19,7 @@ pytest -v --durations=10 \
  --cov-append

 # Run patched tests excluding lora kernels with coverage append
-pytest --full-trace -vvv --durations=10 \
+pytest -v --durations=10 \
  --ignore=tests/e2e/patched/lora_kernels \
  /workspace/axolotl/tests/e2e/patched \
  --cov=axolotl \
@@ -50,6 +51,11 @@ pytest -v --durations=10 \
  /workspace/axolotl/tests/e2e/ \
  --cov=axolotl \
  --cov-append \
-  --cov-report=xml:e2e-coverage.xml
+  --cov-report=xml:coverage.xml

-codecov upload-process -t $CODECOV_TOKEN -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION} || true
+# Upload coverage to Codecov
+if [ -f e2e-coverage.xml ]; then
+  codecov -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION}
+else
+  echo "Coverage file not found. Coverage report may have failed."
+fi
--- a/cicd/cleanup.py
+++ b/cicd/cleanup.py
@@ -1,19 +0,0 @@
-"""Modal app to run axolotl GPU cleanup"""
-
-from .single_gpu import VOLUME_CONFIG, app, cicd_image, run_cmd
-
-
-@app.function(
-    image=cicd_image,
-    timeout=60 * 60,
-    cpu=8.0,
-    memory=131072,
-    volumes=VOLUME_CONFIG,
-)
-def cleanup():
-    run_cmd("./cicd/cleanup.sh", "/workspace/axolotl")
-
-
-@app.local_entrypoint()
-def main():
-    cleanup.remote()
--- a/cicd/cleanup.sh
+++ b/cicd/cleanup.sh
@@ -1,6 +0,0 @@
-#!/bin/bash
-set -e
-
-# cleanup old cache files for datasets processing and intermediate mappings
-find /workspace/data/huggingface-cache/hub/datasets -name "cache-*" -type f -mtime +1 -exec rm {} \;
-find /workspace/data/huggingface-cache/hub/datasets -name "*.lock" -type f -mtime +1 -exec rm {} \;
--- a/cicd/e2e_tests.py
+++ b/cicd/e2e_tests.py
@@ -1,12 +1,74 @@
 """Modal app to run axolotl GPU tests"""

-from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd
+# pylint: disable=duplicate-code
+
+import os
+import pathlib
+import tempfile
+
+import jinja2
+import modal
+from jinja2 import select_autoescape
+from modal import App, Image
+
+cicd_path = pathlib.Path(__file__).parent.resolve()
+
+template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
+template_env = jinja2.Environment(
+    loader=template_loader, autoescape=select_autoescape()
+)
+df_template = template_env.get_template("Dockerfile.jinja")
+
+df_args = {
+    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
+    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
+    "CUDA": os.environ.get("CUDA", "121"),
+    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
+    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
+    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
+    "HF_HOME": "/workspace/data/huggingface-cache/hub",
+}
+
+dockerfile_contents = df_template.render(**df_args)
+
+temp_dir = tempfile.mkdtemp()
+with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
+    f.write(dockerfile_contents)
+
+cicd_image = Image.from_dockerfile(
+    pathlib.Path(temp_dir) / "Dockerfile",
+    context_mount=None,
+    force_build=True,
+    gpu="A10G",
+).env(df_args)
+
+app = App("Axolotl CI/CD", secrets=[])
+
+hf_cache_volume = modal.Volume.from_name(
+    "axolotl-ci-hf-hub-cache", create_if_missing=True
+)
+VOLUME_CONFIG = {
+    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
+}
+
+N_GPUS = int(os.environ.get("N_GPUS", 1))
+GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
+
+
+def run_cmd(cmd: str, run_folder: str):
+    import subprocess  # nosec
+
+    # Propagate errors from subprocess.
+    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
+        exit(exit_code)  # pylint: disable=consider-using-sys-exit


@app.function(
    image=cicd_image,
    gpu=GPU_CONFIG,
-    timeout=90 * 60,  # 90 min
+    timeout=60 * 60,
    cpu=8.0,
    memory=131072,
    volumes=VOLUME_CONFIG,
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -24,12 +24,11 @@ df_template = template_env.get_template("Dockerfile.jinja")
 df_args = {
    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.5.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu124-2.5.1"),
-    "CUDA": os.environ.get("CUDA", "124"),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
+    "CUDA": os.environ.get("CUDA", "121"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
-    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
    "HF_HOME": "/workspace/data/huggingface-cache/hub",
 }

@@ -55,7 +54,7 @@ VOLUME_CONFIG = {
 }

 N_GPUS = int(os.environ.get("N_GPUS", 2))
-GPU_CONFIG = f"H100:{N_GPUS}"
+GPU_CONFIG = modal.gpu.H100(count=N_GPUS)


 def run_cmd(cmd: str, run_folder: str):
@@ -70,7 +69,7 @@ def run_cmd(cmd: str, run_folder: str):
    image=cicd_image,
    gpu=GPU_CONFIG,
    timeout=90 * 60,
-    cpu=16.0,
+    cpu=8.0,
    memory=131072 * N_GPUS,
    volumes=VOLUME_CONFIG,
 )
--- a/cicd/multigpu.sh
+++ b/cicd/multigpu.sh
@@ -6,13 +6,13 @@ pytest -v -n2 \
  --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
  --ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
  /workspace/axolotl/tests/e2e/multigpu/ \
-  --cov=axolotl
-
-# Run solo tests with coverage append
-pytest -v --durations=10 -n1 \
-  /workspace/axolotl/tests/e2e/multigpu/solo/ \
  --cov=axolotl \
-  --cov-append
+  --cov-report=xml:multigpu-coverage.xml
+
+pytest -v  --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/ \
+  --cov=axolotl \
+  --cov-append \
+  --cov-report=xml:multigpu-coverage.xml

 pytest -v  --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \
  --cov=axolotl \
@@ -20,4 +20,8 @@ pytest -v  --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \
  --cov-report=xml:multigpu-coverage.xml

 # Upload coverage to Codecov
-codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true
+if [ -f multigpu-coverage.xml ]; then
+  codecov -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION}
+else
+  echo "Coverage file not found. Coverage report may have failed."
+fi
--- a/cicd/single_gpu.py
+++ b/cicd/single_gpu.py
@@ -1,68 +0,0 @@
-"""Modal app to run axolotl GPU tests"""
-
-# pylint: disable=duplicate-code
-
-import os
-import pathlib
-import tempfile
-
-import jinja2
-import modal
-import modal.experimental
-from jinja2 import select_autoescape
-from modal import App
-
-cicd_path = pathlib.Path(__file__).parent.resolve()
-
-template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
-template_env = jinja2.Environment(
-    loader=template_loader, autoescape=select_autoescape()
-)
-dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
-df_template = template_env.get_template(dockerfile)
-
-df_args = {
-    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
-    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.5.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu124-2.5.1"),
-    "CUDA": os.environ.get("CUDA", "124"),
-    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
-    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
-    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
-    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
-    "HF_HOME": "/workspace/data/huggingface-cache/hub",
-}
-
-dockerfile_contents = df_template.render(**df_args)
-
-temp_dir = tempfile.mkdtemp()
-with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
-    f.write(dockerfile_contents)
-
-cicd_image = modal.experimental.raw_dockerfile_image(
-    pathlib.Path(temp_dir) / "Dockerfile",
-    # context_mount=None,
-    force_build=True,
-    # gpu="A10G",
-).env(df_args)
-
-app = App("Axolotl CI/CD", secrets=[])
-
-hf_cache_volume = modal.Volume.from_name(
-    "axolotl-ci-hf-hub-cache", create_if_missing=True
-)
-VOLUME_CONFIG = {
-    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
-}
-
-N_GPUS = int(os.environ.get("N_GPUS", 1))
-GPU_CONFIG = f"L40S:{N_GPUS}"
-
-
-def run_cmd(cmd: str, run_folder: str):
-    import subprocess  # nosec
-
-    # Propagate errors from subprocess.
-    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
-        exit(exit_code)  # pylint: disable=consider-using-sys-exit
--- a/codecov.yml
+++ b/codecov.yml
@@ -1,7 +1,5 @@
 codecov:
  require_ci_to_pass: yes
-  notify:
-    wait_for_ci: true

 coverage:
  precision: 2
@@ -19,7 +17,7 @@ coverage:
        if_no_uploads: error
        if_not_found: success
        if_ci_failed: error
-        only_pulls: true
+        only_pulls: false
        flags: null
        paths: null
    patch:
@@ -51,6 +49,3 @@ comment:
  require_changes: no
  require_base: no
  require_head: yes
-
-github_checks:
-  annotations: false
--- a/deepspeed_configs/zero2_torch_compile.json
+++ b/deepspeed_configs/zero2_torch_compile.json
@@ -1,31 +0,0 @@
-{
-  "compile": {
-    "disable": false,
-    "backend": "inductor"
-  },
-  "zero_optimization": {
-    "stage": 2,
-    "offload_optimizer": {
-      "device": "cpu"
-    },
-    "contiguous_gradients": true,
-    "overlap_comm": true
-  },
-  "bf16": {
-    "enabled": "auto"
-  },
-  "fp16": {
-    "enabled": "auto",
-    "auto_cast": false,
-    "loss_scale": 0,
-    "initial_scale_power": 32,
-    "loss_scale_window": 1000,
-    "hysteresis": 2,
-    "min_loss_scale": 1
-  },
-  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
-  "train_batch_size": "auto",
-  "train_micro_batch_size_per_gpu": "auto",
-  "wall_clock_breakdown": false
-}
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -37,7 +37,3 @@ RUN git lfs install --skip-repo && \
    pip3 install awscli && \
    # The base image ships with `pydantic==1.8.2` which is not working
    pip3 install -U --no-cache-dir pydantic==1.10.10
-
-RUN if [ "$PYTORCH_VERSION" = "2.7.1" ] ; then \
-        pip3 install flash-attn==2.7.4.post1; \
-    fi
--- a/docker/Dockerfile-base-next
+++ b/docker/Dockerfile-base-next
@@ -29,7 +29,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
 WORKDIR /workspace

 RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
-    python3 -m pip install --no-cache-dir -U torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
+    python3 -m pip install --no-cache-dir -U torch==2.7.0 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"

--- a/docker/Dockerfile-uv-base
+++ b/docker/Dockerfile-uv-base
@@ -1,40 +0,0 @@
-ARG CUDA_VERSION="12.6.3"
-ARG CUDNN_VERSION=""
-ARG UBUNTU_VERSION="22.04"
-ARG MAX_JOBS=4
-
-FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
-
-ARG PYTHON_VERSION="3.11"
-ARG PYTORCH_VERSION="2.6.0"
-ARG CUDA="126"
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
-
-ENV PYTHON_VERSION=$PYTHON_VERSION
-ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
-ENV UV_TORCH_BACKEND="cu${CUDA}"
-
-RUN apt-get update \
-    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config curl && rm -rf /var/lib/apt/lists/* \
-    && git lfs install --skip-repo \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh
-
-ENV PATH="/root/.local/bin:${PATH}"
-
-RUN uv python install ${PYTHON_VERSION}
-
-WORKDIR /workspace
-
-RUN uv venv --no-project --relocatable axolotl-venv
-
-ENV PATH="/workspace/axolotl-venv/bin:${PATH}"
-
-RUN uv pip install packaging setuptools wheel psutil \
-    && uv pip install torch==${PYTORCH_VERSION} \
-    && uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \
-    && uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \
-    && uv pip install awscli pydantic
-
-RUN if [ "$PYTORCH_VERSION" = "2.7.1" ] ; then \
-        uv pip install --no-build-isolation flash-attn==2.7.4.post1; \
-    fi
--- a/docs/cli.qmd
+++ b/docs/cli.qmd
@@ -199,27 +199,6 @@ output_dir: # Directory to save evaluation results

 See [LM Eval Harness](https://github.com/EleutherAI/lm-evaluation-harness) for more details.

-### delinearize-llama4
-
-Delinearizes a Llama 4 linearized model into a regular HuggingFace Llama 4 model. This only works with the non-quantized linearized model.
-
-```bash
-axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir
-```
-
-This would be necessary to use with other frameworks. If you have an adapter, merge it with the non-quantized linearized model before delinearizing.
-
-### quantize
-
-Quantizes a model using the quantization configuration specified in your YAML file.
-
-```bash
-axolotl quantize config.yml
-```
-
-See [Quantization](./quantize.qmd) for more details.
-
-
 ## Legacy CLI Usage

 While the new Click-based CLI is preferred, Axolotl still supports the legacy module-based CLI:
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -27,15 +27,11 @@ trust_remote_code:
 tokenizer_use_fast:
 # Whether to use the legacy tokenizer setting, defaults to True
 tokenizer_legacy:
-# Whether to use mistral-common tokenizer. If set to True, it will use the mistral-common tokenizer.
-tokenizer_use_mistral_common:
 # Resize the model embeddings when new tokens are added to multiples of 32
 # This is reported to improve training speed on some models
 resize_token_embeddings_to_32x:
 # Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.
 shrink_embeddings:
-# Optional[bool] Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs
-embeddings_skip_upcast:
 # Whether to load the model with randomly initialized weights. Useful for
 # pre-training a model from scratch or debugging purposes.
 random_init_weights:
@@ -59,44 +55,55 @@ overrides_of_model_config:
 overrides_of_model_kwargs:
  # use_cache: False

-# optional overrides to the bnb 4bit quantization configuration
-# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig
-bnb_config_kwargs:
-  # These are default values
-  llm_int8_has_fp16_weight: false
-  bnb_4bit_quant_type: nf4
-  bnb_4bit_use_double_quant: true

-# quantization aware training
-qat:
-  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
-  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8"
-  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
-  fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after

-# post-training quantization
+# Quantization configuration.
 quantization:
-  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8
-  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
-  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
-  quantize_embedding: # Optional[bool] = False. Whether to quantize the embedding layer.
+  backend: bnb | hqq | gptq
+  bits: 8
+  # optional overrides to the bnb 4bit quantization configuration
+  # https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig
+  bnb_config_kwargs:
+    # These are default values
+    llm_int8_has_fp16_weight: false
+    bnb_4bit_quant_type: nf4
+    bnb_4bit_use_double_quant: true

+  # If using the hqq backend, additional config parameters are needed. See: https://huggingface.co/docs/transformers/main/en/quantization/hqq
+  hqq_config:
+    # pick one of the following, depending on if you want to uniformly quantize the whole model or
+    # apply different quantization settings to specific layers in the model:

+    # to quantize the whole model uniformly:
+    group_size: 64
+    # or, to apply different quantization settings to specific layers (dynamic config), use a list of per-layer settings instead:
+    - nbits: 4
+      group_size: 64
+      target_modules:
+        - self_attn.k_proj
+        - self_attn.v_proj
+        - self_attn.o_proj
+    - nbits: 3
+      group_size: 32
+      target_modules:
+        - mlp.gate_proj
+        - mlp.up_proj
+        - mlp.down_proj
+
+# (Internal Use Only)
 # Whether you are training a 4-bit GPTQ quantized model
-gptq: true
-
+gptq:
 # This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
-load_in_8bit: true
+load_in_8bit:
 # Use bitsandbytes 4 bit
 load_in_4bit:

 # Use CUDA bf16
-bf16: true # bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection. require >=ampere
+bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
 # Use CUDA fp16
 fp16: true
 # Use CUDA tf32
 tf32: true # require >=ampere
-# Note: if bf16 is set to 'auto', and fp16 is set to true, we will prefer the explict fp16 setting

 # No AMP (automatic mixed precision)
 bfloat16: true # require >=ampere
@@ -114,10 +121,8 @@ plugins:
  # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

 # A list of one or more datasets to finetune the model with
-# See https://docs.axolotl.ai/docs/dataset_loading.html for guide on loading datasets
-# See https://docs.axolotl.ai/docs/dataset-formats/ for guide on dataset formats
 datasets:
-  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory
+  # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
  - path: vicgalle/alpaca-gpt4
    # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
@@ -175,14 +180,6 @@ datasets:
    # Key containing the messages (default: "messages")
    field_messages: messages

-    # Key containing the tools (default: "tools")
-    # Must be a list[dict] and follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
-    field_tools: tools
-
-    # Key containing the system message (default: "system")
-    # If the system message is not present in the dataset sample, it will be loaded from the field_system property.
-    field_system: system
-
    # Mapping of properties from the input dataset to the chat template.
    # (default: message_property_mappings={'role':'role', 'content':'content'})
    # If a property exists in the template but not in this mapping, the system will attempt
@@ -209,14 +206,10 @@ datasets:
    # adding a system turn with empty content.
    drop_system_message:

-    # Optional[bool]. (for Qwen3 template only) Whether to split the assistant content based on a reasoning trace inside delimited tags
-    # See example at `docs/dataset-formats/conversation.qmd`
-    split_thinking:
-
    # IMPORTANT: The following fields determine which parts of the conversation to train on.
    # Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train
    # See examples at `docs/dataset-formats/conversation.qmd`
-    # Note: If the below 5 fields are empty, defaults to training only on the last message.
+    # Note: If the below 4 fields are set to empty, defaults to training only on the last message.

    # Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.
    roles_to_train: ["assistant"]  # default
@@ -225,13 +218,7 @@ datasets:
    # - turn (default): train on the EOS token at the end of each trainable turn
    # - last: train on the last EOS token in the conversation
    # TIP: Please make sure that your `tokenizer.eos_token` is same as EOS/EOT token in template. Otherwise, set `eos_token` under `special_tokens`.
-    train_on_eos: turn
-    # Optional[str]. Which EOT (End-of-Turn) tokens to train on in the conversation. Possible values are:
-    # - all: train on all EOT tokens
-    # - turn: train on the EOT token at the end of each trainable turn
-    # - last: train on the last EOT token in the conversation
-    # If not specified, defaults to the value of train_on_eos for backward compatibility.
-    train_on_eot:
+    train_on_eos: last
    # The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`.
    message_field_training: training
    # The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn.
@@ -243,7 +230,7 @@ datasets:
 # The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
 shuffle_merged_datasets: true

-# Deduplicates datasets and test_datasets with identical entries.
+# Deduplicates datasets and test_datasets with identical entries.
 dataset_exact_deduplication: true

 # A list of one or more datasets to eval the model with.
@@ -292,25 +279,10 @@ trl:

  num_generations: # Optional[int]. Number of generations to sample.
  log_completions: # Optional[bool]. Whether to log completions.
-  num_completions_to_print: # Optional[int]. Number of completions to print when log_completions is True.

  sync_ref_model: # Optional[bool]. Whether to sync the reference model.
  ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.
  ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.
-  scale_rewards: # Optional[bool]. Whether to scale rewards by their standard deviation.
-
-  temperature: # Optional[float]. Sampling temperature for the GRPO policy.
-  top_p: # Optional[float]. Top-p sampling probability for the generation policy.
-  top_k: # Optional[int]. Top-k sampling for the generation policy.
-  min_p: # Optional[float]. Minimum probability for the generation policy.
-  repetition_penalty: # Optional[float]. Penalty for tokens that appear in prompt and generated text.
-
-  num_iterations: # Optional[int]. Number of iterations per batch (μ) for GRPO.
-  epsilon: # Optional[float]. Epsilon value for clipping in the GRPO algorithm.
-  epsilon_high: # Optional[float]. Upper-bound epsilon value for clipping in the GRPO algorithm.
-  use_liger_loss: # Optional[bool]. Whether to use Liger loss for GRPO.
-  loss_type: # Optional[str]. Loss formulation to use. Supported values: grpo, bnpo, dr_grpo.
-  mask_truncated_completions: # Optional[bool]. Whether to exclude truncated completions from loss calculation.


 # reward modelling: `True` or `False`
@@ -329,17 +301,8 @@ process_reward_model:
 chat_template: tokenizer_default
 # custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.
 chat_template_jinja: null
-# Optional[List[str]]. Custom EOT (End-of-Turn) tokens to mask/unmask during training.
-# These tokens mark the boundaries between conversation turns.
-# For example: ["/INST", "</s>", "[/SYSTEM_PROMPT]"]
-# If not specified, defaults to just the model's eos_token.
-# This is useful for templates that use multiple delimiter tokens.
-eot_tokens:
-  # - "</s>"
-  # - "[/INST]"
-  # - "[/SYSTEM_PROMPT]"
-# Changes the default system message
-default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
+# Changes the default system message. Currently only supports chatml.
+default_system_message: You are a helpful assistant. Please give a long and detailed answer.
 # Axolotl attempts to save the dataset as an arrow after packing the data together so
 # subsequent training attempts load faster, relative path
 dataset_prepared_path: data/last_run_prepared
@@ -520,7 +483,6 @@ output_dir: ./completed-model
 # setting to `auto` will enable torch compile when torch>=2.5.1
 torch_compile:  # Optional[Union[Literal["auto"], bool]]
 torch_compile_backend:  # Optional[str]
-torch_compile_mode:  # 'default' | 'reduce-overhead' | 'max-autotune'

 # Training hyperparameters

@@ -543,7 +505,6 @@ save_strategy: # Set to `"no"` to skip checkpoint saves, `"epoch"` at end of eac
 save_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps
 saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
 save_total_limit: # Checkpoints saved at a time
-save_only_model: # Save only the model weights, skipping the optimizer. Using this means you can't resume from checkpoints.
 # Maximum number of iterations to train for. It precedes num_epochs which means that
 # if both are set, num_epochs will not be guaranteed.
 # e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
@@ -567,7 +528,7 @@ profiler_steps: # enable the pytorch profiler to capture the first N steps of tr
 loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
 loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)

-# Save model as safetensors (require safetensors package). Default True
+# Save model as safetensors (require safetensors package)
 save_safetensors:

 # Whether to mask out or include the human's prompt from the training labels
@@ -577,7 +538,7 @@ train_on_inputs: false
 # Note that training loss may have an oscillating pattern with this enabled.
 group_by_length: false

-# Whether to use gradient checkpointing. Available options are: true, false, "offload", "offload_disk".
+# Whether to use gradient checkpointing. Available options are: true, false, "offload".
 # https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
 gradient_checkpointing: false
 # additional kwargs to pass to the trainer for gradient checkpointing
@@ -589,24 +550,7 @@ gradient_checkpointing: false
 early_stopping_patience: 3

 # Specify a scheduler and kwargs to use with the optimizer
-# Valid values are driven by the Transformers SchedulerType class, see:
-# https://github.com/huggingface/transformers/blob/5f4ecf2d9f867a1255131d2461d75793c0cf1db2/src/transformers/trainer_utils.py#L420
-# Valid values include
-# - 'linear'
-# - 'cosine' (default)
-# - 'cosine_with_restarts'
-# - 'polynomial'
-# - 'constant'
-# - 'constant_with_warmup'
-# - 'inverse_sqrt'
-# - 'reduce_lr_on_plateau'
-# - 'cosine_with_min_lr'
-# - 'warmup_stable_decay'
-
-# Additional schedulers include:
-# - 'one_cycle'
-# - 'rex'
-lr_scheduler:
+lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | empty for cosine
 lr_scheduler_kwargs:
 cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
 cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
@@ -624,7 +568,7 @@ lr_div_factor: # Learning rate div factor
 #
 # Valid values for 'optimizer' include:
 # - adamw_torch
-# - adamw_torch_fused (default)
+# - adamw_torch_fused
 # - adamw_torch_xla
 # - adamw_torch_npu_fused
 # - adamw_apex_fused
@@ -668,7 +612,6 @@ lr_div_factor: # Learning rate div factor
 # - optimi_adamw
 # - ao_adamw_8bit
 # - ao_adamw_fp8
-# - came_pytorch
 optimizer:
 # Dictionary of arguments to pass to the optimizer
 optim_args:
@@ -688,9 +631,7 @@ weight_decay:
 # adamw hyperparams
 adam_beta1:
 adam_beta2:
-adam_beta3:  # only used for CAME Optimizer
 adam_epsilon:
-adam_epsilon2:  # only used for CAME Optimizer
 # Gradient clipping max norm
 max_grad_norm:

@@ -746,10 +687,8 @@ special_tokens:
  # unk_token: "<unk>"
  # pad_token: "[PAD]"

-# Optional[list[str]]. Add extra tokens to the tokenizer.
+# Add extra tokens.
 tokens:
-  # - "<|startoftext|>"
-  # - "<|endoftext|>"

 # Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer.
 # Only works for tokens that are not part of the base vocab (aka are added_tokens).
--- a/docs/custom_integrations.qmd
+++ b/docs/custom_integrations.qmd
@@ -49,8 +49,7 @@ sections = [
    ("Knowledge Distillation (KD)", "kd"),
    ("Liger Kernels", "liger"),
    ("Language Model Evaluation Harness (LM Eval)", "lm_eval"),
-    ("Spectrum", "spectrum"),
-    ("LLMCompressor", "llm_compressor")
+    ("Spectrum", "spectrum")
 ]

 for section_name, folder_name in sections:
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -4,6 +4,18 @@ description: Conversation format for supervised fine-tuning.
 order: 3
 ---

+## sharegpt
+
+::: {.callout-important}
+ShareGPT is deprecated! Please see the [chat_template](#chat_template) section below.
+:::
+
+## pygmalion
+
+```{.json filename="data.jsonl"}
+{"conversations": [{"role": "...", "value": "..."}]}
+```
+
 ## chat_template

 Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. Support using tokenizer's template, a supported template, or custom jinja2.
@@ -52,9 +64,7 @@ We recommend checking the below examples for other usecases.

 ### Examples

-#### Training on last message
-
-(Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
+1. Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.

 ```yaml
 datasets:
@@ -68,9 +78,7 @@ datasets:
 If you receive an error like "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null.", it means the tokenizer does not have a default `chat_template`. Follow the examples below instead to set a custom `chat_template`.
 :::

-#### Overriding default chat template
-
-Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.
+2. Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.

 ```yaml
 chat_template: gemma # this overwrites the tokenizer's chat_template
@@ -80,13 +88,7 @@ datasets:
    roles_to_train: ["assistant"]  # default value
 ```

-::: {.callout-note}
-If you want to use built-in chat_template, use `chat_template: tokenizer_default` (this is set by default).
-:::
-
-#### Using default chat template with fallback
-
-Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.
+3. Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.

 ```yaml
 chat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer's chat_template
@@ -95,9 +97,7 @@ datasets:
    type: chat_template
 ```

-#### Custom Jinja template
-
-Using a custom jinja template on OpenAI messages format, training on all assistant messages.
+4. Using a custom jinja template on OpenAI messages format, training on all assistant messages.

 ```yaml
 # chat_template: jinja # `jinja` will be implied if the `chat_template_jinja` is set and this field is empty
@@ -109,123 +109,10 @@ datasets:
 ```

 ::: {.callout-important}
-Please make sure that your `tokenizer.eos_token` is same as EOS (End-of-Sequence) token in template. Otherwise, set `eos_token` under `special_tokens: `.
+Please make sure that your `tokenizer.eos_token` is the same as the EOS/EOT token in the template. Otherwise, set `eos_token` under `special_tokens`.
 :::

-#### Using template with different token for EOT and EOS
-
- If you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: ` which defaults to turn.
-
-```yaml
-eot_tokens:
-  - "[/INST]"
-  # - "[/SYSTEM_PROMPT]"
-
-datasets:
-  - path: ...
-    type: chat_template
-
-    # optional
-    train_on_eot: turn  # defaults read from train_on_eos (which defaults to turn)
-```
-
-::: {.callout-tip}
-See [config documentation](../config.qmd) for detailed explanations of "turn", "last", and "all" options for training on tokens.
-:::
-
-::: {.callout-note}
-Using `eot_tokens` requires each token that exists in `chat_template` to be a single token in the tokenizer. Otherwise, the tokenizer will split the token and cause unexpected behavior.
-
-You can add those tokens as new tokens under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `. See [config](../config.qmd) for more details.
-:::
-
- Continuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set `train_on_eos: last`.
-
-```yaml
-eot_tokens:
-  - "[/INST]"
-  # ...
-
-datasets:
-  - path: ...
-    type: chat_template
-
-    train_on_eos: last
-    train_on_eot: turn
-```
-
-::: {.callout-tip}
-If EOS token only appears at the end of a prompt, `train_on_eos: last` is equivalent to `train_on_eos: turn`. Therefore, generally, you can leave them to their defaults and omit them.
-:::
-
-
-#### Using tool use
-
-Instead of passing `tools` via the system prompt, an alternative method would be to have the `tools` in a separate column and loaded via `chat_template` to let the template dynamically build it.
-
-```json
-{
-    "tools": [
-        {
-            "type": "...",
-            "function": {
-                "name": "...",
-                "description": "...",
-                "parameters": {
-                    "type": "...",
-                    "properties": {
-                        // ...
-                    },
-                    "required": ["..."],
-                },
-            },
-        },
-    ],
-    "messages": [
-        // ...
-        {
-            "role": "assistant", // call the function via assistant
-            "tool_calls": [
-                {
-                    "type": "function",
-                    "function": {
-                        "name": "...",
-                        "arguments": {
-                            "...": "...",
-                        }
-                    }
-                }
-            ]
-        },
-        {
-            "role": "tool",
-            "name": "...",
-            "content": "..."
-        },
-    ],
-}
-```
-
-::: {.callout-note}
-Tools need to follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
-:::
-
-```yaml
-chat_template: llama4
-datasets:
-  - path: ...
-    type: chat_template
-    # field_tools: tools # default is `tools`
-```
-
-::: {.callout-tip}
-Look into the `chat_template` you are using to see if it supports `tools` and what the expected role is for the tool answer. In the example above, the tool answer is expected to be in the `tool` or `ipython` role for `llama4` template.
-:::
-
-
-#### Using fine-grained control over token masking
-
-(Advanced) Using fine-grained control over tokens and turns to train in a conversation
+5. (Advanced) Using fine-grained control over tokens and turns to train in a conversation

 For a data sample that looks like:

@@ -275,45 +162,3 @@ datasets:
 ::: {.callout-tip}
 It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
 :::
-
-#### Reasoning split
-
-(For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
-
-```yaml
-datasets:
-  - path: ...
-    type: chat_template
-    chat_template: qwen3
-    split_thinking: true
-```
-
-For example, a content can look like:
-
-```json
-{
-  "content": "<think>Some thinking outputs</think>Output after thinking."
-}
-```
-
-After split, it will look like:
-
-```json
-{
-  "reasoning_content": "Some thinking outputs",
-  "content": "Output after thinking..."
-}
-```
-
-
-## sharegpt
-
-::: {.callout-important}
-ShareGPT is deprecated!. Please see [chat_template](#chat_template) section.
-:::
-
-## pygmalion
-
-```{.json filename="data.jsonl"}
-{"conversations": [{"role": "...", "value": "..."}]}
-```
--- a/docs/dataset-formats/index.qmd
+++ b/docs/dataset-formats/index.qmd
@@ -36,6 +36,10 @@ It is typically recommended to save your dataset as `.jsonl` due to its flexibil

 Axolotl supports loading from a Hugging Face hub repo or from local files.

+::: {.callout-important}
+For pre-training only, Axolotl splits texts that exceed the context length into multiple smaller prompts.
+:::
+
 ### Pre-training from Hugging Face hub datasets

 As an example, to train using a Hugging Face dataset `hf_org/name`, you can pass the following config:
@@ -73,21 +77,18 @@ datasets:
    type: completion
 ```

-From local files:
+From local files (either example works):

 ```yaml
 datasets:
  - path: A.jsonl
    type: completion

-  - path: B.jsonl
+  - path: json
+    data_files: ["A.jsonl", "B.jsonl", "C.jsonl"]
    type: completion
 ```

-::: {.callout-important}
-For `completion` only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts. If you are interested in having this for `pretraining_dataset` too, please let us know or help make a PR!
-:::
-
 ### Pre-training dataset configuration tips

 #### Setting max_steps
--- a/docs/dataset_loading.qmd
+++ b/docs/dataset_loading.qmd
@@ -54,7 +54,7 @@ datasets:

 #### Files

-To load a JSON file, you would do something like this:
+Usually, to load a JSON file, you would do something like this:

 ```python
 from datasets import load_dataset
@@ -66,11 +66,19 @@ Which translates to the following config:

 ```yaml
 datasets:
-  - path: data.json
-    ds_type: json
+  - path: json
+    data_files: /path/to/your/file.jsonl
 ```

-In the example above, it can be seen that we can just point the `path` to the file or directory along with the `ds_type` to load the dataset.
+However, to make things easier, we have added a few shortcuts for loading local dataset files.
+
+You can just point the `path` to the file or directory along with the `ds_type` to load the dataset. The example below shows this for a JSON file:
+
+```yaml
+datasets:
+  - path: /path/to/your/file.jsonl
+    ds_type: json
+```

 This works for CSV, JSON, Parquet, and Arrow files.

--- a/docs/docker.qmd
+++ b/docs/docker.qmd
@@ -8,10 +8,6 @@ format:

 This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).

-::: {.callout-important}
-For Blackwell GPUs, please use the tags with Pytorch 2.7.1 and CUDA 12.8.
-:::
-
 ## Base

 The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.
@@ -32,10 +28,9 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}

 Tags examples:

- `main-base-py3.11-cu128-2.7.1`
- `main-base-py3.11-cu126-2.7.1`
 - `main-base-py3.11-cu124-2.6.0`
 - `main-base-py3.11-cu124-2.5.1`
+- `main-base-py3.11-cu124-2.4.1`

 ## Main

@@ -55,7 +50,7 @@ Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl)
 # on push to main
 main-py{python_version}-cu{cuda_version}-{pytorch_version}

-# latest main (currently torch 2.6.0, python 3.11, cuda 12.4)
+# latest main (currently torch 2.5.1, python 3.11, cuda 12.4)
 main-latest

 # nightly build
@@ -73,13 +68,14 @@ There may be some extra tags appended to the image, like `-vllm` which installs

 Tags examples:

- `main-py3.11-cu126-2.7.0`
 - `main-py3.11-cu124-2.6.0`
 - `main-py3.11-cu124-2.5.1`
+- `main-py3.11-cu124-2.4.1`
 - `main-latest`
 - `main-20250303-py3.11-cu124-2.6.0`
 - `main-20250303-py3.11-cu124-2.5.1`
- `0.9.2`
+- `main-20250303-py3.11-cu124-2.4.1`
+- `0.7.1`

 ## Cloud

--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -73,54 +73,10 @@ description: Frequently asked questions

 > A: This is likely an empty turn.

-**Q: The EOS token is incorrectly being masked or not being masked / `EOS token __ not found in chat template`.**
+**Q: The EOS/EOT token is incorrectly being masked or not being masked.**

-> A: There can be two reasons:
-
-> 1. This is because of the mismatch between `tokenizer.eos_token` and EOS token in template. Please make sure to set `eos_token: ` under `special_tokens: ` to the same EOS token as in template.
-
-> 2. The EOS token is not in the template. Please check if your template is correct. As an example, `phi_35` template does not use its dedicated EOS token `<|endoftext|>` at the end.
+> A: This is because of a mismatch between `tokenizer.eos_token` and the EOS/EOT token in the template. Please make sure to set `eos_token` under `special_tokens` to the same EOS/EOT token as in the template.

 **Q: "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null. Please add a `chat_template` in tokenizer config"**

 > A: This is because the tokenizer does not have a chat template. Please add a chat template in the tokenizer config. See [chat_template](dataset-formats/conversation.qmd#chat-template) for more details.
-
-**Q: The EOT token(s) are incorrectly being masked or not being masked / `EOT token __ not found in chat template`.**
-
-> A: There can be two reasons:
-
-> 1. The EOT token is different from the EOS token and was not specified under `eot_tokens: `. Please set `eot_tokens: ` to the same EOT token(s) as in template.
-
-> 2. There is more than one EOT token per turn in the template. Please raise an issue with examples as we recognize this as an edge case.
-
-**Q: `EOT token encoding failed. Please check if the token is valid and can be encoded.`**
-
-> A: There could be some issue with the tokenizer or unicode encoding. Please raise an issue with examples with the EOT token & tokenizer causing the issue.
-
-**Q: `EOT token __ is encoded as multiple tokens.`**
-
-> A: This is because the EOT token is encoded as multiple tokens which can cause unexpected behavior. Please add it under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `.
-
-**Q: `Conflict between train_on_eos and train_on_eot. eos_token is in eot_tokens and train_on_eos != train_on_eot`**
-
-> A: This is because the EOS token is in the `eot_tokens: ` while mismatch between `train_on_eos: ` and `train_on_eot: `. This will cause one to override the other. Please ensure that `train_on_eos: ` and `train_on_eot: ` are the same or remove the EOS token from `eot_tokens: `.
-
-**Q: If `eot_tokens: ` is not provided, what happens?**
-
-> A: If `eot_tokens: ` is not provided, the default behavior is the same as before. EOS tokens used to delimit turns are masked/unmasked depending on whether the turn is trainable.
-
-> Internally, `eot_tokens: tokenizer.eos_token` and `train_on_eot: train_on_eos` (which defaults to `turn`). This transition helps clarify the naming and behavior of EOT/EOS tokens.
-
-**Q: `Data processing error: CAS service error`**
-
-> A: Try disabling XET with `export HF_HUB_DISABLE_XET=1`
-
-**Q: `torch._inductor.exc.LoweringException: NoValidChoicesError: No choices to select, please consider adding ATEN into max_autotune_gemm_backends config (defined in torch/_inductor/config.py) to allow at least one choice. `**
-
-> A: Depending on the version of torch, you may need to include this in your YAML:
-
-> ```yaml
-> flex_attn_compile_kwargs:
->   dynamic: false
->   mode: max-autotune-no-cudagraphs
-> ```
--- a/docs/getting-started.qmd
+++ b/docs/getting-started.qmd
@@ -104,7 +104,7 @@ the `alpaca` dataset format, which has the following format:
 Please see our [Dataset Formats](dataset-formats) for more dataset formats and how to
 format them.

-2. Prepare your JSONL data in the specified format (in this case, the expected `alpaca`
+2. Prepare your JSONL data in the specified format (in this case, the expected `alpaca`
 format):

 ```json
@@ -120,12 +120,6 @@ axolotl train my_training.yml

 ## Common Tasks {#sec-common-tasks}

-::: {.callout-tip}
-
-The same yaml file is used for training, inference, and merging.
-
-:::
-
 ### Testing Your Model {#sec-testing}

 After training, test your model:
@@ -134,16 +128,6 @@ After training, test your model:
 axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out"
 ```

-More details can be found in [Inference](inference.qmd).
-
-### Using a UI {#sec-ui}
-
-Launch a Gradio interface:
-
-```bash
-axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out" --gradio
-```
-
 ### Preprocessing Data {#sec-preprocessing}

 For large datasets, preprocess first:
@@ -152,22 +136,14 @@ For large datasets, preprocess first:
 axolotl preprocess my_training.yml
 ```

-Please make sure to set `dataset_prepared_path: ` in your config to set the path to save the prepared dataset.
+### Using a UI {#sec-ui}

-More details can be found in [Dataset Preprocessing](dataset_preprocessing.qmd).
-
-### Merging LoRA weights {#sec-merging-lora}
-
-To merge the LoRA weights back into the base model, run:
+Launch a Gradio interface:

 ```bash
-axolotl merge-lora my_training.yml --lora-model-dir="./outputs/lora-out"
+axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out" --gradio
 ```

-The merged model will be saved in the `{output_dir}/merged` directory.
-
-More details can be found in [Merging LoRA weights](inference.qmd#sec-merging).
-
 ## Next Steps {#sec-next-steps}

 Now that you have the basics, you might want to:
@@ -180,7 +156,6 @@ Now that you have the basics, you might want to:
 Check our other guides for details on these topics:

 - [Configuration Guide](config.qmd) - Full configuration options
- [Dataset Loading](dataset_loading.qmd) - Loading datasets from various sources
 - [Dataset Formats](dataset-formats) - Working with different data formats
 - [Multi-GPU Training](multi-gpu.qmd)
 - [Multi-Node Training](multi-node.qmd)
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -15,20 +15,10 @@ This guide covers all the ways you can install and set up Axolotl for your envir

 - NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
 - Python ≥3.10
- PyTorch ≥2.5.1
+- PyTorch ≥2.4.1

 ## Installation Methods {#sec-installation-methods}

-::: {.callout-important}
-Please make sure to have Pytorch installed before installing Axolotl in your local environment.
-
-Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
-:::
-
-::: {.callout-important}
-For Blackwell GPUs, please use Pytorch 2.7.0 and CUDA 12.8.
-:::
-
 ### PyPI Installation (Recommended) {#sec-pypi}

 ```{.bash}
@@ -41,40 +31,6 @@ installed) in order not to clobber it, and so that we set the correct version of
 dependencies that are specific to the PyTorch version or other installed
 co-dependencies.

-### uv Installation {#sec-uv}
-
-uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments.
-
-Install uv if not already installed
-```{.bash}
-curl -LsSf https://astral.sh/uv/install.sh | sh
-source $HOME/.local/bin/env
-```
-
-Choose your CUDA version to use with PyTorch; e.g. `cu124`, `cu126`, `cu128`,
-then create the venv and activate
-```{.bash}
-export UV_TORCH_BACKEND=cu126
-uv venv --no-project --relocatable
-source .venv/bin/activate
-```
-
-Install PyTorch
- PyTorch 2.6.0 recommended
-```{.bash}
-uv pip install packaging setuptools wheel
-uv pip install torch==2.6.0
-uv pip install awscli pydantic
-```
-
-Install axolotl from PyPi
-```{.bash}
-uv pip install --no-build-isolation axolotl[deepspeed,flash-attn]
-
-# optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO
-uv pip install --no-build-isolation axolotl[deepspeed,flash-attn,vllm]
-```
-
 ### Edge/Development Build {#sec-edge-build}

 For the latest features between releases:
@@ -110,10 +66,6 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
 ```
 :::

-::: {.callout-important}
-For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.7.0` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.7.0`.
-:::
-
 Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.

 ## Cloud Environments {#sec-cloud}
--- a/docs/lora_optims.qmd
+++ b/docs/lora_optims.qmd
@@ -84,10 +84,6 @@ lora_qkv_kernel: true
 lora_o_kernel: true
 ```

-::: {.callout-note}
-Currently, LoRA kernels are not supported for RLHF training, only SFT.
-:::
-
 ## Requirements

 - One or more NVIDIA or AMD GPUs (in order to use the Triton kernels)
--- a/docs/multi-gpu.qmd
+++ b/docs/multi-gpu.qmd
@@ -87,7 +87,20 @@ We support sequence parallelism (SP) via the
 allows one to split up sequences across GPUs, which is useful in the event that a
 single sequence causes OOM errors during model training.

-See our [dedicated guide](sequence_parallelism.qmd) for more information.
+First, install `ring-flash-attn`, recommended via `pip install axolotl[ring-flash-attn]`,
+or from source with `pip install .[ring-flash-attn]`.
+
+Your Axolotl YAML config should contain the following lines:
+
+```{.yaml}
+sequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU
+flash_attention: true  # Required with sequence parallelism
+
+# Optional; strides across the key dimension. Larger values use more memory but will make training faster.
+heads_k_stride: 1
+```
+
+See our [dedicated guide](sequence_parallelism.qmd) for more details.

 ### FSDP + QLoRA {#sec-fsdp-qlora}

--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -43,7 +43,7 @@ datasets:
 # leave the vision model and vision tower frozen
 # load_in_8bit: true
 adapter: lora
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 # (optional) if you want to resize images to a set size
 image_size: 512
@@ -164,7 +164,7 @@ Here is an example of a multi-modal dataset:
        {
            "role": "user",
            "content": [
-                {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
+                {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
                {"type": "text", "text": "Describe this image in detail."}
            ]
        },
--- a/docs/qat.qmd
+++ b/docs/qat.qmd
@@ -1,32 +0,0 @@
---
-title: "Quantization Aware Training (QAT)"
-back-to-top-navigation: true
-toc: true
-toc-expand: 2
-toc-depth: 4
---
-
-## Overview
-
-[Quantization Aware Training](https://pytorch.org/blog/introduction-to-quantization-on-pytorch/#quantization-aware-training) (QAT) is a technique for improving the accuracy of models which are quantized
-by applying "fake" quantizations to the model's weights (and optionally, activations) during training. This fake
-quantization allows for the model to adjust for noise introduced by the quantization, so when the model is eventually
-quantized, the accuracy loss is minimized. We use the quantization techniques implemented in [torchao](https://github.com/pytorch/ao) to provide
-support for QAT and post-training quantization (PTQ) in axolotl.
-
-We recommend reviewing the excellent QAT tutorial in the [torchtune library](https://pytorch.org/torchtune/main/tutorials/qat_finetune.html#quantizing-the-qat-model),
-and the QAT documentation in the [torchao library](https://github.com/pytorch/ao/tree/main/torchao/quantization/qat), for more details.
-
-## Configuring QAT in Axolotl
-
-To enable QAT in axolotl, add the following to your configuration file:
-
-```yaml
-qat:
-  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
-  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8"
-  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
-  fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after
-```
-
-Once you have finished training, you must quantize your model by using the same quantization configuration which you used to train the model with. You can use the [`quantize`](./quantize.qmd) command to do this.
--- a/docs/quantize.qmd
+++ b/docs/quantize.qmd
@@ -1,53 +0,0 @@
---
-title: "Quantization with torchao"
-back-to-top-navigation: true
-toc: true
-toc-expand: 2
-toc-depth: 4
---
-
-Quantization is a technique to lower the memory footprint of your model, potentially at the cost of accuracy or model performance. We support quantizing your model using the [torchao](https://github.com/pytorch/ao) library. Quantization is supported for both post-training quantization (PTQ) and quantization-aware training (QAT).
-
-
-::: {.callout-note}
-
-We do not currently support quantization techniques such as GGUF/GPTQ,EXL2 at the moment.
-
-:::
-
-## Configuring Quantization in Axolotl
-
-Quantization is configured using the `quantization` key in your configuration file.
-
-```yaml
-base_model: # The path to the model to quantize.
-quantization:
-  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8
-  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
-  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
-  quantize_embedding: # Optional[bool] = False. Whether to quantize the embedding layer.
-
-output_dir:  # The path to the output directory.
-```
-
-Once quantization is complete, your quantized model will be saved in the `{output_dir}/quantized` directory.
-
-You may also use the `quantize` command to quantize a model which has been trained with [QAT](./qat.md) - you can do this by using the existing QAT configuration file which
-you used to train the model:
-
-```yaml
-# qat.yml
-qat:
-  activation_dtype: int8
-  weight_dtype: int8
-  group_size: 256
-  quantize_embedding: true
-
-output_dir: # The path to the output directory used during training where the final checkpoint has been saved.
-```
-
-```bash
-axolotl quantize qat.yml
-```
-
-This ensures that an identical quantization configuration is used to quantize the model as was used to train it.
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -16,8 +16,7 @@ feedback. Various methods include, but not limited to:
 - [Identity Preference Optimization (IPO)](#ipo)
 - [Kahneman-Tversky Optimization (KTO)](#kto)
 - [Odds Ratio Preference Optimization (ORPO)](#orpo)
- [Group Relative Policy Optimization (GRPO)](#grpo)
- Proximal Policy Optimization (PPO) (not yet supported in axolotl, if you're interested in contributing, please reach out!)
+- Proximal Policy Optimization (PPO) (not yet supported in axolotl)


 ## RLHF using Axolotl
@@ -500,10 +499,12 @@ The input format is a simple JSON input with customizable fields based on the ab
 ### GRPO

 ::: {.callout-tip}
-Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/grpo_code).
+Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
 :::

-In the latest GRPO implementation, `vLLM` is used to significantly speedup trajectory generation during training. In this example, we're using 4 GPUs - 2 for training, and 2 for vLLM:
+If you have multiple GPUs available, we recommend using `vLLM` with the `GRPOTrainer` to significantly speed up trajectory generation during training.
+First, launch a `vLLM` server using `trl vllm-serve` - you may use a config file or CLI overrides to configure your vLLM server. In this example, we're
+using 4 GPUs - 2 for training, and 2 for vLLM:

 ::: {.callout-important}
 Make sure you've installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. `pip install axolotl[vllm]`.
@@ -538,10 +539,6 @@ Your `vLLM` instance will now attempt to spin up, and it's time to kick off trai
 CUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2
 ```

-::: {.callout-note}
-Due to TRL's implementation with vLLM, the vLLM instance must use the last N GPUs instead of the first N GPUs. This is why in the example above, we use `CUDA_VISIBLE_DEVICES=2,3` for the vLLM instance.
-:::
-
 #### Reward functions

 GRPO uses custom reward functions and transformations. Please have them ready locally.
@@ -583,20 +580,7 @@ datasets:

 To see other examples of custom reward functions, please see [TRL GRPO Docs](https://github.com/huggingface/trl/blob/main/docs/source/grpo_trainer.md#using-a-custom-reward-function).

-To see all configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/v0.9.2/src/axolotl/utils/schemas/trl.py).
-
-#### GRPO with DAPO/Dr. GRPO loss
-
-The DAPO paper and subsequently Dr. GRPO paper proposed an alternative loss function for GRPO to remediate the penalty in longer responses.
-
-```yaml
-trl:
-  loss_type: dr_grpo
-  # Normalizes loss based on max completion length (default: 256)
-  max_completion_length:
-```
-
-For more information, see [GRPO docs](https://huggingface.co/docs/trl/v0.17.0/en/grpo_trainer#loss-types).
+For a description of the configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/config/models/input/v0_4_1/trl.py).

 ### SimPO

--- a/docs/sequence_parallelism.qmd
+++ b/docs/sequence_parallelism.qmd
@@ -3,6 +3,8 @@ title: Sequence Parallelism
 description: Train with long sequences split across multiple GPUs.
 ---

+# Sequence Parallelism
+
 Sequence parallelism is a technique that splits sequences across multiple GPUs,
 allowing you to train with very long sequences that wouldn't fit on a single GPU. Each
 GPU processes a different portion of the sequence, and the results are aggregated
@@ -25,7 +27,7 @@ To enable sequence parallelism, add the following to your configuration file:
 sequence_parallel_degree: 4  # Split sequences across 4 GPUs
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 heads_k_stride: 1
-# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
+# Optional; one of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to
 # "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
 ring_attn_func:
 ```
@@ -41,7 +43,7 @@ When sequence parallelism is enabled:

 1. Each sequence is divided into equal chunks across the GPUs in a sequence parallel group
 2. The data collator handles the chunking of input_ids, attention_mask, labels, and position_ids
-3. Position IDs are adjusted to maintain proper relative positions
+3. Position IDs are adjusted to maintain proper relative positions, especially for packed sequences
 4. The trainer uses special ring communication patterns for attention operations

 ## Requirements
@@ -67,11 +69,9 @@ sequence_len: 8192
 ...

 sequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU
+flash_attention: true  # Required with sequence parallelism
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 heads_k_stride: 1
-# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
-# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
-ring_attn_func:

 ...
 ```
--- a/examples/gemma3/gemma-3-4b-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-qlora.yml
@@ -28,7 +28,7 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
--- a/examples/gemma3/gemma-3-4b-vision-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-vision-qlora.yml
@@ -30,7 +30,7 @@ pad_to_sequence_len: false
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
--- a/examples/glm4/qlora-32b.yaml
+++ b/examples/glm4/qlora-32b.yaml
@@ -1,62 +0,0 @@
-base_model: THUDM/GLM-4-32B-0414
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_4bit: true
-
-datasets:
-  - path: teknium/GPT4-LLM-Cleaned
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0
-output_dir: ./outputs/qlora-out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 2048
-sample_packing: true
-eval_sample_packing: true
-pad_to_sequence_len: true
-
-lora_r: 16
-lora_alpha: 32
-lora_dropout: 0.05
-lora_target_modules:
-  - gate_proj
-  - down_proj
-  - up_proj
-  - q_proj
-  - v_proj
-  - k_proj
-  - o_proj
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 2
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-loss_watchdog_threshold: 5.0
-loss_watchdog_patience: 3
-
-warmup_steps: 10
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
-special_tokens:
--- a/examples/llama-3-vision/lora-11b.yaml
+++ b/examples/llama-3-vision/lora-11b.yaml
@@ -29,7 +29,7 @@ pad_to_sequence_len: false
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
--- a/examples/llama-3/3b-qat-fsdp2.yaml
+++ b/examples/llama-3/3b-qat-fsdp2.yaml
@@ -1,79 +0,0 @@
-base_model: meta-llama/Llama-3.2-3B
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: false
-strict: false
-
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-
-liger_rope: true
-liger_rms_norm: true
-liger_glu_activation: true
-liger_layer_norm: true
-liger_fused_linear_cross_entropy: true
-
-datasets:
-  - path: yahma/alpaca-cleaned
-    type: alpaca
-
-output_dir: ./outputs/qat_out/
-
-sample_packing: true
-pad_to_sequence_len: true
-sequence_len: 512
-
-flex_attention: true
-flex_attn_compile_kwargs:
-  dynamic: false
-  mode: max-autotune-no-cudagraphs
-
-qat:
-  activation_dtype: int8
-  weight_dtype: int4
-  group_size: 32
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 1
-micro_batch_size: 16
-num_epochs: 1
-optimizer: adamw_torch_fused
-
-cosine_constant_lr_ratio: 0
-cosine_min_lr_ratio: 1.0
-learning_rate: 2e-5
-save_only_model: true
-bf16: true
-
-resume_from_checkpoint:
-logging_steps: 1
-
-evals_per_epoch: 1
-saves_per_epoch: 1
-
-warmup_steps: 10
-weight_decay: 0.0
-fsdp:
-  - full_shard
-  - auto_wrap
-
-fsdp_config:
-  fsdp_version: 2
-  fsdp_offload_params: false
-  fsdp_cpu_ram_efficient_loading: true
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
-  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_sharding_strategy: FULL_SHARD
-  fsdp_reshard_after_forward: true
-  fsdp_activation_checkpointing: true
-
-special_tokens:
-  pad_token: <|end_of_text|>
--- a/examples/llama-3/instruct-dpo-lora-8b.yml
+++ b/examples/llama-3/instruct-dpo-lora-8b.yml
@@ -5,10 +5,6 @@ tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name

-special_tokens:
-  pad_token: <|finetune_right_pad_id|>
-  eos_token: <|eot_id|>
-
 load_in_8bit: true
 load_in_4bit: false

--- a/examples/llama-3/lora-1b.yml
+++ b/examples/llama-3/lora-1b.yml
@@ -5,7 +5,7 @@ base_model: NousResearch/Llama-3.2-1B
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
-
+dataset_prepared_path: last_run_prepared
 val_set_size: 0.1
 output_dir: ./outputs/lora-out

@@ -38,7 +38,6 @@ wandb_log_model:
 gradient_accumulation_steps: 2
 micro_batch_size: 2
 num_epochs: 1
-
 optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
--- a/examples/llama-3/sparse-finetuning.yaml
+++ b/examples/llama-3/sparse-finetuning.yaml
@@ -1,77 +0,0 @@
-base_model: neuralmagic/Sparse-Llama-3.1-8B-2of4
-
-plugins:
-  - axolotl.integrations.llm_compressor.LLMCompressorPlugin
-
-load_in_8bit: false
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: tatsu-lab/alpaca
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.05
-output_dir: ./outputs/out
-
-sequence_len: 4096
-sample_packing: true
-pad_to_sequence_len: true
-eval_sample_packing: false
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 8
-micro_batch_size: 1
-num_epochs: 1
-optimizer: paged_adamw_8bit
-lr_scheduler: cosine
-learning_rate: 2e-5
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: false
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-early_stopping_patience:
-resume_from_checkpoint:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 100
-evals_per_epoch: 2
-eval_table_size:
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
-  pad_token: <|end_of_text|>
-
-llmcompressor:
-  recipe:
-    finetuning_stage:
-      finetuning_modifiers:
-        ConstantPruningModifier:
-          targets: [
-            're:.*q_proj.weight',
-            're:.*k_proj.weight',
-            're:.*v_proj.weight',
-            're:.*o_proj.weight',
-            're:.*gate_proj.weight',
-            're:.*up_proj.weight',
-            're:.*down_proj.weight',
-          ]
-          start: 0
-  save_compressed: true
--- a/examples/llama-4/README.md
+++ b/examples/llama-4/README.md
@@ -26,13 +26,3 @@ Multi-GPU (4xH100) for Llama 4 Scout uses 62.8GB VRAM/GPU @ 4k contenxt length @
 ### Llama 4 Maverick 17Bx128Experts (400B)

 Coming Soon
-
-## Delinearized Llama 4 Models
-
-We provide a script to delinearize Llama 4 linearized models into regular HuggingFace Llama 4 models.
-
-```bash
-axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir
-```
-
-Note: This only works with the non-quantized linearized model. If you have an adapter, merge it with the *non-quantized linearized* model before delinearizing.
--- a/examples/llama-4/scout-qlora-single-h100-flex.yaml
+++ b/examples/llama-4/scout-qlora-single-h100-flex.yaml
@@ -10,6 +10,7 @@ plugins:
 liger_glu_activation: true
 liger_rms_norm: true
 liger_layer_norm: true
+cut_cross_entropy: true

 llama4_linearized_experts: true  # needed with custom linearized experts model
 load_in_4bit: true
--- a/examples/llava/lora-7b.yaml
+++ b/examples/llava/lora-7b.yaml
@@ -25,7 +25,7 @@ pad_to_sequence_len: false
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
--- a/examples/magistral/README.md
+++ b/examples/magistral/README.md
@@ -1,71 +0,0 @@
-# Finetune Magistral Small with Axolotl
-
-Magistral Small is a 24B parameter opensource model from MistralAI found on [HuggingFace](https://huggingface.co/mistralai/Magistral-Small-2506). This guide shows how to fine-tune it with Axolotl with multi-turn conversations with proper masking.
-
-MistralAI has also released a proprietary medium-sized version called Magistral Medium.
-
-Thanks to the team at MistralAI for giving us early access to prepare for this release.
-
-## Getting started
-
-1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as Magistral is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
-
-    Here is an example of how to install from main for pip:
-
-```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 recommended)
-git clone https://github.com/axolotl-ai-cloud/axolotl.git
-cd axolotl
-
-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn,mistral]'
-```
-
-2. Download the example config:
-
-```bash
-axolotl fetch examples
-```
-
-3. Run the finetuning example:
-
-```bash
-axolotl train examples/magistral/magistral-small-qlora.yaml
-```
-
-This config uses about 24GB VRAM.
-
-Let us know how it goes. Happy finetuning! 🚀
-
-### TIPS
-
- For inference, the official MistralAI team recommends `top_p: 0.95` and `temperature: 0.7` with `max_tokens: 40960`.
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format is the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
-
-## Optimization Guides
-
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
-
-## Limitations
-
-We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only.
-
-The tokenizer does not work with `dataset.map` with multiprocessing, so we had to disable it. In addition, we do not support overriding tokens yet.
-
-## Related Resources
-
- [MistralAI Magistral Blog](https://mistral.ai/news/magistral/)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
-
-
-## Future Work
-
- Add parity to Preference Tuning, RL, Multi-modal, etc.
- Add parity to other tokenizer configs like overriding tokens.
--- a/examples/magistral/magistral-small-fsdp-qlora.yaml
+++ b/examples/magistral/magistral-small-fsdp-qlora.yaml
@@ -1,72 +0,0 @@
-base_model: mistralai/Magistral-Small-2506
-
-# Enable to use mistral-common tokenizer
-tokenizer_use_mistral_common: true
-
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: true
-
-datasets:
-  - path: fozziethebeat/alpaca_messages_2k_test
-    type: chat_template
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.1
-output_dir: ./outputs/lora-out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 2048
-sample_packing: true
-eval_sample_packing: false
-pad_to_sequence_len: true
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_target_modules:
-  - gate_proj
-  - down_proj
-  - up_proj
-  - q_proj
-  - v_proj
-  - k_proj
-  - o_proj
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_torch_fused
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing:
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
-
-fsdp:
-  - full_shard
-  - auto_wrap
-fsdp_config:
-  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer
-  fsdp_activation_checkpointing: true
--- a/examples/magistral/magistral-small-qlora.yaml
+++ b/examples/magistral/magistral-small-qlora.yaml
@@ -1,63 +0,0 @@
-base_model: mistralai/Magistral-Small-2506
-
-# Enable to use mistral-common tokenizer
-tokenizer_use_mistral_common: true
-
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: true
-
-datasets:
-  - path: fozziethebeat/alpaca_messages_2k_test
-    type: chat_template
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.1
-output_dir: ./outputs/lora-out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 2048
-sample_packing: true
-pad_to_sequence_len: true
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_target_modules:
-  - gate_proj
-  - down_proj
-  - up_proj
-  - q_proj
-  - v_proj
-  - k_proj
-  - o_proj
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
--- a/examples/mistral/mistral-small-3.1-24B-lora.yml
+++ b/examples/mistral/mistral-small-3.1-24B-lora.yml
@@ -27,7 +27,7 @@ pad_to_sequence_len: false
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
--- a/examples/orpheus/README.md
+++ b/examples/orpheus/README.md
@@ -1,341 +0,0 @@
-# Finetuning LLMs to output audio
-
-In this example, we finetune Orpcanopylabs/orpheus-tts-0.1-pretrained (a LLaMA 3.2 3b model) to output audio.
-
-The `finetune.yml` withe current settings will run on any Nvidia GPU with 45GB VRAM or more. If you adjust the batch size it can easily run on any GPU under 24GB.
-
-## Dataset pre-processing for pre-training
-If you are adding another voice in English, please jump ahead to finetuning pre-processing.
-
-For this to work, we need to preprocess our dataset. Since we are expecting to output audio, we will need to add tokens to the tokenizer.
-
-Using this code, it will download the SNAC model and add the correct tokens and upload the final dataset.
-
-```python
-import torch
-from snac import SNAC
-from datasets import load_dataset
-from huggingface_hub import snapshot_download
-from datasets import load_dataset
-import random
-import torchaudio.transforms as T
-from transformers import AutoTokenizer
-import os
-
-my_original_dataset_name = "<huggingface-id-of-dataset-that-we-want-to-preprocess>"
-name_to_push_dataset_to = "<huggingface-id-of-where-to-save-dataset>"
-
-dsn = my_original_dataset_name
-
-snapshot_download(
-    repo_id=dsn,
-    repo_type="dataset",
-    revision="main",
-    max_workers=64,
-)
-
-
-ds = load_dataset(dsn, split="train")
-ds_sample_rate = ds[0]["audio"]["sampling_rate"]
-
-model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
-model = model.to("mps")
-
-def tokenise_audio(waveform):
-  waveform = torch.from_numpy(waveform).unsqueeze(0)
-  waveform = waveform.to(dtype=torch.float32)
-  resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
-  waveform = resample_transform(waveform)
-
-  waveform = waveform.unsqueeze(0).to("cuda")
-
-  #generate the codes from snac
-  with torch.inference_mode():
-    codes = model.encode(waveform)
-
-  all_codes = []
-  for i in range(codes[0].shape[1]):
-    all_codes.append(codes[0][0][i].item()+128266)
-    all_codes.append(codes[1][0][2*i].item()+128266+4096)
-    all_codes.append(codes[2][0][4*i].item()+128266+(2*4096))
-    all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096))
-    all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096))
-    all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096))
-    all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096))
-
-
-  return all_codes
-
-def add_codes(example):
-    # Always initialize codes_list to None
-    codes_list = None
-
-    try:
-        answer_audio = example.get("audio")
-        # If there's a valid audio array, tokenise it
-        if answer_audio and "array" in answer_audio:
-            audio_array = answer_audio["array"]
-            codes_list = tokenise_audio(audio_array)
-    except Exception as e:
-        print(f"Skipping row due to error: {e}")
-        # Keep codes_list as None if we fail
-    example["codes_list"] = codes_list
-
-    return example
-
-ds = ds.map(add_codes, remove_columns=["audio"])
-
-#@title Load Tokenizer
-tokeniser_length = 128256
-start_of_text = 128000
-end_of_text = 128009
-
-start_of_speech = tokeniser_length + 1
-end_of_speech = tokeniser_length + 2
-
-start_of_human = tokeniser_length + 3
-end_of_human = tokeniser_length + 4
-
-start_of_ai = tokeniser_length + 5
-end_of_ai =  tokeniser_length + 6
-pad_token = tokeniser_length + 7
-
-audio_tokens_start = tokeniser_length + 10
-
-tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"
-
-
-tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-num_proc = os.cpu_count() - 2
-
-ds = ds.filter(lambda x: x["codes_list"] is not None)
-ds = ds.filter(lambda x: len(x["codes_list"]) > 0)
-
-#@title Create Input Ids
-def remove_duplicate_frames(example):
-    vals = example["codes_list"]
-    if len(vals) % 7 != 0:
-        raise ValueError("Input list length must be divisible by 7")
-
-    result = vals[:7]
-
-    removed_frames = 0
-
-    for i in range(7, len(vals), 7):
-        current_first = vals[i]
-        previous_first = result[-7]
-
-        if current_first != previous_first:
-            result.extend(vals[i:i+7])
-        else:
-            removed_frames += 1
-
-    example["codes_list"] = result
-
-    return example
-
-ds = ds.map(remove_duplicate_frames, num_proc=num_proc)
-
-
-def create_input_ids(example):
-    text_ids = tokenizer.encode({example['text']},  add_special_tokens=True)
-    text_ids.append(end_of_text)
-    example["text_tokens"] = text_ids
-    input_ids = (
-        [start_of_human]
-        + example["text_tokens"]
-        + [end_of_human]
-        + [start_of_ai]
-        + [start_of_speech]
-        + example["codes_list"]
-        + [end_of_speech]
-        + [end_of_ai]
-    )
-    example["input_ids"] = input_ids
-    example["labels"] = input_ids
-    example["attention_mask"] = [1] * len(input_ids)
-
-    return example
-
-ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])
-
-#@title Remove unnecessary columns
-columns_to_keep = ["input_ids", "labels", "attention_mask"]
-columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]
-
-ds = ds.remove_columns(columns_to_remove)
-
-ds.push_to_hub(name_to_push_dataset_to)
-```
-
-
-## Finetune pre-processing
-Use this code to add a new voice.
-
-```python
-import torch
-from snac import SNAC
-from datasets import load_dataset
-from huggingface_hub import snapshot_download
-from datasets import load_dataset
-import random
-import torchaudio.transforms as T
-from transformers import AutoTokenizer
-import os
-
-my_original_dataset_name = "<huggingface-id-of-dataset-that-we-want-to-preprocess>"
-name_to_push_dataset_to = "<huggingface-id-of-where-to-save-dataset>"
-
-dsn = my_original_dataset_name
-
-snapshot_download(
-    repo_id=dsn,
-    repo_type="dataset",
-    revision="main",
-    max_workers=64,
-)
-
-
-ds = load_dataset(dsn, split="train")
-ds_sample_rate = ds[0]["audio"]["sampling_rate"]
-
-model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
-model = model.to("mps")
-
-def tokenise_audio(waveform):
-  waveform = torch.from_numpy(waveform).unsqueeze(0)
-  waveform = waveform.to(dtype=torch.float32)
-  resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
-  waveform = resample_transform(waveform)
-
-  waveform = waveform.unsqueeze(0).to("cuda")
-
-  #generate the codes from snac
-  with torch.inference_mode():
-    codes = model.encode(waveform)
-
-  all_codes = []
-  for i in range(codes[0].shape[1]):
-    all_codes.append(codes[0][0][i].item()+128266)
-    all_codes.append(codes[1][0][2*i].item()+128266+4096)
-    all_codes.append(codes[2][0][4*i].item()+128266+(2*4096))
-    all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096))
-    all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096))
-    all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096))
-    all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096))
-
-
-  return all_codes
-
-def add_codes(example):
-    # Always initialize codes_list to None
-    codes_list = None
-
-    try:
-        answer_audio = example.get("audio")
-        # If there's a valid audio array, tokenise it
-        if answer_audio and "array" in answer_audio:
-            audio_array = answer_audio["array"]
-            codes_list = tokenise_audio(audio_array)
-    except Exception as e:
-        print(f"Skipping row due to error: {e}")
-        # Keep codes_list as None if we fail
-    example["codes_list"] = codes_list
-
-    return example
-
-ds = ds.map(add_codes, remove_columns=["audio"])
-
-#@title Load Tokenizer
-tokeniser_length = 128256
-start_of_text = 128000
-end_of_text = 128009
-
-start_of_speech = tokeniser_length + 1
-end_of_speech = tokeniser_length + 2
-
-start_of_human = tokeniser_length + 3
-end_of_human = tokeniser_length + 4
-
-start_of_ai = tokeniser_length + 5
-end_of_ai =  tokeniser_length + 6
-pad_token = tokeniser_length + 7
-
-audio_tokens_start = tokeniser_length + 10
-
-tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"
-
-
-tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-num_proc = os.cpu_count() - 2
-
-ds = ds.filter(lambda x: x["codes_list"] is not None)
-ds = ds.filter(lambda x: len(x["codes_list"]) > 0)
-
-#@title Create Input Ids
-def remove_duplicate_frames(example):
-    vals = example["codes_list"]
-    if len(vals) % 7 != 0:
-        raise ValueError("Input list length must be divisible by 7")
-
-    result = vals[:7]
-
-    removed_frames = 0
-
-    for i in range(7, len(vals), 7):
-        current_first = vals[i]
-        previous_first = result[-7]
-
-        if current_first != previous_first:
-            result.extend(vals[i:i+7])
-        else:
-            removed_frames += 1
-
-    example["codes_list"] = result
-
-    return example
-
-ds = ds.map(remove_duplicate_frames, num_proc=num_proc)
-
-tok_info = '''*** HERE you can modify the text prompt
-i.e. if you wanted a multispeaker model like canopylabs/orpheus-3b-0.1-ft, you can pass:
-f"{example["source"]}:  {example["text"]}", as is passed.
-'''
-print(tok_info)
-
-def create_input_ids(example):
-    text_ids = tokenizer.encode(f"{example['speaker_id']}: {example['text']}",  add_special_tokens=True)
-    text_ids.append(end_of_text)
-    example["text_tokens"] = text_ids
-    input_ids = (
-        [start_of_human]
-        + example["text_tokens"]
-        + [end_of_human]
-        + [start_of_ai]
-        + [start_of_speech]
-        + example["codes_list"]
-        + [end_of_speech]
-        + [end_of_ai]
-    )
-    example["input_ids"] = input_ids
-    example["labels"] = input_ids
-    example["attention_mask"] = [1] * len(input_ids)
-
-    return example
-
-ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])
-
-#@title Remove unnecessary columns
-columns_to_keep = ["input_ids", "labels", "attention_mask"]
-columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]
-
-ds = ds.remove_columns(columns_to_remove)
-
-ds.push_to_hub(name_to_push_dataset_to)
-```
-
-## Training
-After preprocessing is done, fill out the blanks in finetune.yml and simply run `axolotl train finetune.yml`
-
-## Inference
-For inference, please refer to the original [orpheus github](https://github.com/canopyai/Orpheus-TTS/tree/main).
--- a/examples/orpheus/finetune.yml
+++ b/examples/orpheus/finetune.yml
@@ -1,52 +0,0 @@
-base_model: canopylabs/orpheus-3b-0.1-pretrained
-
-hub_model_id: <your-hub-model-id>
-
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-liger_rope: true
-liger_rms_norm: true
-liger_glu_activation: true
-liger_fused_linear_cross_entropy: true
-
-datasets:
-  - path: <your-hf-dataset-id>
-    type:  # leave empty to load pre-tokenized
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-output_dir: ./outputs/out
-
-sequence_len: 8192
-sample_packing: true
-pad_to_sequence_len: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 8
-micro_batch_size: 4
-num_epochs: 3
-optimizer: adamw_torch_fused
-lr_scheduler: cosine
-learning_rate: 2e-5
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_steps: 20
-evals_per_epoch: 5
-saves_per_epoch: 5
-weight_decay: 0.05
-
-special_tokens:
-  pad_token: <custom_token_7>
--- a/examples/pixtral/lora-12b.yml
+++ b/examples/pixtral/lora-12b.yml
@@ -25,7 +25,7 @@ pad_to_sequence_len: false
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
--- a/examples/qwen2-vl/lora-7b.yaml
+++ b/examples/qwen2-vl/lora-7b.yaml
@@ -25,7 +25,7 @@ pad_to_sequence_len: false
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
--- a/examples/qwen2/dpo.yaml
+++ b/examples/qwen2/dpo.yaml
@@ -2,6 +2,7 @@ base_model: Qwen/Qwen2.5-0.5B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name

+
 chat_template: qwen_25
 rl: dpo
 datasets:
--- a/examples/qwen3/32b-qlora.yaml
+++ b/examples/qwen3/32b-qlora.yaml
@@ -1,69 +0,0 @@
-base_model: Qwen/Qwen3-32B
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-strict: false
-
-chat_template: qwen3
-datasets:
-  - path: mlabonne/FineTome-100k
-    type: chat_template
-    split: train[:20%]
-    field_messages: conversations
-    message_property_mappings:
-      role: from
-      content: value
-val_set_size: 0.0
-output_dir: ./outputs/out
-dataset_prepared_path: last_run_prepared
-
-sequence_len: 2048
-sample_packing: true
-eval_sample_packing: true
-pad_to_sequence_len: true
-
-load_in_4bit: true
-adapter: qlora
-lora_r: 16
-lora_alpha: 32
-lora_target_modules:
-  - q_proj
-  - k_proj
-  - v_proj
-  - o_proj
-  - down_proj
-  - up_proj
-lora_mlp_kernel: true
-lora_qkv_kernel: true
-lora_o_kernel: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 2
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_torch_4bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: true
-
-gradient_checkpointing: offload
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_steps: 10
-evals_per_epoch: 4
-saves_per_epoch: 1
-weight_decay: 0.0
-special_tokens:
--- a/examples/qwen3/8b-qat-fsdp2.yml
+++ b/examples/qwen3/8b-qat-fsdp2.yml
@@ -1,78 +0,0 @@
-base_model: Qwen/Qwen3-8B
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: false
-strict: false
-
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-
-liger_rope: true
-liger_rms_norm: true
-liger_glu_activation: true
-liger_layer_norm: true
-liger_fused_linear_cross_entropy: true
-
-datasets:
-  - path: tatsu-lab/alpaca
-    type: alpaca
-
-output_dir: ./outputs/qat_out/
-
-sequence_len: 2048
-sample_packing: true
-flex_attention: true
-pad_to_sequence_len: true
-
-flex_attn_compile_kwargs:
-  dynamic: false
-  mode: max-autotune-no-cudagraphs
-
-qat:
-  activation_dtype: int8
-  weight_dtype: int4
-  group_size: 256
-  fake_quant_after_n_steps: 1000
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 1
-micro_batch_size: 2
-max_steps: 2000
-optimizer: adamw_torch_fused
-lr_scheduler: cosine
-learning_rate: 2e-5
-
-bf16: true
-tf32: true
-
-resume_from_checkpoint:
-logging_steps: 1
-
-evals_per_epoch: 1
-saves_per_epoch: 1
-
-warmup_steps: 10
-weight_decay: 0.0
-fsdp:
-  - full_shard
-  - auto_wrap
-
-fsdp_config:
-  fsdp_version: 2
-  fsdp_offload_params: false
-  fsdp_cpu_ram_efficient_loading: true
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer
-  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_sharding_strategy: FULL_SHARD
-  fsdp_reshard_after_forward: true
-  fsdp_activation_checkpointing: true
-
-special_tokens:
--- a/examples/qwen3/qlora-fsdp.yaml
+++ b/examples/qwen3/qlora-fsdp.yaml
@@ -1,68 +0,0 @@
-base_model: Qwen/Qwen3-8B
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: true
-strict: false
-
-datasets:
-  - path: tatsu-lab/alpaca
-    type: alpaca
-dataset_prepared_path:
-val_set_size: 0.05
-output_dir: ./outputs/out
-
-sequence_len: 2048
-sample_packing: true
-eval_sample_packing: true
-pad_to_sequence_len: true
-
-adapter: qlora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 64
-lora_dropout: 0.05
-lora_target_linear: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_torch_fused
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: true
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_steps: 10
-evals_per_epoch: 4
-saves_per_epoch: 1
-weight_decay: 0.0
-fsdp:
-  - full_shard
-  - auto_wrap
-fsdp_config:
-  fsdp_limit_all_gathers: true
-  fsdp_sync_module_states: true
-  fsdp_offload_params: true
-  fsdp_use_orig_params: false
-  fsdp_cpu_ram_efficient_loading: true
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer
-  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_sharding_strategy: FULL_SHARD
-special_tokens:
--- a/requirements-tests.txt
+++ b/requirements-tests.txt
@@ -1,5 +1,4 @@
 codecov
-codecov-cli
 pytest
 pytest-cov
 pytest-retry
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,23 +6,23 @@ triton>=3.0.0
 mamba-ssm==1.2.0.post1
 xformers>=0.0.23.post1
 autoawq==0.2.7.post3
-liger-kernel==0.5.10
+liger-kernel==0.5.8
 # END section

 packaging==23.2

-huggingface_hub==0.32.2
-peft==0.15.2
-transformers==4.52.3
+peft==0.15.1
+transformers==4.51.3
 tokenizers>=0.21.1
-accelerate==1.7.0
-datasets==3.6.0
-deepspeed>=0.17.0
-trl==0.18.1
-hf_xet==1.1.2
+accelerate==1.6.0
+datasets==3.5.0
+deepspeed>=0.15.4
+trl==0.16.1
+hf_xet==1.0.0

 optimum==1.16.2
 hf_transfer
+hqq==0.2.5
 sentencepiece
 gradio==5.23.3

@@ -62,10 +62,8 @@ langdetect==1.0.9
 immutabledict==4.2.0
 antlr4-python3-runtime==4.13.2

-torchao==0.10.0
+torchao==0.9.0
 schedulefree==1.4.1

 axolotl-contribs-lgpl==0.0.6
 axolotl-contribs-mit==0.0.3
-
-mistral-common==1.6.0
--- a/scripts/cutcrossentropy_install.py
+++ b/scripts/cutcrossentropy_install.py
@@ -9,8 +9,6 @@ except ImportError as exc:
    raise ImportError("Install torch via `pip install torch`") from exc
 from packaging.version import Version as V

-USE_UV = "--uv" in sys.argv[1:]
-
 v = V(torch.__version__)

 # no cut-cross-entropy support for torch < 2.4.0
@@ -25,9 +23,7 @@ if cce_spec:
    if not importlib.util.find_spec("cut_cross_entropy.transformers"):
        UNINSTALL_PREFIX = "pip uninstall -y cut-cross-entropy && "

-UV_PREFIX = "uv " if USE_UV else ""
-
 print(
    UNINSTALL_PREFIX
-    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@a1174ca"'
+    + 'pip install "cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@bad6f7b49c75fdec69471abb71b4cddd0f0c6438"'
 )
--- a/scripts/motd
+++ b/scripts/motd
@@ -11,7 +11,7 @@
                                 =@#       @#  #@=     #@   =#@@@@#=    +#@@=  +#@@@@#=    .##@@+   @@
    @@@@  @@@@@@@@@@@@@@@@

-Welcome to the axolotl cloud image! If the you've mounted a disk to /workspace and the axolotl directory is empty, run the following commands:
+Welcome to the axolotl cloud image! If you've mounted a disk to /workspace and the axolotl directory is empty, run the following commands:

 ```
 cd /workspace
--- a/scripts/unsloth_install.py
+++ b/scripts/unsloth_install.py
@@ -1,15 +1,11 @@
 # noqa
 # pylint: skip-file
-import sys
-
 try:
    import torch
 except ImportError:
    raise ImportError("Install torch via `pip install torch`")
 from packaging.version import Version as V

-use_uv = "--uv" in sys.argv[1:]
-
 v = V(torch.__version__)
 cuda = str(torch.version.cuda)
 try:
@@ -35,7 +31,6 @@ elif v < V("2.6.0"):
 else:
    raise RuntimeError(f"Torch = {v} too new!")
 x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
-uv_prefix = "uv " if use_uv else ""
 print(
-    f'{uv_prefix}pip install unsloth-zoo==2024.12.1 && {uv_prefix}pip install --no-deps "unsloth[{x}]==2024.12.4"'
+    f'pip install unsloth-zoo==2024.12.1 && pip install --no-deps "unsloth[{x}]==2024.12.4"'
 )
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,7 @@ def parse_requirements(extras_require_map):
            try:
                torch_version = version("torch")
            except PackageNotFoundError:
-                torch_version = "2.6.0"  # default to torch 2.6
+                torch_version = "2.5.1"
            _install_requires.append(f"torch=={torch_version}")

            version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version)
@@ -64,16 +64,10 @@ def parse_requirements(extras_require_map):
            else:
                raise ValueError("Invalid version format")

-            if (major, minor) >= (2, 7):
+            if (major, minor) >= (2, 6):
                _install_requires.pop(_install_requires.index(xformers_version))
-                # _install_requires.append("xformers==0.0.29.post3")  # xformers seems to be hard pinned to 2.6.0
-                extras_require_map["vllm"] = ["vllm==0.8.5.post1"]
-            elif (major, minor) >= (2, 6):
-                _install_requires.pop(_install_requires.index(xformers_version))
-                _install_requires.append(
-                    "xformers==0.0.29.post2"
-                )  # vllm needs post2 w torch 2.6
-                extras_require_map["vllm"] = ["vllm==0.8.5.post1"]
+                _install_requires.append("xformers==0.0.29.post2")
+                extras_require_map["vllm"] = ["vllm==0.8.3"]
            elif (major, minor) >= (2, 5):
                _install_requires.pop(_install_requires.index(xformers_version))
                if patch == 0:
@@ -118,7 +112,7 @@ extras_require = {
        "yunchang==0.6.0",
    ],
    "deepspeed": [
-        "deepspeed==0.17.0",
+        "deepspeed==0.15.4",
        "deepspeed-kernels",
    ],
    "mamba-ssm": [
@@ -142,7 +136,6 @@ extras_require = {
        "apollo-torch",
        "lomo-optim==0.1.1",
        "torch-optimi==0.2.1",
-        "came_pytorch==0.1.3",
    ],
    "ray": [
        "ray[train]",
@@ -150,9 +143,6 @@ extras_require = {
    "vllm": [
        "vllm==0.7.2",
    ],
-    "llmcompressor": [
-        "llmcompressor==0.5.1",
-    ],
 }

 install_requires, dependency_links, extras_require_build = parse_requirements(
--- a/src/axolotl/init.py
+++ b/src/axolotl/init.py
@@ -4,4 +4,4 @@ import pkgutil

 __path__ = pkgutil.extend_path(__path__, __name__)  # Make this a namespace package

-__version__ = "0.10.0"
+__version__ = "0.8.0"
--- a/src/axolotl/cli/init.py
+++ b/src/axolotl/cli/init.py
@@ -2,7 +2,4 @@

 import os

-from axolotl.logging_config import configure_logging
-
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-configure_logging()
--- a/src/axolotl/cli/args.py
+++ b/src/axolotl/cli/args.py
@@ -28,6 +28,7 @@ class TrainerCliArgs:
    debug: bool = field(default=False)
    debug_text_only: bool = field(default=False)
    debug_num_examples: int = field(default=0)
+    merge_lora: bool = field(default=False)
    prompter: Optional[str] = field(default=None)
    shard: bool = field(default=False)
    main_process_port: Optional[int] = field(default=None)
@@ -38,16 +39,16 @@ class TrainerCliArgs:
 class VllmServeCliArgs:
    """Dataclass with CLI arguments for `axolotl vllm-serve` command."""

-    tensor_parallel_size: Optional[int] = field(
-        default=None,
+    tensor_parallel_size: int = field(
+        default=1,
        metadata={"help": "Number of tensor parallel workers to use."},
    )
-    host: Optional[str] = field(
-        default=None,  # nosec B104
+    host: str = field(
+        default="0.0.0.0",  # nosec B104
        metadata={"help": "Host address to run the server on."},
    )
-    port: Optional[int] = field(
-        default=None,
+    port: int = field(
+        default=8000,
        metadata={"help": "Port to run the server on."},
    )
    gpu_memory_utilization: Optional[float] = field(
@@ -81,32 +82,6 @@ class VllmServeCliArgs:
            "hardware support this feature."
        },
    )
-    serve_module: Optional[str] = field(
-        default=None,
-        metadata={
-            "help": "Module to serve. If not set, the default module will be used."
-        },
-    )
-
-    enable_reasoning: Optional[bool] = field(
-        default=None,
-    )
-
-    reasoning_parser: Optional[str] = field(
-        default=None,
-    )
-
-
-@dataclass
-class QuantizeCliArgs:
-    """Dataclass with CLI arguments for `axolotl quantize` command."""
-
-    base_model: Optional[str] = field(default=None)
-    weight_dtype: Optional[str] = field(default=None)
-    activation_dtype: Optional[str] = field(default=None)
-    quantize_embedding: Optional[bool] = field(default=None)
-    group_size: Optional[int] = field(default=None)
-    output_dir: Optional[str] = field(default=None)


@dataclass
--- a/src/axolotl/cli/art.py
+++ b/src/axolotl/cli/art.py
@@ -16,15 +16,8 @@ AXOLOTL_LOGO = """
    @@@@  @@@@@@@@@@@@@@@@
 """

-HAS_PRINTED_LOGO = False
-

 def print_axolotl_text_art():
    """Prints axolotl ASCII art."""
-
-    global HAS_PRINTED_LOGO  # pylint: disable=global-statement
-    if HAS_PRINTED_LOGO:
-        return
    if is_main_process():
-        HAS_PRINTED_LOGO = True
        print(AXOLOTL_LOGO)
--- a/src/axolotl/cli/checks.py
+++ b/src/axolotl/cli/checks.py
@@ -1,5 +1,6 @@
 """Various checks for Axolotl CLI."""

+import logging
 import os
 from pathlib import Path

@@ -7,9 +8,10 @@ from accelerate.commands.config import config_args
 from huggingface_hub import HfApi
 from huggingface_hub.utils import LocalTokenNotFoundError

-from axolotl.utils.logging import get_logger
+from axolotl.logging_config import configure_logging

-LOG = get_logger(__name__)
+configure_logging()
+LOG = logging.getLogger(__name__)


 def check_accelerate_default_config() -> None:
--- a/src/axolotl/cli/cloud/modal_.py
+++ b/src/axolotl/cli/cloud/modal_.py
@@ -82,7 +82,7 @@ class ModalCloud(Cloud):
        return res

    def get_image(self):
-        docker_tag = "main-py3.11-cu124-2.6.0"
+        docker_tag = "main-py3.11-cu124-2.5.1"
        if self.config.docker_tag:
            docker_tag = self.config.docker_tag
        docker_image = f"axolotlai/axolotl:{docker_tag}"
--- a/src/axolotl/cli/config.py
+++ b/src/axolotl/cli/config.py
@@ -1,10 +1,10 @@
 """Configuration loading and processing."""

 import json
+import logging
 import os
 import tempfile
 from pathlib import Path
-from tempfile import NamedTemporaryFile
 from typing import Union
 from urllib.parse import urlparse

@@ -21,12 +21,11 @@ from axolotl.utils.config import (
    validate_config,
 )
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.logging import get_logger
 from axolotl.utils.mlflow_ import setup_mlflow_env_vars
 from axolotl.utils.trainer import prepare_opinionated_env, prepare_optim_env
 from axolotl.utils.wandb_ import setup_wandb_env_vars

-LOG = get_logger(__name__, use_environ=True)
+LOG = logging.getLogger(__name__)


 def check_remote_config(config: Union[str, Path]) -> Union[str, Path]:
@@ -119,12 +118,12 @@ def choose_config(path: Path) -> str:
        )

    if len(yaml_files) == 1:
-        LOG.info(f"Using default YAML file '{yaml_files[0]}'")
+        print(f"Using default YAML file '{yaml_files[0]}'")
        return str(yaml_files[0])

-    LOG.info("Choose a YAML file:")
+    print("Choose a YAML file:")
    for idx, file in enumerate(yaml_files):
-        LOG.info(f"{idx + 1}. {file}")
+        print(f"{idx + 1}. {file}")

    chosen_file = None
    while chosen_file is None:
@@ -133,9 +132,9 @@ def choose_config(path: Path) -> str:
            if 1 <= choice <= len(yaml_files):
                chosen_file = str(yaml_files[choice - 1])
            else:
-                LOG.info("Invalid choice. Please choose a number from the list.")
+                print("Invalid choice. Please choose a number from the list.")
        except ValueError:
-            LOG.info("Invalid input. Please enter a number.")
+            print("Invalid input. Please enter a number.")

    return chosen_file

@@ -153,15 +152,7 @@ def prepare_plugins(cfg: DictDefault):
            plugin_manager.register(plugin_name)


-def plugin_set_cfg(cfg: DictDefault):
-    if cfg.get("plugins"):
-        plugin_manager = PluginManager.get_instance()
-        plugin_manager.cfg = cfg
-
-
-def load_cfg(
-    config: str | Path | DictDefault = Path("examples/"), **kwargs
-) -> DictDefault:
+def load_cfg(config: Union[str, Path] = Path("examples/"), **kwargs) -> DictDefault:
    """
    Loads the `axolotl` configuration stored at `config`, validates it, and performs
    various setup.
@@ -173,24 +164,13 @@ def load_cfg(
    Returns:
        `DictDefault` mapping configuration keys to values.
    """
-    if isinstance(config, (str, Path)):
-        config = check_remote_config(config)
-        if Path(config).is_dir():
-            config = choose_config(Path(config))
+    config = check_remote_config(config)
+    if Path(config).is_dir():
+        config = choose_config(Path(config))

-        # Load the config from the yaml file
-        with open(config, encoding="utf-8") as file:
-            cfg: DictDefault = DictDefault(yaml.safe_load(file))
-
-        cfg.axolotl_config_path = config
-    else:
-        cfg = config
-        with NamedTemporaryFile(
-            mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
-        ) as temp_file:
-            temp_file.write(yaml.dump(config.to_dict()))
-            temp_file.close()
-        cfg.axolotl_config_path = temp_file.name
+    # Load the config from the yaml file
+    with open(config, encoding="utf-8") as file:
+        cfg: DictDefault = DictDefault(yaml.safe_load(file))

    # If there are any options passed in the cli, if it is something that seems valid
    # from the yaml, then overwrite the value
@@ -204,6 +184,8 @@ def load_cfg(
            else:
                cfg[k] = kwargs[k]

+    cfg.axolotl_config_path = config
+
    try:
        device_props = torch.cuda.get_device_properties("cuda")
        gpu_version = "sm_" + str(device_props.major) + str(device_props.minor)
@@ -231,6 +213,5 @@ def load_cfg(
    setup_wandb_env_vars(cfg)
    setup_mlflow_env_vars(cfg)
    setup_comet_env_vars(cfg)
-    plugin_set_cfg(cfg)

    return cfg
--- a/src/axolotl/cli/evaluate.py
+++ b/src/axolotl/cli/evaluate.py
@@ -1,6 +1,6 @@
 """CLI to run evaluation on a model."""

-import os
+import logging
 from pathlib import Path
 from typing import Union

@@ -14,11 +14,9 @@ from axolotl.cli.checks import check_accelerate_default_config, check_user_token
 from axolotl.cli.config import load_cfg
 from axolotl.common.datasets import load_datasets, load_preference_datasets
 from axolotl.evaluate import evaluate
-from axolotl.utils import patch_optimized_env
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.logging import get_logger

-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)


 def do_evaluate(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
@@ -31,14 +29,10 @@ def do_evaluate(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: CLI arguments.
    """
-    # Enable expandable segments for cuda allocation to improve VRAM usage
-    patch_optimized_env()
-
    # pylint: disable=duplicate-code
    print_axolotl_text_art()
    check_accelerate_default_config()
-    if int(os.getenv("LOCAL_RANK", "0")) == 0:
-        check_user_token()
+    check_user_token()

    if cfg.rl:
        dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)
--- a/src/axolotl/cli/inference.py
+++ b/src/axolotl/cli/inference.py
@@ -1,6 +1,7 @@
 """CLI to run inference on a trained model."""

 import importlib
+import logging
 import sys
 from pathlib import Path
 from threading import Thread
@@ -21,9 +22,8 @@ from axolotl.utils.chat_templates import (
    get_chat_template_from_config,
 )
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.logging import get_logger

-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)


 def get_multi_line_input() -> str:
--- a/src/axolotl/cli/main.py
+++ b/src/axolotl/cli/main.py
@@ -2,6 +2,7 @@

 # pylint: disable=redefined-outer-name

+import logging
 import os
 import subprocess  # nosec B404
 import tempfile
@@ -16,7 +17,6 @@ import axolotl
 from axolotl.cli.args import (
    EvaluateCliArgs,
    PreprocessCliArgs,
-    QuantizeCliArgs,
    TrainerCliArgs,
    VllmServeCliArgs,
 )
@@ -28,13 +28,11 @@ from axolotl.cli.utils import (
    fetch_from_github,
    filter_none_kwargs,
 )
+from axolotl.cli.vllm_serve import do_vllm_serve
 from axolotl.integrations.lm_eval.cli import lm_eval
-from axolotl.utils import patch_optimized_env
-from axolotl.utils.logging import get_logger
+from axolotl.utils import set_pytorch_cuda_alloc_conf
 from axolotl.utils.schemas.config import AxolotlInputConfig

-LOG = get_logger(__name__)
-

@click.group()
@click.version_option(version=axolotl.__version__, prog_name="axolotl")
@@ -58,8 +56,6 @@ def preprocess(config: str, cloud: Optional[str] = None, **kwargs) -> None:
        kwargs: Additional keyword arguments which correspond to CLI args or `axolotl`
            config options.
    """
-    patch_optimized_env()
-
    if cloud:
        from axolotl.cli.cloud import do_cli_preprocess

@@ -105,7 +101,7 @@ def train(
            config options.
    """
    # Enable expandable segments for cuda allocation to improve VRAM usage
-    patch_optimized_env()
+    set_pytorch_cuda_alloc_conf()

    if "use_ray" in kwargs and kwargs["use_ray"]:
        accelerate = False
@@ -179,7 +175,7 @@ def train(

                    do_cli(config=cfg_file, **kwargs)
        except subprocess.CalledProcessError as exc:
-            LOG.error(f"Failed to train/fine-tune config '{cfg_file}': {exc}")
+            logging.error(f"Failed to train/fine-tune config '{cfg_file}': {exc}")
            if not sweep:
                raise exc

@@ -331,21 +327,9 @@ def fetch(directory: str, dest: Optional[str]) -> None:
@add_options_from_dataclass(VllmServeCliArgs)
@filter_none_kwargs
 def vllm_serve(config: str, **cli_args: VllmServeCliArgs):
-    from axolotl.cli.vllm_serve import do_vllm_serve
-
    do_vllm_serve(config, cli_args)


-@cli.command()
-@click.argument("config", type=click.Path(exists=True, path_type=str))
-@add_options_from_dataclass(QuantizeCliArgs)
-@filter_none_kwargs
-def quantize(config: str, **cli_args: QuantizeCliArgs):
-    from axolotl.cli.quantize import do_quantize
-
-    do_quantize(config, cli_args)
-
-
@cli.command()
@click.argument("model", type=click.Path(exists=True, path_type=str))
@click.argument("output", type=click.Path(exists=False, path_type=str))
--- a/src/axolotl/cli/merge_lora.py
+++ b/src/axolotl/cli/merge_lora.py
@@ -1,18 +1,20 @@
 """CLI to merge a trained LoRA into a base model."""

+import logging
 from pathlib import Path
 from typing import Union

 import fire
+import transformers
 from dotenv import load_dotenv

+from axolotl.cli.args import TrainerCliArgs
 from axolotl.cli.art import print_axolotl_text_art
 from axolotl.cli.config import load_cfg
 from axolotl.cli.utils import load_model_and_tokenizer
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.logging import get_logger

-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)


 def do_merge_lora(*, cfg: DictDefault) -> None:
@@ -66,6 +68,12 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs) -> None:
    Raises:
        ValueError: If target directory for LoRA merged model does not exist.
    """
+    # pylint: disable=duplicate-code
+    parser = transformers.HfArgumentParser(TrainerCliArgs)
+    parsed_cli_args, _ = parser.parse_args_into_dataclasses(
+        return_remaining_strings=True
+    )
+    parsed_cli_args.merge_lora = True

    parsed_cfg = load_cfg(
        config,
--- a/src/axolotl/cli/merge_sharded_fsdp_weights.py
+++ b/src/axolotl/cli/merge_sharded_fsdp_weights.py
@@ -1,6 +1,7 @@
 """CLI to merge sharded FSDP model checkpoints into a single combined checkpoint."""

 import json
+import logging
 import os
 import shutil
 from pathlib import Path
@@ -10,6 +11,7 @@ import fire
 import torch
 import torch.distributed.checkpoint as dist_cp
 import torch.distributed.checkpoint.format_utils as dist_cp_format_utils
+import transformers
 from accelerate.utils import (
    SAFE_WEIGHTS_INDEX_NAME,
    SAFE_WEIGHTS_NAME,
@@ -22,11 +24,11 @@ from huggingface_hub import split_torch_state_dict_into_shards
 from safetensors.torch import save_file as safe_save_file
 from torch.distributed.checkpoint.format_utils import _EmptyStateDictLoadPlanner

+from axolotl.cli.args import TrainerCliArgs
 from axolotl.cli.art import print_axolotl_text_art
 from axolotl.cli.config import load_cfg
-from axolotl.utils.logging import get_logger

-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)


 class BFloat16CastPlanner(_EmptyStateDictLoadPlanner):
@@ -195,6 +197,11 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
    """
    # pylint: disable=duplicate-code
    print_axolotl_text_art()
+    parser = transformers.HfArgumentParser(TrainerCliArgs)
+    parsed_cli_args, _ = parser.parse_args_into_dataclasses(
+        return_remaining_strings=True
+    )
+    parsed_cli_args.merge_lora = True
    parsed_cfg = load_cfg(config, **kwargs)

    fsdp_dir = Path(parsed_cfg.output_dir) / "pytorch_model_fsdp_0"
--- a/src/axolotl/cli/preprocess.py
+++ b/src/axolotl/cli/preprocess.py
@@ -1,5 +1,6 @@
 """CLI to run preprocessing of a dataset."""

+import logging
 import warnings
 from pathlib import Path
 from typing import Union
@@ -17,12 +18,10 @@ from axolotl.cli.checks import check_accelerate_default_config, check_user_token
 from axolotl.cli.config import load_cfg
 from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
 from axolotl.common.datasets import load_datasets, load_preference_datasets
-from axolotl.integrations.base import PluginManager
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.logging import get_logger
 from axolotl.utils.trainer import disable_datasets_caching

-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)


 def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
@@ -48,10 +47,7 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
        cfg.dataset_prepared_path = DEFAULT_DATASET_PREPARED_PATH

    with disable_datasets_caching():
-        plugin_manager = PluginManager.get_instance()
-        if plugin_manager.load_datasets(cfg, preprocess=True):
-            pass
-        elif cfg.rl:
+        if cfg.rl:
            load_preference_datasets(cfg=cfg, cli_args=cli_args)
        else:
            load_datasets(cfg=cfg, cli_args=cli_args)
--- a/src/axolotl/cli/quantize.py
+++ b/src/axolotl/cli/quantize.py
@@ -1,90 +0,0 @@
-"""
-CLI to post-training quantize a model using torchao
-"""
-
-from pathlib import Path
-from typing import Union
-
-from transformers import AutoModelForCausalLM
-
-from axolotl.cli.art import print_axolotl_text_art
-from axolotl.cli.config import load_cfg
-from axolotl.loaders import load_tokenizer
-from axolotl.utils.logging import get_logger
-from axolotl.utils.quantization import TorchIntDType, quantize_model_for_ptq
-
-LOG = get_logger(__name__)
-
-
-def do_quantize(
-    config: Union[Path, str],
-    cli_args: dict,
-):
-    """
-    Quantizes a model's model's weights
-
-    Args:
-        config (Union[Path, str]): The path to the config file
-        cli_args (dict): Additional command-line arguments
-    """
-    print_axolotl_text_art()
-
-    cfg = load_cfg(config)
-
-    if cfg.qat and cfg.quantization:
-        raise ValueError(
-            "QAT and quantization cannot be used together. Please specify only one of qat or quantization in your config file."
-        )
-
-    if cfg.qat:
-        quantize_cfg = cfg.qat
-    elif cfg.quantization:
-        quantize_cfg = cfg.quantization
-    else:
-        raise ValueError(
-            "No quantization configuration found. Please specify either qat or quantization in your config file."
-        )
-
-    model_path = cli_args.get("model_path") or cfg.output_dir
-    if weight_dtype := cli_args.get("weight_dtype"):
-        weight_dtype = TorchIntDType[weight_dtype]
-    else:
-        weight_dtype = quantize_cfg.weight_dtype
-    if activation_dtype := cli_args.get("activation_dtype"):
-        activation_dtype = TorchIntDType[activation_dtype]
-    else:
-        activation_dtype = quantize_cfg.activation_dtype
-    group_size = cli_args.get("group_size") or quantize_cfg.group_size
-    quantize_embedding = (
-        cli_args.get("quantize_embedding") or quantize_cfg.quantize_embedding
-    )
-    output_dir = cli_args.get("output_dir") or cfg.output_dir
-
-    LOG.info(f"Loading model from {model_path}...")
-    tokenizer = load_tokenizer(cfg)
-    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
-
-    LOG.info(
-        f"Quantizing model with configuration: \n"
-        f"\tweight_dtype: {weight_dtype}\n"
-        f"\tactivation_dtype: {activation_dtype}\n"
-        f"\tgroup_size: {group_size}\n"
-        f"\tquantize_embedding: {quantize_embedding}"
-    )
-
-    quantize_model_for_ptq(
-        model, weight_dtype, group_size, activation_dtype, quantize_embedding
-    )
-
-    LOG.info(f"Saving quantized model to: {str(Path(output_dir) / 'quantized')}...")
-    model.save_pretrained(
-        str(Path(output_dir) / "quantized"),
-        safe_serialization=False,
-        progressbar=True,
-    )
-    tokenizer.save_pretrained(
-        str(Path(output_dir) / "quantized"),
-        safe_serialization=False,
-        progressbar=True,
-    )
-    LOG.info(f"Quantized model saved to: {str(Path(output_dir) / 'quantized')}...")
--- a/src/axolotl/cli/train.py
+++ b/src/axolotl/cli/train.py
@@ -1,6 +1,6 @@
 """CLI to run training on a model."""

-import gc
+import logging
 import os
 from pathlib import Path
 from typing import Union
@@ -17,10 +17,12 @@ from axolotl.cli.config import load_cfg
 from axolotl.common.datasets import load_datasets, load_preference_datasets
 from axolotl.integrations.base import PluginManager
 from axolotl.train import train
-from axolotl.utils import patch_optimized_env
+from axolotl.utils import set_pytorch_cuda_alloc_conf
 from axolotl.utils.config import normalize_config, resolve_dtype
 from axolotl.utils.dict import DictDefault

+LOG = logging.getLogger(__name__)
+

 def do_train(cfg: DictDefault, cli_args: TrainerCliArgs):
    """
@@ -33,27 +35,21 @@ def do_train(cfg: DictDefault, cli_args: TrainerCliArgs):
        cli_args: Training-specific CLI arguments.
    """
    # Enable expandable segments for cuda allocation to improve VRAM usage
-    patch_optimized_env()
+    set_pytorch_cuda_alloc_conf()

    print_axolotl_text_art()
    check_accelerate_default_config()
    if int(os.getenv("LOCAL_RANK", "0")) == 0:
        check_user_token()

-    plugin_manager = PluginManager.get_instance()
-    dataset_meta = plugin_manager.load_datasets(cfg, preprocess=False)
-    if not dataset_meta:
-        if cfg.rl:
-            dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)
-        else:
-            dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
+    if cfg.rl:
+        dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)
+    else:
+        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

    model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta)
-
    del model, tokenizer, trainer

-    gc.collect()
-
    plugin_manager = PluginManager.get_instance()
    plugin_manager.post_train_unload(cfg)

--- a/src/axolotl/cli/utils.py
+++ b/src/axolotl/cli/utils.py
@@ -4,6 +4,7 @@ import concurrent.futures
 import dataclasses
 import hashlib
 import json
+import logging
 from functools import wraps
 from pathlib import Path
 from types import NoneType
@@ -19,12 +20,12 @@ from transformers import (
    ProcessorMixin,
 )

-from axolotl.loaders import load_processor, load_tokenizer
-from axolotl.loaders.model import ModelLoader
+from axolotl.logging_config import configure_logging
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.logging import get_logger
+from axolotl.utils.models import load_model, load_processor, load_tokenizer

-LOG = get_logger(__name__)
+configure_logging()
+LOG = logging.getLogger(__name__)


 def strip_optional_type(field_type: type | str | None):
@@ -319,8 +320,7 @@ def load_model_and_tokenizer(
    tokenizer = load_tokenizer(cfg)

    LOG.info("loading model...")
-    model_loader = ModelLoader(cfg, tokenizer, inference=inference)
-    model, _ = model_loader.load()
+    model, _ = load_model(cfg, tokenizer, inference=inference)

    processor = None
    if cfg.is_multimodal:
--- a/src/axolotl/cli/vllm_serve.py
+++ b/src/axolotl/cli/vllm_serve.py
@@ -2,27 +2,15 @@
 CLI to start the vllm server for online RL
 """

-import os
-from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Union

-import trl
 from trl.scripts.vllm_serve import ScriptArguments
+from trl.scripts.vllm_serve import main as vllm_serve_main

 from axolotl.cli.config import load_cfg


-@dataclass
-class AxolotlScriptArguments(ScriptArguments):
-    """
-    Additional arguments for the VLLM server
-    """
-
-    reasoning_parser: str = field(default="", kw_only=True)
-    enable_reasoning: bool | None = field(default=None, kw_only=True)
-
-
 def do_vllm_serve(
    config: Union[Path, str],
    cli_args: dict,
@@ -37,13 +25,9 @@ def do_vllm_serve(
    Returns:
        process_id: the process id of the started VLLM server
    """
-    patch_vllm_worker()
    cfg = load_cfg(config)
    model = cfg.base_model

-    serve_module = cli_args.get("serve_module", "trl.scripts.vllm_serve")
-    vllm_serve_main = getattr(__import__(serve_module, fromlist=["main"]), "main")
-
    tensor_parallel_size = (
        cli_args.get("tensor_parallel_size") or cfg.vllm.tensor_parallel_size
    )
@@ -57,16 +41,9 @@ def do_vllm_serve(
    enable_prefix_caching = (
        cli_args.get("enable_prefix_caching") or cfg.vllm.enable_prefix_caching
    )
-    reasoning_parser = (
-        cli_args.get("reasoning_parser") or cfg.vllm.reasoning_parser or ""
-    )
-    enable_reasoning = (
-        cli_args.get("enable_reasoning") or cfg.vllm.enable_reasoning or False
-    )

-    # pylint: disable=unexpected-keyword-arg
-    vllm_script_args = AxolotlScriptArguments(
-        model=model,
+    vllm_script_args = ScriptArguments(
+        model,
        tensor_parallel_size=tensor_parallel_size,
        host=host,
        port=port,
@@ -74,67 +51,5 @@ def do_vllm_serve(
        dtype=dtype,
        max_model_len=max_model_len,
        enable_prefix_caching=enable_prefix_caching,
-        reasoning_parser=reasoning_parser,
-        enable_reasoning=enable_reasoning,
    )
    vllm_serve_main(vllm_script_args)
-
-
-def patch_vllm_worker():
-    from multiprocessing.connection import Connection
-
-    from vllm import LLM
-
-    def llm_worker(
-        script_args: AxolotlScriptArguments,
-        data_parallel_rank: int,
-        master_port: int,
-        connection: Connection,
-    ) -> None:
-        # Set required environment variables for DP to work with vLLM
-        os.environ["VLLM_DP_RANK"] = str(data_parallel_rank)
-        os.environ["VLLM_DP_RANK_LOCAL"] = str(data_parallel_rank)
-        os.environ["VLLM_DP_SIZE"] = str(script_args.data_parallel_size)
-        os.environ["VLLM_DP_MASTER_PORT"] = str(master_port)
-
-        llm = LLM(
-            model=script_args.model,
-            revision=script_args.revision,
-            tensor_parallel_size=script_args.tensor_parallel_size,
-            gpu_memory_utilization=script_args.gpu_memory_utilization,
-            enforce_eager=script_args.enforce_eager,
-            dtype=script_args.dtype,
-            # Automatic Prefix Caching caches the KV cache of existing queries, so that a new query can
-            # directly reuse the KV cache if it shares the same prefix with one of the existing queries.
-            # This is particularly useful here because we generate completions from the same prompts.
-            enable_prefix_caching=script_args.enable_prefix_caching,
-            kv_cache_dtype=script_args.kv_cache_dtype,
-            max_model_len=script_args.max_model_len,
-            worker_extension_cls="trl.scripts.vllm_serve.WeightSyncWorkerExtension",
-            enable_reasoning=script_args.enable_reasoning,
-            reasoning_parser=script_args.reasoning_parser,
-        )
-
-        # Send ready signal to parent process
-        connection.send({"status": "ready"})
-
-        while True:
-            # Wait for commands from the parent process
-            try:
-                command = connection.recv()
-            except KeyboardInterrupt:
-                llm.collective_rpc(method="close_communicator")
-                break
-
-            # Handle commands
-            if command["type"] in ["call", "fire_and_forget"]:
-                method_name = command["method"]
-                args, kwargs = command.get("args", ()), command.get("kwargs", {})
-                method = getattr(llm, method_name)
-                result = method(*args, **kwargs)
-                if command["type"] == "call":
-                    connection.send(result)
-            elif command["type"] == "shutdown":
-                break
-
-    trl.scripts.vllm_serve.llm_worker = llm_worker
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Sunny Liu	0179021780	fix attribute error	2025-04-21 22:29:24 -04:00
Sunny Liu	c4910da015	update more tests + better hqq validation	2025-04-21 22:17:08 -04:00
Sunny Liu	db7e92f6a6	check if self.cfg.quantization exists when directly setting load_in_4bit	2025-04-21 21:42:23 -04:00
Sunny Liu	136b37e4d4	restore support for legacy cfg.load_in_xbit	2025-04-21 21:32:01 -04:00
Sunny Liu	92644513c4	update relora	2025-04-21 21:22:44 -04:00
Sunny Liu	266ef3f479	skip set_quant_config if quantization not given	2025-04-21 17:17:41 -04:00
Sunny Liu	fcef8c95fe	skip set_quant_config if quantization not given	2025-04-21 17:17:20 -04:00
Sunny Liu	136407c556	update multigpu/test_qwen2	2025-04-21 17:04:17 -04:00
Sunny Liu	3251b3235f	update test_mixtral	2025-04-21 17:01:07 -04:00
Sunny Liu	1aa9f7d952	update multigpu/test_eval, multigpu/test_llama	2025-04-21 16:49:08 -04:00
Sunny Liu	a20e753321	update test_falcon_samplepack	2025-04-21 16:29:49 -04:00
Sunny Liu	cb121ab91b	update test_mixtral [skip e2e]	2025-04-21 16:27:26 -04:00
Sunny Liu	b59640a4c7	amend model loading for hqq + fix hqq version	2025-04-21 15:53:43 -04:00
Sunny Liu	f0a189131b	amend model loading for hqq + fix hqq version	2025-04-21 15:53:29 -04:00
Sunny Liu	c8fb5baad6	amend unittests pt2	2025-04-21 13:28:52 -04:00
Sunny Liu	9be971d47c	update test_models.py to conform to new quantization config	2025-04-21 11:34:37 -04:00
Sunny Liu	ffd4ef1ece	nit	2025-04-21 11:28:59 -04:00
Sunny Liu	320aff1867	update config doc	2025-04-21 10:59:04 -04:00
Sunny Liu	ac24eba2ac	include HQQLinear in find target_linear	2025-04-21 10:36:39 -04:00
Sunny Liu	8a5ad8aee3	typo	2025-04-21 10:36:39 -04:00
Sunny Liu	843b50fdaa	rigorous qlora validation	2025-04-21 10:36:39 -04:00
Sunny Liu	098ffcc5a2	removed redundant hqq config validation	2025-04-21 10:36:39 -04:00
Sunny Liu	ba8e29c841	quantization config refactoring - better integration	2025-04-21 10:36:39 -04:00
Sunny Liu	143b2e082c	nit [skip e2e]	2025-04-21 10:36:39 -04:00
Sunny Liu	aba484de97	WIP quant config refactor	2025-04-21 10:36:39 -04:00
Sunny Liu	f6f5f89c6d	fix more typo	2025-04-21 10:36:39 -04:00
Sunny Liu	8926fe9981	lax config requirement - qlora + hqq	2025-04-21 10:36:39 -04:00
Sunny Liu	987c5217a0	fix typos	2025-04-21 10:36:39 -04:00
Sunny Liu	feaef03cb9	didn't realise model_config.quantization_config is just a regular dict	2025-04-21 10:36:39 -04:00
Sunny Liu	ba5d917845	add e2e test for hqq training	2025-04-21 10:36:39 -04:00
Sunny Liu	0e9b060b4d	add doc + requirement for hqq	2025-04-21 10:36:39 -04:00
Sunny Liu	0c40d12a18	more comprehensive hqq config options	2025-04-21 10:36:39 -04:00
Sunny Liu	f55b3c805b	hqq_nbits triggers prepare_model_for_kbit_training	2025-04-21 10:36:39 -04:00
Sunny Liu	a64601f957	fix wrong variable name	2025-04-21 10:36:39 -04:00
Sunny Liu	eb7bc70b99	fix dumb mistake	2025-04-21 10:36:39 -04:00
Sunny Liu	db6c76b147	forgot to return data in check	2025-04-21 10:36:39 -04:00
Sunny Liu	99730ce40a	hqq integration	2025-04-21 10:36:39 -04:00