pre-patch the mlp

use new patch
wip patch
2025-07-13 23:01:49 -04:00 · 2025-07-13 22:40:37 -04:00 · 2025-07-13 22:37:18 -04:00 · 2025-07-13 22:37:18 -04:00 · 2025-07-12 11:41:34 -04:00 · 2025-07-12 11:40:30 -04:00
556 changed files with 41459 additions and 13245 deletions
--- a/.bandit
+++ b/.bandit
@@ -1,3 +1,3 @@
 [bandit]
 exclude = tests
-skips = B101
+skips = B101,B615
--- a/.coveragerc
+++ b/.coveragerc
@@ -0,0 +1,14 @@
+[run]
+source = axolotl
+omit =
+    */tests/*
+    setup.py
+
+[report]
+exclude_lines =
+    pragma: no cover
+    def __repr__
+    raise NotImplementedError
+    if __name__ == .__main__.:
+    pass
+    raise ImportError
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -5,59 +5,76 @@ on:
    branches:
      - "main"
    paths:
-      - 'Dockerfile-base'
+      - 'docker/Dockerfile-base'
+      - 'docker/Dockerfile-uv-base'
      - '.github/workflows/base.yml'
  pull_request:
    paths:
-      - 'Dockerfile-base'
+      - 'docker/Dockerfile-base'
+      - 'docker/Dockerfile-uv-base'
      - '.github/workflows/base.yml'
  workflow_dispatch:

 jobs:
  build-base:
    if: github.repository_owner == 'axolotl-ai-cloud'
+    timeout-minutes: 480
    # this job needs to be run on self-hosted GPU runners...
-    runs-on: axolotl-gpu-runner
+    runs-on: ubuntu-latest-m
    strategy:
      fail-fast: false
      matrix:
        include:
-          - cuda: "124"
-            cuda_version: 12.4.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.4.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-          - cuda: "124"
-            cuda_version: 12.4.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.5.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
          - cuda: "124"
            cuda_version: 12.4.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.6.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base"
          - cuda: "126"
            cuda_version: 12.6.3
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.6.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base"
+          - cuda: "126"
+            cuda_version: 12.6.3
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.7.0
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base"
+          - cuda: "126"
+            cuda_version: 12.6.3
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.7.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base"
+          - cuda: "128"
+            cuda_version: 12.8.1  # CUDA_VERSION build arg; must agree with the cu128 tag / CUDA arg
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.7.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: nightly
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: next
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base-nightly"
+#          # "next" is for release candidates of pytorch
+#          - cuda: "128"
+#            cuda_version: 12.8.1
+#            cudnn_version: ""
+#            python_version: "3.11"
+#            pytorch: next
+#            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+#            dockerfile: "Dockerfile-base-next"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -79,7 +96,60 @@ jobs:
        uses: docker/build-push-action@v4
        with:
          context: .
-          file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || matrix.pytorch == 'next' && './docker/Dockerfile-base-next' || './docker/Dockerfile-base' }}
+          file: ./docker/${{ matrix.dockerfile }}
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+          labels: ${{ steps.metadata.outputs.labels }}
+          build-args: |
+            CUDA_VERSION=${{ matrix.cuda_version }}
+            CUDNN_VERSION=${{ matrix.cudnn_version }}
+            CUDA=${{ matrix.cuda }}
+            PYTHON_VERSION=${{ matrix.python_version }}
+            PYTORCH_VERSION=${{ matrix.pytorch }}
+            TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
+  build-base-uv:
+    if: github.repository_owner == 'axolotl-ai-cloud'
+    timeout-minutes: 480
+    runs-on: ubuntu-latest-m
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - cuda: "126"
+            cuda_version: 12.6.3
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.6.0
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-uv-base"
+          - cuda: "128"
+            cuda_version: 12.8.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.7.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-uv-base"
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Docker metadata
+        id: metadata
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            axolotlai/axolotl-base-uv
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Build
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: ./docker/${{ matrix.dockerfile }}
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -23,7 +23,7 @@ jobs:
        - name: Install dependencies
          run: |
            python3 -m pip install jupyter quartodoc
-            python3 -m pip install -e . --no-deps
+            python3 -m pip install -e .
        - name: Build autodoc
          run: quartodoc build
        - name: Publish to GitHub Pages (and render)
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -9,6 +9,7 @@ on:
       - '.github/workflows/*.yml'
       - "*.[q]md"
       - "examples/**/*.y[a]?ml"
+       - ".pre-commit-config.yaml"
  workflow_dispatch:

 jobs:
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -15,22 +15,26 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.4.1
-            axolotl_extras:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.5.1
-            axolotl_extras: vllm
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.6.0
            axolotl_extras:
-            is_latest: true
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.0
+            axolotl_extras: vllm
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.1
+            axolotl_extras:
+          - cuda: 128
+            cuda_version: 12.8.1
+            python_version: "3.11"
+            pytorch: 2.7.1
+            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
@@ -62,6 +66,7 @@ jobs:
            CUDA=${{ matrix.cuda }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
+            AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}
          file: ./docker/Dockerfile
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
@@ -77,22 +82,27 @@ jobs:
    strategy:
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.4.1
-            axolotl_extras:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.5.1
-            axolotl_extras:
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.6.0
            axolotl_extras:
            is_latest: true
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.0
+            axolotl_extras:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.1
+            axolotl_extras:
+          - cuda: 128
+            cuda_version: 12.8.1
+            python_version: "3.11"
+            pytorch: 2.7.1
+            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
@@ -135,10 +145,10 @@ jobs:
    strategy:
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.4.1
+            pytorch: 2.6.0
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -3,11 +3,13 @@ name: docker-multigpu-tests-biweekly
 on:
  pull_request:
    paths:
-      - 'tests/e2e/multigpu/*.py'
+      - 'tests/e2e/multigpu/**.py'
      - 'requirements.txt'
      - 'setup.py'
      - 'pyproject.toml'
      - '.github/workflows/multi-gpu-e2e.yml'
+      - 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
+      - 'src/axolotl/utils/distributed.py'
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * 1,4'  # Runs at 00:00 UTC every monday & thursday
@@ -24,25 +26,18 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.6.0
-            axolotl_extras: vllm
+            axolotl_extras:
            num_gpus: 2
            nightly_build: "true"
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.4.1
-            axolotl_extras:  # no vllm support for 2.4.1
-            num_gpus: 2
-            nightly_build: "true"
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.5.1
-            axolotl_extras: vllm
+            pytorch: 2.7.1
+            axolotl_extras:
            num_gpus: 2
            nightly_build: "true"
    runs-on: [self-hosted, modal]
@@ -57,7 +52,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==0.71.8 jinja2
+          pip install modal==1.0.2 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
@@ -67,6 +62,7 @@ jobs:
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
+          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
          modal run cicd.multigpu
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -12,16 +12,6 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.4.1
-            axolotl_extras:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.5.1
-            axolotl_extras:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
@@ -73,15 +63,10 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.4.1
+            pytorch: 2.6.0
            axolotl_extras:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.5.1
-            axolotl_extras:
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.6.0
            axolotl_extras:
--- a/.github/workflows/precommit-autoupdate.yml
+++ b/.github/workflows/precommit-autoupdate.yml
@@ -25,7 +25,6 @@ jobs:
          pre-commit autoupdate
          if [[ -n $(git status --porcelain) ]]; then
            echo "changes=true" >> $GITHUB_OUTPUT
-            git diff .pre-commit-config.yaml > pre-commit-update.diff
          fi

      - name: Create Pull Request
@@ -39,11 +38,3 @@ jobs:
          commit-message: "chore: update pre-commit hooks"
          body: |
            Automated PR to update pre-commit hooks to their latest versions.
-
-            <details>
-            <summary>Changes:</summary>
-
-            ```diff
-            ${{ steps.update.outputs.diff }}
-            ```
-            </details>
--- a/.github/workflows/preview-docs.yml
+++ b/.github/workflows/preview-docs.yml
@@ -0,0 +1,76 @@
+name: Preview
+on:
+  workflow_dispatch:
+  pull_request:
+    types: [opened, synchronize, reopened]
+
+    # Run the workflow only when one of these files changes
+    paths:
+      - '**/*.md'      # any Markdown file
+      - '**/*.qmd'     # any Quarto file
+      - '_quarto.yml'
+      - docs/scripts/generate_config_docs.py
+      - src/axolotl/utils/schemas/**.py
+
+permissions:
+  checks: write
+  contents: write
+  deployments: write
+  issues: write
+  discussions: write
+  pages: write
+  pull-requests: write
+  statuses: write
+
+jobs:
+  preview:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}
+
+      - name: Set up Quarto
+        uses: quarto-dev/quarto-actions/setup@v2
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: |
+          python3 -m pip install jupyter quartodoc
+          python3 -m pip install -e .
+
+      - name: Build autodoc
+        run: quartodoc build
+
+      - name: Quarto render
+        run: quarto render
+
+      - name: Netlify Publish
+        uses: nwtgck/actions-netlify@v3.0
+        id: netlify
+        with:
+          publish-dir: './_site'
+          enable-pull-request-comment: false
+          enable-github-deployment: false
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          deploy-message: "Deployed On Netlify"
+          github-deployment-environment: 'preview'
+          github-deployment-description: 'Preview Deployment'
+        env:
+          NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
+          NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}
+
+      - name: Update PR with preview link
+        if: ${{ steps.netlify.outcome == 'success' }}
+        uses: marocchino/sticky-pull-request-comment@v2
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          message: |
+            📖 **Documentation Preview**: ${{ steps.netlify.outputs.deploy-url }}
+
+            Deployed on Netlify from commit ${{ github.event.pull_request.head.sha }}
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -26,21 +26,18 @@ jobs:
      max-parallel: 2
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
+        pytorch_version: ["2.6.0", "2.7.0"]
    timeout-minutes: 20

    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
+      - name: Restore Cache from S3
+        id: hf-cache-restore-s3
+        run: |
+          mkdir -p /home/runner/.cache/huggingface/hub
+          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/  --use-compress-program unzstd

      - name: Setup Python
        uses: actions/setup-python@v5
@@ -81,15 +78,11 @@ jobs:
        run: |
          axolotl --help

-      - name: Pre-Download dataset fixture
-        run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-
      - name: Run tests
        run: |
-          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
-          pytest -v tests/patched/
-          pytest -v tests/cli/
+          pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
+          pytest -v --durations=10 tests/patched/
+          pytest -v --durations=10 tests/cli/

      - name: cleanup pip cache
        run: |
@@ -106,22 +99,8 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.4.1
-            num_gpus: 1
-            axolotl_extras:
-            nightly_build: "true"
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.5.1
-            num_gpus: 1
-            axolotl_extras:
-            nightly_build: "true"
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.6.0
            num_gpus: 1
@@ -147,6 +126,7 @@ jobs:
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
+          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
          modal run cicd.e2e_tests
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -27,6 +27,9 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

+env:
+  TRANSFORMERS_IS_CI: "yes"
+
 jobs:
  pre-commit:
    name: pre-commit
@@ -44,26 +47,23 @@ jobs:
  pytest:
    name: PyTest
    runs-on: ubuntu-latest
+#    needs: [preload-cache]
    strategy:
      fail-fast: false
-      max-parallel: 2
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
+        pytorch_version: ["2.6.0", "2.7.0", "2.7.1"]
    timeout-minutes: 20

    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
+      - name: Restore Cache from S3
+        id: hf-cache-restore-s3
+        run: |
+          mkdir -p /home/runner/.cache/huggingface/hub
+          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/  --use-compress-program unzstd

      - name: Setup Python
        uses: actions/setup-python@v5
@@ -102,46 +102,41 @@ jobs:

      - name: Run tests
        run: |
-          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
-          pytest -v tests/patched/
-          pytest -v tests/cli/
+          pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ --cov=axolotl --cov-report=xml
+          pytest -v --durations=10 tests/patched/ --cov=axolotl --cov-append --cov-report=xml
+          pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          files: ./coverage.xml
+          flags: unittests,pytorch-${{ matrix.pytorch_version }}
+          fail_ci_if_error: false

      - name: cleanup pip cache
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

-      - name: Save HF cache
-        id: hf-cache
-        uses: actions/cache/save@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
-
  pytest-sdist:
    name: PyTest from Source Dist
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
-      max-parallel: 1
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
+        pytorch_version: ["2.6.0", "2.7.0", "2.7.1"]
    timeout-minutes: 20

    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
+      - name: Restore Cache from S3
+        id: hf-cache-restore-s3
+        run: |
+          mkdir -p /home/runner/.cache/huggingface/hub
+          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/  --use-compress-program unzstd

      - name: Setup Python
        uses: actions/setup-python@v5
@@ -180,30 +175,121 @@ jobs:

      - name: Run tests
        run: |
-          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
-          pytest -v tests/patched/
-          pytest -v tests/cli/
+          pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
+          pytest -v --durations=10 tests/patched/
+          pytest -v --durations=10 tests/cli/

      - name: cleanup pip cache
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

-      - name: Save HF cache
-        id: hf-cache
-        uses: actions/cache/save@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
-
  docker-e2e-tests-1st:
+    # Run this job first as a gate for running the remainder of the test matrix
    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
-    timeout-minutes: 90
+    timeout-minutes: 120
    needs: [pre-commit, pytest, pytest-sdist]

+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.1
+            num_gpus: 1
+            axolotl_extras:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.6.0
+            num_gpus: 1
+            axolotl_extras:
+            dockerfile: "Dockerfile-uv.jinja"
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install Modal
+        run: |
+          python -m pip install --upgrade pip
+          pip install modal==1.0.2 jinja2
+      - name: Update env vars
+        run: |
+          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
+          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
+          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
+          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
+          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
+          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
+          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
+      - name: Run tests job on Modal
+        run: |
+          modal run cicd.e2e_tests
+
+  docker-e2e-tests:
+    if: github.repository_owner == 'axolotl-ai-cloud'
+    # this job needs to be run on self-hosted GPU runners...
+    runs-on: [self-hosted, modal]
+    timeout-minutes: 120
+    # Only run the remainder of the matrix if the first e2e check passed;
+    # this is to save on wasted compute costs for known failures that get caught in the first run
+    needs: [pre-commit, pytest, docker-e2e-tests-1st]
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.6.0
+            num_gpus: 1
+            axolotl_extras:
+          - cuda: 128
+            cuda_version: 12.8.1
+            python_version: "3.11"
+            pytorch: 2.7.1
+            num_gpus: 1
+            axolotl_extras:
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install Modal
+        run: |
+          python -m pip install --upgrade pip
+          pip install modal==1.0.2 jinja2
+      - name: Update env vars
+        run: |
+          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
+          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
+          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
+          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
+          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
+          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
+          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
+      - name: Run tests job on Modal
+        run: |
+          modal run cicd.e2e_tests
+
+  docker-e2e-cleanup:
+    runs-on: [self-hosted, modal]
+    timeout-minutes: 90
+    needs: [docker-e2e-tests]
+
    strategy:
      fail-fast: false
      matrix:
@@ -213,54 +299,7 @@ jobs:
            python_version: "3.11"
            pytorch: 2.6.0
            num_gpus: 1
-            axolotl_extras: vllm
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: Install Modal
-        run: |
-          python -m pip install --upgrade pip
-          pip install modal==0.71.8 jinja2
-      - name: Update env vars
-        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
-          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
-          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
-          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
-          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
-          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
-          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-      - name: Run tests job on Modal
-        run: |
-          modal run cicd.e2e_tests
-
-  docker-e2e-tests:
-    if: github.repository_owner == 'axolotl-ai-cloud'
-    # this job needs to be run on self-hosted GPU runners...
-    runs-on: [self-hosted, modal]
-    timeout-minutes: 90
-    needs: [pre-commit, pytest, docker-e2e-tests-1st]
-
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.4.1
-            num_gpus: 1
            axolotl_extras:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.5.1
-            num_gpus: 1
-            axolotl_extras: vllm
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -271,7 +310,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==0.71.8 jinja2
+          pip install modal==1.0.2 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
@@ -281,6 +320,7 @@ jobs:
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
+          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
-          modal run cicd.e2e_tests
+          modal run cicd.cleanup
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,15 +19,15 @@ repos:
    hooks:
      - id: isort
 -   repo: https://github.com/PyCQA/flake8
-    rev: 7.1.2
+    rev: 7.3.0
    hooks:
    - id: flake8
 -   repo: https://github.com/pylint-dev/pylint
-    rev: v3.3.6
+    rev: v3.3.7
    hooks:
    - id: pylint
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.15.0
+    rev: v1.16.1
    hooks:
    - id: mypy
      additional_dependencies:
@@ -36,7 +36,7 @@ repos:
            'pydantic>=2.5.3',
        ]
 -   repo: https://github.com/PyCQA/bandit
-    rev: 1.8.3
+    rev: 1.8.6
    hooks:
    -   id: bandit
        args: [
--- a/.runpod/.gitignore
+++ b/.runpod/.gitignore
@@ -0,0 +1,161 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+pod/scripts/config.yaml
--- a/.runpod/Dockerfile
+++ b/.runpod/Dockerfile
@@ -0,0 +1,18 @@
+FROM axolotlai/axolotl-cloud:main-py3.11-cu124-2.6.0
+
+COPY .runpod/requirements.txt /requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install --upgrade pip && \
+    python3 -m pip install --upgrade -r /requirements.txt
+
+# Environment settings
+ARG BASE_VOLUME="/runpod-volume"
+ENV BASE_VOLUME=$BASE_VOLUME
+ENV HF_DATASETS_CACHE="${BASE_VOLUME}/huggingface-cache/datasets"
+ENV HUGGINGFACE_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
+ENV TRANSFORMERS_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
+
+COPY .runpod/src /src
+
+WORKDIR /src
+CMD ["python3", "/src/handler.py"]
--- a/.runpod/README.md
+++ b/.runpod/README.md
@@ -0,0 +1,335 @@
+<h1>LLM Post-Training: Full fine-tune, LoRA, QLoRA, etc. Llama/Mistral/Gemma and more</h1>
+
+# Configuration Options
+
+This document outlines all available configuration options for training models. The configuration can be provided as a JSON request.
+
+## Usage
+
+You can provide these configuration options as follows:
+
+1. As a JSON request body:
+
+```json
+{
+  "input": {
+    "user_id": "user",
+    "model_id": "model-name",
+    "run_id": "run-id",
+    "credentials": {
+      "wandb_api_key": "", # add your Weights & Biases key. TODO:  you will be able to set this in environment variables.
+      "hf_token": "", # add your HF_token. TODO:  you will be able to set this in environment variables.
+    },
+    "args": {
+      "base_model": "NousResearch/Llama-3.2-1B",
+      // ... other options
+    }
+  }
+}
+```
+
+## Configuration Options
+
+### Model Configuration
+
+| Option              | Description                                                                                   | Default              |
+| ------------------- | --------------------------------------------------------------------------------------------- | -------------------- |
+| `base_model`        | Path to the base model (local or HuggingFace)                                                 | Required             |
+| `base_model_config` | Configuration path for the base model                                                         | Same as base_model   |
+| `revision_of_model` | Specific model revision from HuggingFace hub                                                  | Latest               |
+| `tokenizer_config`  | Custom tokenizer configuration path                                                           | Optional             |
+| `model_type`        | Type of model to load                                                                         | AutoModelForCausalLM |
+| `tokenizer_type`    | Type of tokenizer to use                                                                      | AutoTokenizer        |
+| `hub_model_id`      | Repository ID where the model will be pushed on Hugging Face Hub (format: username/repo-name) | Optional             |
+
+## Model Family Identification
+
+| Option                     | Default | Description                    |
+| -------------------------- | ------- | ------------------------------ |
+| `is_falcon_derived_model`  | `false` | Whether model is Falcon-based  |
+| `is_llama_derived_model`   | `false` | Whether model is LLaMA-based   |
+| `is_qwen_derived_model`    | `false` | Whether model is Qwen-based    |
+| `is_mistral_derived_model` | `false` | Whether model is Mistral-based |
+
+## Model Configuration Overrides
+
+| Option                                          | Default    | Description                        |
+| ----------------------------------------------- | ---------- | ---------------------------------- |
+| `overrides_of_model_config.rope_scaling.type`   | `"linear"` | RoPE scaling type (linear/dynamic) |
+| `overrides_of_model_config.rope_scaling.factor` | `1.0`      | RoPE scaling factor                |
+
+### Model Loading Options
+
+| Option         | Description                   | Default |
+| -------------- | ----------------------------- | ------- |
+| `load_in_8bit` | Load model in 8-bit precision | false   |
+| `load_in_4bit` | Load model in 4-bit precision | false   |
+| `bf16`         | Use bfloat16 precision        | false   |
+| `fp16`         | Use float16 precision         | false   |
+| `tf32`         | Use tensor float 32 precision | false   |
+
+## Memory and Device Settings
+
+| Option             | Default   | Description             |
+| ------------------ | --------- | ----------------------- |
+| `gpu_memory_limit` | `"20GiB"` | GPU memory limit        |
+| `lora_on_cpu`      | `false`   | Load LoRA on CPU        |
+| `device_map`       | `"auto"`  | Device mapping strategy |
+| `max_memory`       | `null`    | Max memory per device   |
+
+## Training Hyperparameters
+
+| Option                        | Default   | Description                 |
+| ----------------------------- | --------- | --------------------------- |
+| `gradient_accumulation_steps` | `1`       | Gradient accumulation steps |
+| `micro_batch_size`            | `2`       | Batch size per GPU          |
+| `eval_batch_size`             | `null`    | Evaluation batch size       |
+| `num_epochs`                  | `4`       | Number of training epochs   |
+| `warmup_steps`                | `100`     | Warmup steps                |
+| `warmup_ratio`                | `0.05`    | Warmup ratio                |
+| `learning_rate`               | `0.00003` | Learning rate               |
+| `lr_quadratic_warmup`         | `false`   | Quadratic warmup            |
+| `logging_steps`               | `null`    | Logging frequency           |
+| `eval_steps`                  | `null`    | Evaluation frequency        |
+| `evals_per_epoch`             | `null`    | Evaluations per epoch       |
+| `save_strategy`               | `"epoch"` | Checkpoint saving strategy  |
+| `save_steps`                  | `null`    | Saving frequency            |
+| `saves_per_epoch`             | `null`    | Saves per epoch             |
+| `save_total_limit`            | `null`    | Maximum checkpoints to keep |
+| `max_steps`                   | `null`    | Maximum training steps      |
+
+### Dataset Configuration
+
+```yaml
+datasets:
+  - path: vicgalle/alpaca-gpt4 # HuggingFace dataset or TODO: You will be able to add the local path.
+    type: alpaca # Format type (alpaca, gpteacher, oasst, etc.)
+    ds_type: json # Dataset type
+    data_files: path/to/data # Source data files
+    train_on_split: train # Dataset split to use
+```
+
+## Chat Template Settings
+
+| Option                   | Default                          | Description            |
+| ------------------------ | -------------------------------- | ---------------------- |
+| `chat_template`          | `"tokenizer_default"`            | Chat template type     |
+| `chat_template_jinja`    | `null`                           | Custom Jinja template  |
+| `default_system_message` | `"You are a helpful assistant."` | Default system message |
+
+## Dataset Processing
+
+| Option                        | Default                    | Description                       |
+| ----------------------------- | -------------------------- | --------------------------------- |
+| `dataset_prepared_path`       | `"data/last_run_prepared"` | Path for prepared dataset         |
+| `push_dataset_to_hub`         | `""`                       | Push dataset to HF hub            |
+| `dataset_processes`           | `4`                        | Number of preprocessing processes |
+| `dataset_keep_in_memory`      | `false`                    | Keep dataset in memory            |
+| `shuffle_merged_datasets`     | `true`                     | Shuffle merged datasets           |
+| `dataset_exact_deduplication` | `true`                     | Deduplicate datasets              |
+
+## LoRA Configuration
+
+| Option                     | Default                | Description                    |
+| -------------------------- | ---------------------- | ------------------------------ |
+| `adapter`                  | `"lora"`               | Adapter type (lora/qlora)      |
+| `lora_model_dir`           | `""`                   | Directory with pretrained LoRA |
+| `lora_r`                   | `8`                    | LoRA attention dimension       |
+| `lora_alpha`               | `16`                   | LoRA alpha parameter           |
+| `lora_dropout`             | `0.05`                 | LoRA dropout                   |
+| `lora_target_modules`      | `["q_proj", "v_proj"]` | Modules to apply LoRA          |
+| `lora_target_linear`       | `false`                | Target all linear modules      |
+| `peft_layers_to_transform` | `[]`                   | Layers to transform            |
+| `lora_modules_to_save`     | `[]`                   | Modules to save                |
+| `lora_fan_in_fan_out`      | `false`                | Fan in/out structure           |
+
+## Optimization Settings
+
+| Option                    | Default | Description                |
+| ------------------------- | ------- | -------------------------- |
+| `train_on_inputs`         | `false` | Train on input prompts     |
+| `group_by_length`         | `false` | Group by sequence length   |
+| `gradient_checkpointing`  | `false` | Use gradient checkpointing |
+| `early_stopping_patience` | `3`     | Early stopping patience    |
+
+## Learning Rate Scheduling
+
+| Option                     | Default    | Description          |
+| -------------------------- | ---------- | -------------------- |
+| `lr_scheduler`             | `"cosine"` | Scheduler type       |
+| `lr_scheduler_kwargs`      | `{}`       | Scheduler parameters |
+| `cosine_min_lr_ratio`      | `null`     | Minimum LR ratio     |
+| `cosine_constant_lr_ratio` | `null`     | Constant LR ratio    |
+| `lr_div_factor`            | `null`     | LR division factor   |
+
+## Optimizer Settings
+
+| Option                 | Default      | Description         |
+| ---------------------- | ------------ | ------------------- |
+| `optimizer`            | `"adamw_hf"` | Optimizer choice    |
+| `optim_args`           | `{}`         | Optimizer arguments |
+| `optim_target_modules` | `[]`         | Target modules      |
+| `weight_decay`         | `null`       | Weight decay        |
+| `adam_beta1`           | `null`       | Adam beta1          |
+| `adam_beta2`           | `null`       | Adam beta2          |
+| `adam_epsilon`         | `null`       | Adam epsilon        |
+| `max_grad_norm`        | `null`       | Gradient clipping   |
+
+## Attention Implementations
+
+| Option                     | Default | Description                   |
+| -------------------------- | ------- | ----------------------------- |
+| `flash_optimum`            | `false` | Use better transformers       |
+| `xformers_attention`       | `false` | Use xformers                  |
+| `flash_attention`          | `false` | Use flash attention           |
+| `flash_attn_cross_entropy` | `false` | Flash attention cross entropy |
+| `flash_attn_rms_norm`      | `false` | Flash attention RMS norm      |
+| `flash_attn_fuse_qkv`      | `false` | Fuse QKV operations           |
+| `flash_attn_fuse_mlp`      | `false` | Fuse MLP operations           |
+| `sdp_attention`            | `false` | Use scaled dot product        |
+| `s2_attention`             | `false` | Use shifted sparse attention  |
+
+## Tokenizer Modifications
+
+| Option           | Default | Description                  |
+| ---------------- | ------- | ---------------------------- |
+| `special_tokens` | -       | Special tokens to add/modify |
+| `tokens`         | `[]`    | Additional tokens            |
+
+## Distributed Training
+
+| Option                  | Default | Description           |
+| ----------------------- | ------- | --------------------- |
+| `fsdp`                  | `null`  | FSDP configuration    |
+| `fsdp_config`           | `null`  | FSDP config options   |
+| `deepspeed`             | `null`  | Deepspeed config path |
+| `ddp_timeout`           | `null`  | DDP timeout           |
+| `ddp_bucket_cap_mb`     | `null`  | DDP bucket capacity   |
+| `ddp_broadcast_buffers` | `null`  | DDP broadcast buffers |
+
+<details>
+<summary><h3>Example Configuration Request:</h3></summary>
+
+Here's a complete example for fine-tuning a LLaMA model using LoRA:
+
+```json
+{
+  "input": {
+    "user_id": "user",
+    "model_id": "llama-test",
+    "run_id": "test-run",
+    "credentials": {
+      "wandb_api_key": "",
+      "hf_token": ""
+    },
+    "args": {
+      "base_model": "NousResearch/Llama-3.2-1B",
+      "load_in_8bit": false,
+      "load_in_4bit": false,
+      "strict": false,
+      "datasets": [
+        {
+          "path": "teknium/GPT4-LLM-Cleaned",
+          "type": "alpaca"
+        }
+      ],
+      "dataset_prepared_path": "last_run_prepared",
+      "val_set_size": 0.1,
+      "output_dir": "./outputs/lora-out",
+      "adapter": "lora",
+      "sequence_len": 2048,
+      "sample_packing": true,
+      "eval_sample_packing": true,
+      "pad_to_sequence_len": true,
+      "lora_r": 16,
+      "lora_alpha": 32,
+      "lora_dropout": 0.05,
+      "lora_target_modules": [
+        "gate_proj",
+        "down_proj",
+        "up_proj",
+        "q_proj",
+        "v_proj",
+        "k_proj",
+        "o_proj"
+      ],
+      "gradient_accumulation_steps": 2,
+      "micro_batch_size": 2,
+      "num_epochs": 1,
+      "optimizer": "adamw_8bit",
+      "lr_scheduler": "cosine",
+      "learning_rate": 0.0002,
+      "train_on_inputs": false,
+      "group_by_length": false,
+      "bf16": "auto",
+      "tf32": false,
+      "gradient_checkpointing": true,
+      "logging_steps": 1,
+      "flash_attention": true,
+      "loss_watchdog_threshold": 5,
+      "loss_watchdog_patience": 3,
+      "warmup_steps": 10,
+      "evals_per_epoch": 4,
+      "saves_per_epoch": 1,
+      "weight_decay": 0,
+      "hub_model_id": "runpod/llama-fr-lora",
+      "wandb_name": "test-run-1",
+      "wandb_project": "test-run-1",
+      "wandb_entity": "axo-test",
+      "special_tokens": {
+        "pad_token": "<|end_of_text|>"
+      }
+    }
+  }
+}
+```
+
+</details>
+
+### Advanced Features
+
+#### Wandb Integration
+
+- `wandb_project`: Project name for Weights & Biases
+- `wandb_entity`: Team name in W&B
+- `wandb_watch`: Monitor model with W&B
+- `wandb_name`: Name of the W&B run
+- `wandb_run_id`: ID for the W&B run
+
+#### Performance Optimization
+
+- `sample_packing`: Enable efficient sequence packing
+- `eval_sample_packing`: Use sequence packing during evaluation
+- `torch_compile`: Enable PyTorch 2.0 compilation
+- `flash_attention`: Use Flash Attention implementation
+- `xformers_attention`: Use xFormers attention implementation
+
+### Available Optimizers
+
+The following optimizers are supported:
+
+- `adamw_hf`: HuggingFace's AdamW implementation
+- `adamw_torch`: PyTorch's AdamW
+- `adamw_torch_fused`: Fused AdamW implementation
+- `adamw_torch_xla`: XLA-optimized AdamW
+- `adamw_apex_fused`: NVIDIA Apex fused AdamW
+- `adafactor`: Adafactor optimizer
+- `adamw_anyprecision`: Anyprecision AdamW
+- `adamw_bnb_8bit`: 8-bit AdamW from bitsandbytes
+- `lion_8bit`: 8-bit Lion optimizer
+- `lion_32bit`: 32-bit Lion optimizer
+- `sgd`: Stochastic Gradient Descent
+- `adagrad`: Adagrad optimizer
+
+## Notes
+
+- Set `load_in_8bit: true` or `load_in_4bit: true` for memory-efficient training
+- Enable `flash_attention: true` for faster training on modern GPUs
+- Use `gradient_checkpointing: true` to reduce memory usage
+- Adjust `micro_batch_size` and `gradient_accumulation_steps` based on your GPU memory
+
+For more detailed information, please refer to the [documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/config-reference.html).
+
+### Errors:
+
+- If you face any issues with Flash Attention 2, delete your worker and restart.
--- a/.runpod/hub.json
+++ b/.runpod/hub.json
@@ -0,0 +1,93 @@
+{
+  "title": "Axolotl Fine-Tuning",
+  "description": "Serverless fine-tuning of open-source LLMs with Axolotl. Supports LoRA, QLoRA, DPO, and more using Hugging Face models and datasets.",
+  "type": "serverless",
+  "category": "language",
+  "iconUrl": "https://avatars.githubusercontent.com/u/167502477",
+  "config": {
+    "runsOn": "GPU",
+    "containerDiskInGb": 200,
+    "gpuCount": 1,
+    "allowedCudaVersions": [
+      "12.8",
+      "12.7",
+      "12.6",
+      "12.5",
+      "12.4"
+    ],
+    "presets": [],
+    "env": [
+      {
+        "key": "TOKENIZER",
+        "input": {
+          "name": "Tokenizer",
+          "type": "string",
+          "description": "Name or path of the Hugging Face tokenizer to use.",
+          "default": "",
+          "advanced": true
+        }
+      },
+      {
+        "key": "MAX_NUM_SEQS",
+        "input": {
+          "name": "Max Num Seqs",
+          "type": "number",
+          "description": "Maximum number of sequences per iteration.",
+          "default": 256,
+          "advanced": true
+        }
+      },
+      {
+        "key": "DISABLE_LOG_STATS",
+        "input": {
+          "name": "Disable Log Stats",
+          "type": "boolean",
+          "description": "Disable logging statistics.",
+          "default": false,
+          "trueValue": "true",
+          "falseValue": "false"
+        }
+      },
+      {
+        "key": "LOAD_FORMAT",
+        "input": {
+          "name": "Load Format",
+          "type": "string",
+          "description": "The format of the model weights to load.",
+          "default": "auto",
+          "options": [
+            {
+              "label": "auto",
+              "value": "auto"
+            },
+            {
+              "label": "pt",
+              "value": "pt"
+            },
+            {
+              "label": "safetensors",
+              "value": "safetensors"
+            },
+            {
+              "label": "npcache",
+              "value": "npcache"
+            },
+            {
+              "label": "dummy",
+              "value": "dummy"
+            },
+            {
+              "label": "tensorizer",
+              "value": "tensorizer"
+            },
+            {
+              "label": "bitsandbytes",
+              "value": "bitsandbytes"
+            }
+          ],
+          "advanced": true
+        }
+      }
+    ]
+  }
+}
--- a/.runpod/requirements.txt
+++ b/.runpod/requirements.txt
@@ -0,0 +1,7 @@
+# Required Python packages get listed here, one per line.
+# Recommended to lock the version number to avoid unexpected changes.
+
+# You can also install packages from a git repository, e.g.:
+# git+https://github.com/runpod/runpod-python.git
+# To learn more, see https://pip.pypa.io/en/stable/reference/requirements-file-format/
+runpod~=1.7.0
--- a/.runpod/src/config/config.yaml
+++ b/.runpod/src/config/config.yaml
@@ -0,0 +1,573 @@
+# # This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
+# # This can also be a relative path to a model on disk
+# base_model: ./llama-7b-hf
+# # You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
+# base_model_ignore_patterns:
+# # If the base_model repo on hf hub doesn't include configuration .json files,
+# # You can set that here, or leave this empty to default to base_model
+# base_model_config: ./llama-7b-hf
+# # You can specify to choose a specific model revision from huggingface hub
+# model_revision:
+# # Optional tokenizer configuration override in case you want to use a different tokenizer
+# # than the one defined in the base model
+# tokenizer_config:
+# # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
+# model_type: AutoModelForCausalLM
+# # Corresponding tokenizer for the model AutoTokenizer is a good choice
+# tokenizer_type: AutoTokenizer
+# # Trust remote code for untrusted source
+# trust_remote_code:
+# # use_fast option for tokenizer loading from_pretrained, default to True
+# tokenizer_use_fast:
+# # Whether to use the legacy tokenizer setting, defaults to True
+# tokenizer_legacy:
+# # Resize the model embeddings when new tokens are added to multiples of 32
+# # This is reported to improve training speed on some models
+# resize_token_embeddings_to_32x:
+
+# # Used to identify which model family the model is derived from
+# is_falcon_derived_model:
+# is_llama_derived_model:
+# # Please note that if you set this to true, `padding_side` will be set to "left" by default
+# is_mistral_derived_model:
+# is_qwen_derived_model:
+
+# # optional overrides to the base model configuration
+# model_config:
+#   # RoPE Scaling https://github.com/huggingface/transformers/pull/24653
+#   rope_scaling:
+#     type: # linear | dynamic
+#     factor: # float
+
+
+# # Whether you are training a 4-bit GPTQ quantized model
+# gptq: true
+# gptq_groupsize: 128 # group size
+# gptq_model_v1: false # v1 or v2
+
+# # This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
+# load_in_8bit: true
+# # Use bitsandbytes 4 bit
+# load_in_4bit:
+
+# # Use CUDA bf16
+# bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
+# # Use CUDA fp16
+# fp16: true
+# # Use CUDA tf32
+# tf32: true # require >=ampere
+
+# # No AMP (automatic mixed precision)
+# bfloat16: true # require >=ampere
+# float16: true
+
+# # A list of one or more datasets to finetune the model with
+# datasets:
+#   # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
+#   - path: vicgalle/alpaca-gpt4
+#   # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
+#     type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
+#     ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
+#     data_files: # Optional[str] path to source data files
+#     shards: # Optional[int] number of shards to split data into
+#     name: # Optional[str] name of dataset configuration to load
+#     train_on_split: train # Optional[str] name of dataset split to load from
+
+#     # Optional[str] fastchat conversation type, only used with type: sharegpt
+#     conversation:  # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+#     field_human: # Optional[str]. Human key to use for conversation.
+#     field_model: # Optional[str]. Assistant key to use for conversation.
+
+#   # Custom user prompt
+#   - path: repo
+#     type:
+#       # The below are defaults. only set what's needed.
+#       system_prompt: ""
+#       system_format: "{system}"
+#       field_system: system
+#       field_instruction: instruction
+#       field_input: input
+#       field_output: output
+
+#       # Customizable to be single line or multi-line
+#       # 'format' can include {input}
+#       format: |-
+#         User: {instruction} {input}
+#         Assistant:
+#       # 'no_input_format' cannot include {input}
+#       no_input_format: "{instruction} "
+
+#       # For `completion` datasets only, uses the provided field instead of `text` column
+#       field:
+
+# # Axolotl attempts to save the dataset as an arrow after packing the data together so
+# # subsequent training attempts load faster, relative path
+# dataset_prepared_path: data/last_run_prepared
+# # Push prepared dataset to hub
+# push_dataset_to_hub: # repo path
+# # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
+# # if not set.
+# dataset_processes: # defaults to os.cpu_count() if not set
+# # push checkpoints to hub
+# hub_model_id: # repo path to push finetuned model
+# # how to push checkpoints to hub
+# # https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
+# hub_strategy:
+# # Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
+# # Required to be true when used in combination with `push_dataset_to_hub`
+# hf_use_auth_token: # boolean
+# # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
+# val_set_size: 0.04
+# # Num shards for whole dataset
+# dataset_shard_num:
+# # Index of shard to use for whole dataset
+# dataset_shard_idx:
+
+# # The maximum length of an input to train with, this should typically be less than 2048
+# # as most models have a token/context limit of 2048
+# sequence_len: 2048
+# # Pad inputs so each step uses constant sized buffers
+# # This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
+# pad_to_sequence_len:
+# # Max sequence length to concatenate training samples together up to
+# # Inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
+# # FutureWarning: This will soon be DEPRECATED
+# max_packed_sequence_len: 1024
+# # Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
+# sample_packing:
+# # Set to 'false' if getting errors during eval with sample_packing on.
+# eval_sample_packing:
+# # You can set these packing optimizations AFTER starting a training at least once.
+# # The trainer will provide recommended values for these values.
+# sample_packing_eff_est:
+# total_num_tokens:
+
+# # If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
+# adapter: lora
+# # If you already have a lora model trained that you want to load, put that here.
+# # This means after training, if you want to test the model, you should set this to the value of `lora_out_dir`.
+# lora_model_dir:
+
+# # LoRA hyperparameters
+# # For more details about the following options, see:
+# # https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2
+# lora_r: 8
+# lora_alpha: 16
+# lora_dropout: 0.05
+# lora_target_modules:
+#   - q_proj
+#   - v_proj
+# #  - k_proj
+# #  - o_proj
+# #  - gate_proj
+# #  - down_proj
+# #  - up_proj
+# lora_target_linear: # If true, will target all linear layers
+
+# # If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
+# # For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
+# # `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
+# # https://github.com/huggingface/peft/issues/334#issuecomment-1561727994
+# lora_modules_to_save:
+# #  - embed_tokens
+# #  - lm_head
+
+# # Once you complete training, the model will be saved to the following directory.
+# # If you merge the adapter to the base model, a subdirectory `merged` will be created under this directory.
+# # Make sure `lora_model_dir` points to this directory if you want to use the trained model.
+# lora_out_dir:
+# lora_fan_in_fan_out: false
+
+# # ReLoRA configuration
+# # Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
+# relora_steps: # Number of steps per ReLoRA restart
+# relora_warmup_steps: # Number of per-restart warmup steps
+# relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings
+
+# # wandb configuration if you're using it
+# wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
+# wandb_project: # Your wandb project name
+# wandb_entity: # A wandb Team name if using a Team
+# wandb_watch:
+# wandb_run_id: # Set the ID of your wandb run
+# wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training
+
+# # Where to save the full-finetuned model to
+# output_dir: ./completed-model
+
+# # Whether to use torch.compile and which backend to use
+# torch_compile:  # bool
+# torch_compile_backend:  # Optional[str]
+
+# # Training hyperparameters
+
+# # If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.
+# gradient_accumulation_steps: 1
+# # The number of samples to include in each batch. This is the number of samples sent to each GPU.
+# micro_batch_size: 2
+# eval_batch_size:
+# num_epochs: 4
+# warmup_steps: 100  # cannot use with warmup_ratio
+# warmup_ratio: 0.05  # cannot use with warmup_steps
+# learning_rate: 0.00003
+# lr_quadratic_warmup:
+# logging_steps:
+# save_strategy: # Set to `no` to skip checkpoint saves
+# save_steps: # Leave empty to save at each epoch
+# eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
+# save_total_limit: # Maximum number of checkpoints kept at a time
+# # Maximum number of iterations to train for. It takes precedence over num_epochs, which means that
+# # if both are set, the full num_epochs is not guaranteed to run.
+# # e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
+# max_steps:
+
+# eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
+# eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
+
+# # Save model as safetensors (require safetensors package)
+# save_safetensors:
+
+# # Whether to mask out or include the human's prompt from the training labels
+# train_on_inputs: false
+# # Group similarly sized data to minimize padding.
+# # May be slower to start, as it must download and sort the entire dataset.
+# # Note that training loss may have an oscillating pattern with this enabled.
+# group_by_length: false
+
+# # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
+# gradient_checkpointing: false
+
+# # Stop training after this many evaluation losses have increased in a row
+# # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
+# early_stopping_patience: 3
+
+# # Specify a scheduler and kwargs to use with the optimizer
+# lr_scheduler: # 'one_cycle' | empty for cosine
+# lr_scheduler_kwargs:
+
+# # For one_cycle optim
+# lr_div_factor: # Learning rate div factor
+
+# # Specify optimizer
+# # Valid values are driven by the Transformers OptimizerNames class, see:
+# # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
+# #
+# # Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
+# # torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
+# # in the examples/ for your model and fine-tuning use case.
+# #
+# # Valid values for 'optimizer' include:
+# # - adamw_hf
+# # - adamw_torch
+# # - adamw_torch_fused
+# # - adamw_torch_xla
+# # - adamw_apex_fused
+# # - adafactor
+# # - adamw_anyprecision
+# # - sgd
+# # - adagrad
+# # - adamw_bnb_8bit
+# # - lion_8bit
+# # - lion_32bit
+# # - paged_adamw_32bit
+# # - paged_adamw_8bit
+# # - paged_lion_32bit
+# # - paged_lion_8bit
+# optimizer:
+# # Specify weight decay
+# weight_decay:
+# # adamw hyperparams
+# adam_beta1:
+# adam_beta2:
+# adam_epsilon:
+# # Gradient clipping max norm
+# max_grad_norm:
+
+# # Augmentation techniques
+# # NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings
+# # currently only supported on Llama and Mistral
+# noisy_embedding_alpha:
+
+# # Whether to use BetterTransformer
+# flash_optimum:
+# # Whether to use xformers attention patch: https://github.com/facebookresearch/xformers
+# xformers_attention:
+# # Whether to use flash attention patch: https://github.com/Dao-AILab/flash-attention
+# flash_attention:
+# flash_attn_cross_entropy:  # Whether to use flash-attention cross entropy implementation - advanced use only
+# flash_attn_rms_norm:  # Whether to use flash-attention rms norm implementation - advanced use only
+# flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
+# flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
+# # Whether to use scaled-dot-product attention
+# # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
+# sdp_attention:
+# # Landmark attention (only llama)
+# landmark_attention:
+# # xpos RoPE see https://github.com/kaiokendev/cutoff-len-is-context-len/blob/main/util/xpos_rope_llama_monkey_patch.py
+# # LLaMA only
+# xpos_rope:
+
+# # Resume from a specific checkpoint dir
+# resume_from_checkpoint:
+# # If resume_from_checkpoint isn't set and you simply want it to start where it left off.
+# # Be careful with this being turned on between different models.
+# auto_resume_from_checkpoints: false
+
+# # Don't mess with this, it's here for accelerate and torchrun
+# local_rank:
+
+# # Add or change special tokens.
+# # If you add tokens here, you don't need to add them to the `tokens` list.
+# special_tokens:
+#   # bos_token: "<s>"
+#   # eos_token: "</s>"
+#   # unk_token: "<unk>"
+
+# # Add extra tokens.
+# tokens:
+
+# # FSDP
+# fsdp:
+# fsdp_config:
+
+# # Deepspeed config path. e.g., deepspeed/zero3.json
+# deepspeed:
+
+# # Advanced DDP Arguments
+# ddp_timeout:
+# ddp_bucket_cap_mb:
+# ddp_broadcast_buffers:
+
+# # Path to torch distx for optim 'adamw_anyprecision'
+# torchdistx_path:
+
+# # Set to an HF dataset (type: 'completion') to stream it instead of pre-tokenizing
+# pretraining_dataset:
+
+# # Debug mode
+# debug:
+
+# # Seed
+# seed:
+
+# # Allow overwriting the yml config from the CLI
+# strict:
+
+
+
+base_model: ${BASE_MODEL}
+base_model_ignore_patterns: ${BASE_MODEL_IGNORE_PATTERNS}
+base_model_config: ${BASE_MODEL_CONFIG}
+revision_of_model: ${REVISION_OF_MODEL}
+tokenizer_config: ${TOKENIZER_CONFIG}
+model_type: ${MODEL_TYPE}
+tokenizer_type: ${TOKENIZER_TYPE}
+trust_remote_code: ${TRUST_REMOTE_CODE}
+tokenizer_use_fast: ${TOKENIZER_USE_FAST}
+tokenizer_legacy: ${TOKENIZER_LEGACY}
+resize_token_embeddings_to_32x: ${RESIZE_TOKEN_EMBEDDINGS_TO_32X}
+
+is_falcon_derived_model: ${IS_FALCON_DERIVED_MODEL}
+is_llama_derived_model: ${IS_LLAMA_DERIVED_MODEL}
+is_qwen_derived_model: ${IS_QWEN_DERIVED_MODEL}
+is_mistral_derived_model: ${IS_MISTRAL_DERIVED_MODEL}
+
+overrides_of_model_config:
+  rope_scaling:
+    type: ${ROPE_SCALING_TYPE}
+    factor: ${ROPE_SCALING_FACTOR}
+
+bnb_config_kwargs:
+  llm_int8_has_fp16_weight: ${BNB_LLM_INT8_HAS_FP16_WEIGHT}
+  bnb_4bit_quant_type: ${BNB_4BIT_QUANT_TYPE}
+  bnb_4bit_use_double_quant: ${BNB_4BIT_USE_DOUBLE_QUANT}
+
+gptq: ${GPTQ}
+load_in_8bit: ${LOAD_IN_8BIT}
+load_in_4bit: ${LOAD_IN_4BIT}
+bf16: ${BF16}
+fp16: ${FP16}
+tf32: ${TF32}
+bfloat16: ${BFLOAT16}
+float16: ${FLOAT16}
+
+gpu_memory_limit: ${GPU_MEMORY_LIMIT}
+lora_on_cpu: ${LORA_ON_CPU}
+
+datasets:
+  - path: ${DATASET_PATH}
+    type: ${DATASET_TYPE}
+    ds_type: ${DATASET_DS_TYPE}
+    data_files: ${DATASET_DATA_FILES}
+    shards: ${DATASET_SHARDS}
+    name: ${DATASET_NAME}
+    train_on_split: ${DATASET_TRAIN_ON_SPLIT}
+    revision: ${DATASET_REVISION}
+    trust_remote_code: ${DATASET_TRUST_REMOTE_CODE}
+
+rl: ${RL}
+dpo_use_weighting: ${DPO_USE_WEIGHTING}
+
+chat_template: ${CHAT_TEMPLATE}
+chat_template_jinja: ${CHAT_TEMPLATE_JINJA}
+default_system_message: ${DEFAULT_SYSTEM_MESSAGE}
+dataset_prepared_path: ${DATASET_PREPARED_PATH}
+push_dataset_to_hub: ${PUSH_DATASET_TO_HUB}
+dataset_processes: ${DATASET_PROCESSES}
+dataset_keep_in_memory: ${DATASET_KEEP_IN_MEMORY}
+hub_model_id: ${HUB_MODEL_ID}
+hub_strategy: ${HUB_STRATEGY}
+hf_use_auth_token: ${HF_USE_AUTH_TOKEN}
+val_set_size: ${VAL_SET_SIZE}
+dataset_shard_num: ${DATASET_SHARD_NUM}
+dataset_shard_idx: ${DATASET_SHARD_IDX}
+
+sequence_len: ${SEQUENCE_LEN}
+pad_to_sequence_len: ${PAD_TO_SEQUENCE_LEN}
+sample_packing: ${SAMPLE_PACKING}
+eval_sample_packing: ${EVAL_SAMPLE_PACKING}
+sample_packing_eff_est: ${SAMPLE_PACKING_EFF_EST}
+total_num_tokens: ${TOTAL_NUM_TOKENS}
+sample_packing_group_size: ${SAMPLE_PACKING_GROUP_SIZE}
+sample_packing_bin_size: ${SAMPLE_PACKING_BIN_SIZE}
+
+batch_flattening: ${BATCH_FLATTENING}
+device_map: ${DEVICE_MAP}
+max_memory: ${MAX_MEMORY}
+
+adapter: ${ADAPTER}
+lora_model_dir: ${LORA_MODEL_DIR}
+
+lora_r: ${LORA_R}
+lora_alpha: ${LORA_ALPHA}
+lora_dropout: ${LORA_DROPOUT}
+# NOTE(review): unlike the other list-valued options in this template, the placeholder
+# here sits inside a one-item sequence, so whatever it expands to becomes a single list
+# element. If LORA_TARGET_MODULES can hold several modules, confirm the rendered result
+# is the intended flat list rather than one string or a nested list.
+lora_target_modules:
+  - ${LORA_TARGET_MODULES}
+lora_target_linear: ${LORA_TARGET_LINEAR}
+peft_layers_to_transform: ${PEFT_LAYERS_TO_TRANSFORM}
+lora_modules_to_save: ${LORA_MODULES_TO_SAVE}
+lora_fan_in_fan_out: ${LORA_FAN_IN_FAN_OUT}
+
+loraplus_lr_ratio: ${LORAPLUS_LR_RATIO}
+loraplus_lr_embedding: ${LORAPLUS_LR_EMBEDDING}
+
+peft:
+  loftq_config:
+    loftq_bits: ${LOFTQ_BITS}
+
+relora_steps: ${RELORA_STEPS}
+relora_warmup_steps: ${RELORA_WARMUP_STEPS}
+relora_anneal_steps: ${RELORA_ANNEAL_STEPS}
+relora_prune_ratio: ${RELORA_PRUNE_RATIO}
+relora_cpu_offload: ${RELORA_CPU_OFFLOAD}
+
+wandb_mode: ${WANDB_MODE}
+wandb_project: ${WANDB_PROJECT}
+wandb_entity: ${WANDB_ENTITY}
+wandb_watch: ${WANDB_WATCH}
+wandb_name: ${WANDB_NAME}
+wandb_run_id: ${WANDB_RUN_ID}
+wandb_log_model: ${WANDB_LOG_MODEL}
+
+mlflow_tracking_uri: ${MLFLOW_TRACKING_URI}
+mlflow_experiment_name: ${MLFLOW_EXPERIMENT_NAME}
+mlflow_run_name: ${MLFLOW_RUN_NAME}
+hf_mlflow_log_artifacts: ${HF_MLFLOW_LOG_ARTIFACTS}
+
+use_comet: ${USE_COMET}
+comet_api_key: ${COMET_API_KEY}
+comet_workspace: ${COMET_WORKSPACE}
+comet_project_name: ${COMET_PROJECT_NAME}
+comet_experiment_key: ${COMET_EXPERIMENT_KEY}
+comet_mode: ${COMET_MODE}
+comet_online: ${COMET_ONLINE}
+comet_experiment_config: ${COMET_EXPERIMENT_CONFIG}
+
+output_dir: ${OUTPUT_DIR}
+
+torch_compile: ${TORCH_COMPILE}
+torch_compile_backend: ${TORCH_COMPILE_BACKEND}
+
+gradient_accumulation_steps: ${GRADIENT_ACCUMULATION_STEPS}
+micro_batch_size: ${MICRO_BATCH_SIZE}
+eval_batch_size: ${EVAL_BATCH_SIZE}
+num_epochs: ${NUM_EPOCHS}
+warmup_steps: ${WARMUP_STEPS}
+warmup_ratio: ${WARMUP_RATIO}
+learning_rate: ${LEARNING_RATE}
+lr_quadratic_warmup: ${LR_QUADRATIC_WARMUP}
+logging_steps: ${LOGGING_STEPS}
+eval_steps: ${EVAL_STEPS}
+evals_per_epoch: ${EVALS_PER_EPOCH}
+save_strategy: ${SAVE_STRATEGY}
+save_steps: ${SAVE_STEPS}
+saves_per_epoch: ${SAVES_PER_EPOCH}
+save_total_limit: ${SAVE_TOTAL_LIMIT}
+max_steps: ${MAX_STEPS}
+
+eval_table_size: ${EVAL_TABLE_SIZE}
+eval_max_new_tokens: ${EVAL_MAX_NEW_TOKENS}
+eval_causal_lm_metrics: ${EVAL_CAUSAL_LM_METRICS}
+
+profiler_steps: ${PROFILER_STEPS}
+loss_watchdog_threshold: ${LOSS_WATCHDOG_THRESHOLD}
+loss_watchdog_patience: ${LOSS_WATCHDOG_PATIENCE}
+
+save_safetensors: ${SAVE_SAFETENSORS}
+train_on_inputs: ${TRAIN_ON_INPUTS}
+group_by_length: ${GROUP_BY_LENGTH}
+gradient_checkpointing: ${GRADIENT_CHECKPOINTING}
+early_stopping_patience: ${EARLY_STOPPING_PATIENCE}
+
+lr_scheduler: ${LR_SCHEDULER}
+lr_scheduler_kwargs: ${LR_SCHEDULER_KWARGS}
+cosine_min_lr_ratio: ${COSINE_MIN_LR_RATIO}
+cosine_constant_lr_ratio: ${COSINE_CONSTANT_LR_RATIO}
+lr_div_factor: ${LR_DIV_FACTOR}
+
+optimizer: ${OPTIMIZER}
+optim_args: ${OPTIM_ARGS}
+optim_target_modules: ${OPTIM_TARGET_MODULES}
+weight_decay: ${WEIGHT_DECAY}
+adam_beta1: ${ADAM_BETA1}
+adam_beta2: ${ADAM_BETA2}
+adam_epsilon: ${ADAM_EPSILON}
+max_grad_norm: ${MAX_GRAD_NORM}
+
+neftune_noise_alpha: ${NEFTUNE_NOISE_ALPHA}
+
+flash_optimum: ${FLASH_OPTIMUM}
+xformers_attention: ${XFORMERS_ATTENTION}
+flash_attention: ${FLASH_ATTENTION}
+flash_attn_cross_entropy: ${FLASH_ATTN_CROSS_ENTROPY}
+flash_attn_rms_norm: ${FLASH_ATTN_RMS_NORM}
+flash_attn_fuse_qkv: ${FLASH_ATTN_FUSE_QKV}
+flash_attn_fuse_mlp: ${FLASH_ATTN_FUSE_MLP}
+sdp_attention: ${SDP_ATTENTION}
+s2_attention: ${S2_ATTENTION}
+resume_from_checkpoint: ${RESUME_FROM_CHECKPOINT}
+auto_resume_from_checkpoints: ${AUTO_RESUME_FROM_CHECKPOINTS}
+
+local_rank: ${LOCAL_RANK}
+
+special_tokens:
+  bos_token: ${SPECIAL_TOKEN_BOS}
+  eos_token: ${SPECIAL_TOKEN_EOS}
+  unk_token: ${SPECIAL_TOKEN_UNK}
+  pad_token: ${SPECIAL_TOKEN_PAD}
+
+tokens: ${TOKENS}
+
+fsdp: ${FSDP}
+fsdp_config: ${FSDP_CONFIG}
+deepspeed: ${DEEPSPEED}
+
+ddp_timeout: ${DDP_TIMEOUT}
+ddp_bucket_cap_mb: ${DDP_BUCKET_CAP_MB}
+ddp_broadcast_buffers: ${DDP_BROADCAST_BUFFERS}
+
+torchdistx_path: ${TORCHDISTX_PATH}
+pretraining_dataset: ${PRETRAINING_DATASET}
+debug: ${DEBUG}
+seed: ${SEED}
+strict: ${STRICT}
--- a/.runpod/src/handler.py
+++ b/.runpod/src/handler.py
@@ -0,0 +1,66 @@
+"""
+Runpod serverless entrypoint handler
+"""
+
+import os
+
+import runpod
+import yaml
+from huggingface_hub._login import login
+from train import train
+from utils import get_output_dir
+
+# Root directory for job outputs; overridable through the BASE_VOLUME environment variable.
+BASE_VOLUME = os.environ.get("BASE_VOLUME", "/runpod-volume")
+# exist_ok=True removes the race between a separate existence check and the creation.
+os.makedirs(BASE_VOLUME, exist_ok=True)
+
+logger = runpod.RunPodLogger()
+
+
+async def handler(job):
+    """
+    Run one serverless fine-tuning job
+
+    Reads `run_id`, `args` and `credentials` from `job["input"]`, writes `args` (plus the
+    output directory, run name and job id) to a YAML config file, exports any supplied
+    credentials as environment variables, then logs every message yielded by `train`.
+
+    :param job: Job payload; must contain the keys "id" and "input"
+    """
+    runpod_job_id = job["id"]
+    inputs = job["input"]
+    run_id = inputs.get("run_id", "default_run_id")
+    # Copy so the caller's payload is not mutated; `or {}` also covers an explicit null
+    args = dict(inputs.get("args") or {})
+
+    # Force the output directory under BASE_VOLUME and record run name / job id
+    args["output_dir"] = os.path.join(BASE_VOLUME, get_output_dir(run_id))
+    args["run_name"] = run_id
+    args["runpod_job_id"] = runpod_job_id
+
+    # Save args to the config file whose path is handed to train()
+    config_path = "/workspace/test_config.yaml"
+    with open(config_path, "w", encoding="utf-8") as file:
+        yaml.dump(args, file, default_flow_style=False)
+
+    # Handle credentials
+    credentials = inputs.get("credentials") or {}
+
+    try:
+        if "wandb_api_key" in credentials:
+            os.environ["WANDB_API_KEY"] = credentials["wandb_api_key"]
+        if "hf_token" in credentials:
+            os.environ["HF_TOKEN"] = credentials["hf_token"]
+
+        if os.environ.get("HF_TOKEN"):
+            login(token=os.environ["HF_TOKEN"])
+        else:
+            logger.info("No HF_TOKEN provided. Skipping login.")
+
+        logger.info("Starting Training.")
+        async for result in train(config_path):  # Pass the config path instead of args
+            logger.info(result)
+        logger.info("Training Complete.")
+    finally:
+        # Cleanup runs even when login or training raises, so one job's credentials
+        # never stay in the process environment for whatever runs next.
+        # NOTE(review): this also removes values that were already present in the
+        # environment before the job started - confirm that is intended.
+        os.environ.pop("WANDB_API_KEY", None)
+        os.environ.pop("HF_TOKEN", None)
+
+
+runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})
--- a/.runpod/src/test_input.json
+++ b/.runpod/src/test_input.json
@@ -0,0 +1,61 @@
+{
+  "input": {
+    "user_id": "user",
+    "model_id": "llama-test",
+    "run_id": "llama-test",
+    "credentials": {
+      "wandb_api_key": "",
+      "hf_token": ""
+    },
+    "args": {
+      "base_model": "NousResearch/Meta-Llama-3-8B",
+      "model_type": "LlamaForCausalLM",
+      "tokenizer_type": "AutoTokenizer",
+      "load_in_8bit": true,
+      "load_in_4bit": false,
+      "strict": false,
+      "datasets": [
+        {
+          "path": "mhenrichsen/alpaca_2k_test",
+          "type": "alpaca"
+        }
+      ],
+      "val_set_size": 0.05,
+      "output_dir": "./outputs/lora-out",
+      "sequence_len": 4096,
+      "sample_packing": true,
+      "eval_sample_packing": false,
+      "pad_to_sequence_len": true,
+      "adapter": "lora",
+      "lora_r": 32,
+      "lora_alpha": 16,
+      "lora_dropout": 0.05,
+      "lora_target_linear": true,
+      "lora_modules_to_save": [
+        "embed_tokens",
+        "lm_head"
+      ],
+      "gradient_accumulation_steps": 4,
+      "micro_batch_size": 2,
+      "num_epochs": 1,
+      "optimizer": "adamw_bnb_8bit",
+      "lr_scheduler": "cosine",
+      "learning_rate": 0.0002,
+      "train_on_inputs": false,
+      "group_by_length": false,
+      "bf16": "auto",
+      "tf32": false,
+      "gradient_checkpointing": true,
+      "logging_steps": 1,
+      "flash_attention": true,
+      "warmup_steps": 1,
+      "evals_per_epoch": 1,
+      "eval_max_new_tokens": 128,
+      "saves_per_epoch": 1,
+      "weight_decay": 0.0,
+      "special_tokens": {
+        "pad_token": "<|end_of_text|>"
+      }
+    }
+  }
+}
--- a/.runpod/src/train.py
+++ b/.runpod/src/train.py
@@ -0,0 +1,45 @@
+"""
+Runpod train entrypoint
+"""
+
+import asyncio
+
+
+async def _stream_command(command: str, label: str):
+    """
+    Run a shell command and yield its combined stdout/stderr line by line
+    :param command: Shell command line to execute
+    :param label: Prefix for every yielded line, also used in the failure message
+    :raises RuntimeError: If the command exits with a non-zero status
+    """
+    process = await asyncio.create_subprocess_shell(
+        command,
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.STDOUT,
+    )
+
+    if process.stdout is not None:
+        async for line in process.stdout:
+            # errors="replace" keeps one undecodable byte from aborting the stream
+            text = line.decode(errors="replace").strip()
+            yield f"{label}: {text}"
+
+    # Surface failures instead of letting the caller report success
+    returncode = await process.wait()
+    if returncode != 0:
+        raise RuntimeError(f"{label} failed with exit code {returncode}")
+
+
+async def train(config_path: str, gpu_id: str = "0", preprocess: bool = True):
+    """
+    Run preprocessing (if enabled) and training with the given config file
+    :param config_path: Path to the YAML config file
+    :param gpu_id: GPU ID to use (default: "0"); only applied to the preprocessing command
+    :param preprocess: Whether to run preprocessing (default: True)
+    :raises RuntimeError: If preprocessing or training exits with a non-zero status
+
+    """
+    # First check if preprocessing is needed
+    if preprocess:
+        preprocess_cmd = (
+            f"CUDA_VISIBLE_DEVICES={gpu_id} axolotl preprocess {config_path}"
+        )
+        async for message in _stream_command(preprocess_cmd, "Preprocessing"):
+            yield message
+        yield "Preprocessing completed."
+    else:
+        yield "Skipping preprocessing step."
+
+    # Training command
+    train_cmd = f"axolotl train {config_path}"
+    async for message in _stream_command(train_cmd, "Training"):
+        yield message
--- a/.runpod/src/utils.py
+++ b/.runpod/src/utils.py
@@ -0,0 +1,89 @@
+"""
+Runpod launcher utils
+"""
+
+import os
+
+import yaml
+
+
+def get_output_dir(run_id):
+    """Return the relative output path for a run: `fine-tuning/<run_id>`"""
+    return f"fine-tuning/{run_id}"
+
+
+def make_valid_config(input_args):
+    """
+    Creates and saves updated config file, returns the path to the new config
+
+    The defaults are read from `config/config.yaml` and `input_args` is applied on top
+    with `dict.update`, i.e. a shallow merge: a nested mapping in `input_args` replaces
+    the default mapping under the same key as a whole.
+
+    :param input_args: dict of input args
+    :return: str, path to the updated config file
+    """
+    # Load default config; an empty file parses to None, so fall back to an empty dict
+    with open("config/config.yaml", "r", encoding="utf-8") as fin:
+        all_args = yaml.safe_load(fin) or {}
+
+    if input_args:
+        all_args.update(input_args)
+    else:
+        print("No args provided, using defaults")
+
+    # Save updated config to new file
+    updated_config_path = "config/updated_config.yaml"
+    with open(updated_config_path, "w", encoding="utf-8") as fout:
+        yaml.dump(all_args, fout)
+
+    return updated_config_path
+
+
+def set_config_env_vars(args: dict):
+    """
+    Convert API arguments into environment variables.
+    Handles nested dictionaries, lists, and special values.
+
+    Naming: keys are upper-cased; keys of a nested dict are appended after "_"
+    (special_tokens.pad_token -> SPECIAL_TOKENS_PAD_TOKEN); entries of a list of
+    dicts additionally carry their index (datasets[0].path -> DATASETS_0_PATH).
+
+    Args:
+        args (dict): The arguments dictionary from the API request
+    """
+
+    def process_value(value):
+        """Convert Python values to string format for environment variables"""
+        if value is None:
+            return ""
+        if isinstance(value, bool):
+            # Lower-case so booleans read as true/false rather than True/False
+            return str(value).lower()
+        # Everything else, including plain lists, uses its str() form
+        return str(value)
+
+    def set_env_vars(data, prefix=""):
+        """Recursively set environment variables from nested dictionary"""
+        for key, value in data.items():
+            env_key = prefix + key.upper()
+
+            if isinstance(value, dict):
+                # For nested dictionaries (like special_tokens)
+                set_env_vars(value, f"{env_key}_")
+            elif (
+                isinstance(value, list)
+                and value
+                and all(isinstance(item, dict) for item in value)
+            ):
+                # List of dictionaries (like datasets). Every element is checked, not
+                # just the first, so a mixed list falls through to the plain-value
+                # branch instead of failing on `.items()`.
+                for i, item in enumerate(value):
+                    set_env_vars(item, f"{env_key}_{i}_")
+            else:
+                # Simple lists (like lora_target_modules) and all scalar values
+                os.environ[env_key] = process_value(value)
+
+    # Clear any existing related environment variables
+    # This prevents old values from persisting
+    # NOTE(review): the "WANDB_" prefix also matches WANDB_API_KEY - confirm that
+    # dropping it here is intended.
+    stale_prefixes = (
+        "BASE_MODEL",
+        "MODEL_TYPE",
+        "TOKENIZER_TYPE",
+        "DATASET",
+        "LORA_",
+        "WANDB_",
+    )
+    for key in list(os.environ.keys()):
+        if key.startswith(stale_prefixes):
+            del os.environ[key]
+
+    # Set new environment variables
+    set_env_vars(args)
--- a/.runpod/test-input.json
+++ b/.runpod/test-input.json
@@ -0,0 +1,86 @@
+{
+  "input": {
+    "name": "quick_smoke_test_sft",
+    "user_id": "user",
+    "model_id": "llama-test",
+    "run_id": "llama-test",
+    "credentials": {
+      "wandb_api_key": "",
+      "hf_token": ""
+    },
+    "args": {
+      "base_model": "HuggingFaceTB/SmolLM2-135M",
+      "model_type": "AutoModelForCausalLM",
+      "tokenizer_type": "AutoTokenizer",
+      "load_in_4bit": true,
+      "strict": false,
+      "datasets": [
+        {
+          "path": "mhenrichsen/alpaca_2k_test",
+          "type": "alpaca",
+          "split": "train[:10%]"
+        }
+      ],
+      "val_set_size": 0.02,
+      "output_dir": "./outputs/lora-out",
+      "sequence_len": 4096,
+      "sample_packing": true,
+      "eval_sample_packing": false,
+      "pad_to_sequence_len": true,
+      "adapter": "qlora",
+      "lora_r": 32,
+      "lora_alpha": 64,
+      "lora_dropout": 0.05,
+      "lora_target_linear": true,
+      "lora_modules_to_save": [
+        "embed_tokens",
+        "lm_head"
+      ],
+      "gradient_accumulation_steps": 2,
+      "micro_batch_size": 1,
+      "num_epochs": 1,
+      "optimizer": "adamw_torch_fused",
+      "lr_scheduler": "cosine",
+      "learning_rate": 0.0002,
+      "train_on_inputs": false,
+      "group_by_length": false,
+      "bf16": "auto",
+      "tf32": true,
+      "gradient_checkpointing": true,
+      "logging_steps": 1,
+      "flash_attention": true,
+      "warmup_steps": 1,
+      "evals_per_epoch": 1,
+      "eval_max_new_tokens": 128,
+      "saves_per_epoch": 1,
+      "weight_decay": 0.0,
+      "special_tokens": {
+        "pad_token": "<|endoftext|>"
+      },
+      "max_steps": 20
+    },
+    "timeout": 100000
+  },
+  "config": {
+    "gpuTypeId": "NVIDIA GeForce RTX 4090",
+    "gpuCount": 1,
+    "containerDiskInGb": 200,
+    "env": [
+      {
+        "key": "TOKENIZER",
+        "value": ""
+      },
+      {
+        "key": "DISABLE_LOG_STATS",
+        "value": "true"
+      }
+    ],
+    "allowedCudaVersions": [
+      "12.8",
+      "12.7",
+      "12.6",
+      "12.5",
+      "12.4"
+    ]
+  }
+}
--- a/.runpod/tests.json
+++ b/.runpod/tests.json
@@ -0,0 +1,90 @@
+{
+  "tests": [
+    {
+      "name": "quick_smoke_test_sft",
+      "input": {
+        "user_id": "user",
+        "model_id": "llama-test",
+        "run_id": "llama-test",
+        "credentials": {
+          "wandb_api_key": "",
+          "hf_token": ""
+        },
+        "args": {
+          "base_model": "HuggingFaceTB/SmolLM2-135M",
+          "model_type": "AutoModelForCausalLM",
+          "tokenizer_type": "AutoTokenizer",
+          "load_in_4bit": true,
+          "strict": false,
+          "datasets": [
+            {
+              "path": "mhenrichsen/alpaca_2k_test",
+              "type": "alpaca",
+              "split": "train[:10%]"
+            }
+          ],
+          "val_set_size": 0.02,
+          "output_dir": "./outputs/lora-out",
+          "sequence_len": 4096,
+          "sample_packing": true,
+          "eval_sample_packing": false,
+          "pad_to_sequence_len": true,
+          "adapter": "qlora",
+          "lora_r": 32,
+          "lora_alpha": 64,
+          "lora_dropout": 0.05,
+          "lora_target_linear": true,
+          "lora_modules_to_save": [
+            "embed_tokens",
+            "lm_head"
+          ],
+          "gradient_accumulation_steps": 2,
+          "micro_batch_size": 1,
+          "num_epochs": 1,
+          "optimizer": "adamw_torch_fused",
+          "lr_scheduler": "cosine",
+          "learning_rate": 0.0002,
+          "train_on_inputs": false,
+          "group_by_length": false,
+          "bf16": "auto",
+          "tf32": true,
+          "gradient_checkpointing": true,
+          "logging_steps": 1,
+          "flash_attention": true,
+          "warmup_steps": 1,
+          "evals_per_epoch": 1,
+          "eval_max_new_tokens": 128,
+          "saves_per_epoch": 1,
+          "weight_decay": 0.0,
+          "special_tokens": {
+            "pad_token": "<|endoftext|>"
+          },
+          "max_steps": 20
+        }
+      },
+      "timeout": 100000
+    }
+  ],
+  "config": {
+    "gpuTypeId": "NVIDIA GeForce RTX 4090",
+    "gpuCount": 1,
+    "containerDiskInGb": 200,
+    "env": [
+      {
+        "key": "TOKENIZER",
+        "value": ""
+      },
+      {
+        "key": "DISABLE_LOG_STATS",
+        "value": "true"
+      }
+    ],
+    "allowedCudaVersions": [
+      "12.8",
+      "12.7",
+      "12.6",
+      "12.5",
+      "12.4"
+    ]
+  }
+}
--- a/1
+++ b/1
@@ -0,0 +1 @@
+docs.axolotl.ai
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,4 +2,5 @@ include requirements.txt
 include README.md
 include LICENSE
 include src/setuptools_axolotl_dynamic_dependencies.py
+include src/axolotl/utils/chat_templates/templates/*.jinja
 recursive-include axolotl *.py
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@
 <p align="center">
    <img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests">
+    <a href="https://codecov.io/gh/axolotl-ai-cloud/axolotl"><img src="https://codecov.io/gh/axolotl-ai-cloud/axolotl/branch/main/graph/badge.svg" alt="codecov"></a>
    <a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a>
    <br/>
    <a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors"><img src="https://img.shields.io/github/contributors-anon/axolotl-ai-cloud/axolotl?color=yellow&style=flat-square" alt="contributors" style="height: 20px;"></a>
@@ -21,28 +22,32 @@
    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
 </p>

-Axolotl is a tool designed to streamline post-training for various AI models.
-Post-training refers to any modifications or additional training performed on
-pre-trained models - including full model fine-tuning, parameter-efficient tuning (like
-LoRA and QLoRA), supervised fine-tuning (SFT), instruction tuning, and alignment
-techniques. With support for multiple model architectures and training configurations,
-Axolotl makes it easy to get started with these techniques.

-Axolotl is designed to work with YAML config files that contain everything you need to
-preprocess a dataset, train or fine-tune a model, run model inference or evaluation,
-and much more.
+## 🎉 Latest Updates
+
+- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral) to start training your own Magistral models with Axolotl!
+- 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
+- 2025/04: Llama 4 support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-4) to start training your own Llama 4 models with Axolotl's linearized version!
+- 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning.
+- 2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the [docs](https://docs.axolotl.ai/docs/multimodal.html) to fine-tune your own!
+- 2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the [docs](https://docs.axolotl.ai/docs/lora_optims.html) to give it a try.
+- 2025/02: Axolotl has added GRPO support. Dive into our [blog](https://huggingface.co/blog/axolotl-ai-co/training-llms-w-interpreter-feedback-wasm) and [GRPO example](https://github.com/axolotl-ai-cloud/grpo_code) and have some fun!
+- 2025/01: Axolotl has added Reward Modelling / Process Reward Modelling fine-tuning support. See [docs](https://docs.axolotl.ai/docs/reward_modelling.html).
+
+## ✨ Overview
+
+Axolotl is a tool designed to streamline post-training for various AI models.

 Features:

- Train various Huggingface models such as llama, pythia, falcon, mpt
- Supports fullfinetune, lora, qlora, relora, and gptq
- Customize configurations using a simple yaml file or CLI overwrite
- Load different dataset formats, use custom formats, or bring your own tokenized datasets
- Integrated with [xformers](https://github.com/facebookresearch/xformers), flash attention, [liger kernel](https://github.com/linkedin/Liger-Kernel), rope scaling, and multipacking
- Works with single GPU or multiple GPUs via FSDP or Deepspeed
- Easily run with Docker locally or on the cloud
- Log results and optionally checkpoints to wandb, mlflow or Comet
- And more!
+- **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more. We are compatible with HuggingFace transformers causal language models.
+- **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO), Multimodal, and Reward Modelling (RM) / Process Reward Modelling (PRM).
+- **Easy Configuration**: Re-use a single YAML file between dataset preprocess, training, evaluation, quantization, and inference.
+- **Performance Optimizations**: [Multipacking](https://docs.axolotl.ai/docs/multipack.html), [Flash Attention](https://github.com/Dao-AILab/flash-attention), [Xformers](https://github.com/facebookresearch/xformers), [Flex Attention](https://pytorch.org/blog/flexattention/), [Liger Kernel](https://github.com/linkedin/Liger-Kernel), [Cut Cross Entropy](https://github.com/apple/ml-cross-entropy/tree/main), [Sequence Parallelism (SP)](https://docs.axolotl.ai/docs/sequence_parallelism.html), [LoRA optimizations](https://docs.axolotl.ai/docs/lora_optims.html), [Multi-GPU training (FSDP1, FSDP2, DeepSpeed)](https://docs.axolotl.ai/docs/multi-gpu.html), [Multi-node training (Torchrun, Ray)](https://docs.axolotl.ai/docs/multi-node.html), and many more!
+- **Flexible Dataset Handling**: Load from local, HuggingFace, and cloud (S3, Azure, GCP, OCI) datasets.
+- **Cloud Ready**: We ship [Docker images](https://hub.docker.com/u/axolotlai) and also [PyPI packages](https://pypi.org/project/axolotl/) for use on cloud platforms and local hardware.
+
+

 ## 🚀 Quick Start

@@ -50,10 +55,12 @@ Features:

 - NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
 - Python 3.11
- PyTorch ≥2.4.1
+- PyTorch ≥2.6.0

 ### Installation

+#### Using pip
+
 ```bash
 pip3 install -U packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
@@ -63,7 +70,14 @@ axolotl fetch examples
 axolotl fetch deepspeed_configs  # OPTIONAL
 ```

-Other installation approaches are described [here](https://axolotl-ai-cloud.github.io/axolotl/docs/installation.html).
+#### Using Docker
+
+Installing with Docker can be less error-prone than installing in your own environment.
+```bash
+docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
+```
+
+Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).

 ### Your First Fine-tune

@@ -78,64 +92,32 @@ axolotl fetch examples --dest path/to/folder
 axolotl train examples/llama-3/lora-1b.yml
 ```

-That's it! Check out our [Getting Started Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/getting-started.html) for a more detailed walkthrough.
+That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough.

-## ✨ Key Features
-
- **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more
- **Training Methods**: Full fine-tuning, LoRA, QLoRA, and more
- **Easy Configuration**: Simple YAML files to control your training setup
- **Performance Optimizations**: Flash Attention, xformers, multi-GPU training
- **Flexible Dataset Handling**: Use various formats and custom datasets
- **Cloud Ready**: Run on cloud platforms or local hardware

 ## 📚 Documentation

- [Installation Options](https://axolotl-ai-cloud.github.io/axolotl/docs/installation.html) - Detailed setup instructions for different environments
- [Configuration Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html) - Full configuration options and examples
- [Dataset Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/) - Supported formats and how to use them
- [Multi-GPU Training](https://axolotl-ai-cloud.github.io/axolotl/docs/multi-gpu.html)
- [Multi-Node Training](https://axolotl-ai-cloud.github.io/axolotl/docs/multi-node.html)
- [Multipacking](https://axolotl-ai-cloud.github.io/axolotl/docs/multipack.html)
- [API Reference](https://axolotl-ai-cloud.github.io/axolotl/docs/api/) - Auto-generated code documentation
- [FAQ](https://axolotl-ai-cloud.github.io/axolotl/docs/faq.html) - Frequently asked questions
+- [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments
+- [Configuration Guide](https://docs.axolotl.ai/docs/config-reference.html) - Full configuration options and examples
+- [Dataset Loading](https://docs.axolotl.ai/docs/dataset_loading.html) - Loading datasets from various sources
+- [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them
+- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
+- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
+- [Multipacking](https://docs.axolotl.ai/docs/multipack.html)
+- [API Reference](https://docs.axolotl.ai/docs/api/) - Auto-generated code documentation
+- [FAQ](https://docs.axolotl.ai/docs/faq.html) - Frequently asked questions

 ## 🤝 Getting Help

 - Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support
 - Check out our [Examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/) directory
- Read our [Debugging Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/debugging.html)
+- Read our [Debugging Guide](https://docs.axolotl.ai/docs/debugging.html)
 - Need dedicated support? Please contact [✉️wing@axolotl.ai](mailto:wing@axolotl.ai) for options

 ## 🌟 Contributing

 Contributions are welcome! Please see our [Contributing Guide](https://github.com/axolotl-ai-cloud/axolotl/blob/main/.github/CONTRIBUTING.md) for details.

-## Supported Models
-
-|             | fp16/fp32 | lora | qlora | gptq | gptq w/flash attn | flash attn | xformers attn |
-|-------------|:----------|:-----|-------|------|-------------------|------------|--------------|
-| llama       | ✅         | ✅    | ✅     | ✅             | ✅                 | ✅          | ✅            |
-| Mistral     | ✅         | ✅    | ✅     | ✅             | ✅                 | ✅          | ✅            |
-| Mixtral-MoE | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
-| Mixtral8X22 | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
-| Pythia      | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
-| cerebras    | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
-| btlm        | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
-| mpt         | ✅         | ❌    | ❓     | ❌             | ❌                 | ❌          | ❓            |
-| falcon      | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
-| gpt-j       | ✅         | ✅    | ✅     | ❌             | ❌                 | ❓          | ❓            |
-| XGen        | ✅         | ❓    | ✅     | ❓             | ❓                 | ❓          | ✅            |
-| phi         | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
-| RWKV        | ✅         | ❓    | ❓     | ❓             | ❓                 | ❓          | ❓            |
-| Qwen        | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
-| Gemma       | ✅         | ✅    | ✅     | ❓             | ❓                 | ✅          | ❓            |
-| Jamba       | ✅         | ✅    | ✅     | ❓             | ❓                 | ✅          | ❓            |
-
-✅: supported
-❌: not supported
-❓: untested
-
 ## ❤️ Sponsors

 Thank you to our sponsors who help make Axolotl possible:
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -1,5 +1,6 @@
 project:
  type: website
+  pre-render: docs/scripts/generate_config_docs.py

 quartodoc:
  dir: docs/api
@@ -17,7 +18,9 @@ quartodoc:
        - convert
        - prompt_tokenizers
        - logging_config
-        - core.trainer_builder
+        - core.builders.base
+        - core.builders.causal
+        - core.builders.rl
        - core.training_args
        - core.chat.messages
        - core.chat.format.chatml
@@ -43,13 +46,37 @@ quartodoc:
        - cli.vllm_serve
        - cli.cloud.base
        - cli.cloud.modal_
+        - cli.quantize
    - title: Trainers
      desc: Training implementations
      contents:
        - core.trainers.base
        - core.trainers.trl
+        - core.trainers.mamba
+        - core.trainers.relora
        - core.trainers.dpo.trainer
        - core.trainers.grpo.trainer
+        - core.trainers.grpo.sampler
+        - core.trainers.utils
+    - title: Model Loading
+      desc: Functionality for loading and patching models, tokenizers, etc.
+      contents:
+        - loaders.model
+        - loaders.tokenizer
+        - loaders.processor
+        - loaders.adapter
+        - loaders.patch_manager
+        - loaders.constants
+    - title: Mixins
+      desc: Mixin classes for augmenting trainers
+      contents:
+        - core.trainers.mixins.optimizer
+        - core.trainers.mixins.rng_state_loader
+        - core.trainers.mixins.scheduler
+    - title: Context Managers
+      desc: Context managers for altering trainer behaviors
+      contents:
+        - utils.ctx_managers.sequence_parallel
    - title: Prompt Strategies
      desc: Prompt formatting strategies
      contents:
@@ -86,7 +113,7 @@ quartodoc:
        - kernels.swiglu
        - kernels.quantize
        - kernels.utils
-    - title: MonkeyPatches
+    - title: Monkey Patches
      desc: Runtime patches for model optimizations
      contents:
        - monkeypatch.llama_attn_hijack_flash
@@ -103,17 +130,16 @@ quartodoc:
        - monkeypatch.trainer_fsdp_optim
        - monkeypatch.transformers_fa_utils
        - monkeypatch.unsloth_
-        - monkeypatch.attention.mllama
        - monkeypatch.data.batch_dataset_fetcher
        - monkeypatch.mixtral
+        - monkeypatch.gradient_checkpointing.offload_cpu
+        - monkeypatch.gradient_checkpointing.offload_disk
    - title: Utils
      desc: Utility functions
      contents:
-        - utils.models
        - utils.tokenization
        - utils.chat_templates
        - utils.lora
-        - utils.lora_embeddings
        - utils.model_shard_quant
        - utils.bench
        - utils.freeze
@@ -124,7 +150,7 @@ quartodoc:
        - utils.optimizers.adopt
        - utils.data.pretraining
        - utils.data.sft
-        - utils.gradient_checkpointing.unsloth
+        - utils.quantization
    - title: Schemas
      desc: Pydantic data models for Axolotl config
      contents:
@@ -174,12 +200,14 @@ quartodoc:
        - utils.callbacks.lisa
        - utils.callbacks.mlflow_
        - utils.callbacks.comet_
-
+        - utils.callbacks.qat
 website:
  title: "Axolotl"
  description: "We make fine-tuning accessible, scalable, and fun"
  favicon: favicon.jpg

+  google-analytics: "G-9KYCVJBNMQ"
+
  navbar:
    logo: image/axolotl_logo_digital_white.svg
    title: false
@@ -208,7 +236,7 @@ website:
            - docs/installation.qmd
            - docs/inference.qmd
            - docs/cli.qmd
-            - docs/config.qmd
+            - docs/config-reference.qmd
            - text: "API Reference"
              href: docs/api

@@ -232,6 +260,8 @@ website:
            - docs/lr_groups.qmd
            - docs/lora_optims.qmd
            - docs/dataset_loading.qmd
+            - docs/qat.qmd
+            - docs/quantize.qmd

        - section: "Core Concepts"
          contents:
--- a/cicd/Dockerfile-uv.jinja
+++ b/cicd/Dockerfile-uv.jinja
@@ -0,0 +1,52 @@
+FROM axolotlai/axolotl-base-uv:{{ BASE_TAG }}
+
+ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
+ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
+ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
+ENV CUDA="{{ CUDA }}"
+ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
+ENV GITHUB_REF="{{ GITHUB_REF }}"
+ENV GITHUB_SHA="{{ GITHUB_SHA }}"
+ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
+ENV HF_HOME="{{ HF_HOME }}"
+
+RUN apt-get update && \
+    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
+
+WORKDIR /workspace
+
+RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
+
+WORKDIR /workspace/axolotl
+
+RUN git fetch origin +$GITHUB_REF && \
+    git checkout FETCH_HEAD
+
+# On nightly builds, rewrite the transformers/peft/accelerate/trl/datasets pins in requirements.txt to their upstream main branches
+RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
+        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
+        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
+        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
+        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
+        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
+    fi
+
+RUN uv pip install packaging==23.2 setuptools==75.8.0
+RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
+        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+    else \
+        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
+    fi
+
+RUN python scripts/unsloth_install.py --uv | sh
+RUN python scripts/cutcrossentropy_install.py --uv | sh
+
+# So we can test the Docker image
+RUN uv pip install -r requirements-dev.txt -r requirements-tests.txt
+
+# fix so that git fetch/pull from remote works
+RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
+    git config --get remote.origin.fetch
+
+# helper for huggingface-login cli
+RUN git config --global credential.helper store
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -9,6 +9,7 @@ ENV GITHUB_REF="{{ GITHUB_REF }}"
 ENV GITHUB_SHA="{{ GITHUB_SHA }}"
 ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
 ENV HF_HOME="{{ HF_HOME }}"
+ENV AXOLOTL_DATASET_PROCESSES="8"

 RUN apt-get update && \
    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
--- a/tests/utils/init.py
+++ b/tests/utils/init.py
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -3,10 +3,53 @@ set -e

 python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"

-pytest -v --durations=10 -n8 --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli /workspace/axolotl/tests/
-pytest -v --durations=10 /workspace/axolotl/tests/e2e/patched/lora_kernels  # running these with the other patches causes a failure
-pytest -v --durations=10 --ignore=tests/e2e/patched/lora_kernels /workspace/axolotl/tests/e2e/patched
-pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/solo/
-pytest -v --durations=10 /workspace/axolotl/tests/e2e/integrations/
-pytest -v --durations=10 /workspace/axolotl/tests/cli
-pytest -v --durations=10 --ignore=tests/e2e/solo/ --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ --ignore=tests/cli /workspace/axolotl/tests/e2e/
+# Run unit tests with initial coverage report
+pytest -v --durations=10 -n8 \
+  --ignore=tests/e2e/ \
+  --ignore=tests/patched/ \
+  --ignore=tests/cli \
+  /workspace/axolotl/tests/ \
+  --cov=axolotl
+
+# Run lora kernels tests with coverage append
+pytest -v --durations=10 \
+  /workspace/axolotl/tests/e2e/patched/lora_kernels \
+  --cov=axolotl \
+  --cov-append
+
+# Run patched tests excluding lora kernels with coverage append
+pytest --full-trace -vvv --durations=10 \
+  --ignore=tests/e2e/patched/lora_kernels \
+  /workspace/axolotl/tests/e2e/patched \
+  --cov=axolotl \
+  --cov-append
+
+# Run solo tests with coverage append
+pytest -v --durations=10 -n1 \
+  /workspace/axolotl/tests/e2e/solo/ \
+  --cov=axolotl \
+  --cov-append
+
+# Run integration tests with coverage append
+pytest -v --durations=10 \
+  /workspace/axolotl/tests/e2e/integrations/ \
+  --cov=axolotl \
+  --cov-append
+
+pytest -v --durations=10 /workspace/axolotl/tests/cli \
+  --cov=axolotl \
+  --cov-append
+
+# Run remaining e2e tests with coverage append and final report
+pytest -v --durations=10 \
+  --ignore=tests/e2e/solo/ \
+  --ignore=tests/e2e/patched/ \
+  --ignore=tests/e2e/multigpu/ \
+  --ignore=tests/e2e/integrations/ \
+  --ignore=tests/cli \
+  /workspace/axolotl/tests/e2e/ \
+  --cov=axolotl \
+  --cov-append \
+  --cov-report=xml:e2e-coverage.xml
+
+codecov upload-process -t "${CODECOV_TOKEN}" -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION} || true  # quote token: empty value must not swallow -f
--- a/cicd/cleanup.py
+++ b/cicd/cleanup.py
@@ -0,0 +1,19 @@
+"""Modal app to run axolotl GPU cleanup"""
+
+from .single_gpu import VOLUME_CONFIG, app, cicd_image, run_cmd
+
+
+@app.function(
+    image=cicd_image,
+    timeout=60 * 60,
+    cpu=8.0,
+    memory=131072,
+    volumes=VOLUME_CONFIG,
+)
+def cleanup():
+    run_cmd("./cicd/cleanup.sh", "/workspace/axolotl")
+
+
+@app.local_entrypoint()
+def main():
+    cleanup.remote()
--- a/cicd/cleanup.sh
+++ b/cicd/cleanup.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -e
+
+# cleanup old cache files for datasets processing and intermediate mappings
+find /workspace/data/huggingface-cache/hub/datasets -name "cache-*" -type f -mtime +1 -exec rm {} \;
+find /workspace/data/huggingface-cache/hub/datasets -name "*.lock" -type f -mtime +1 -exec rm {} \;
--- a/cicd/e2e_tests.py
+++ b/cicd/e2e_tests.py
@@ -1,74 +1,12 @@
 """Modal app to run axolotl GPU tests"""

-# pylint: disable=duplicate-code
-
-import os
-import pathlib
-import tempfile
-
-import jinja2
-import modal
-from jinja2 import select_autoescape
-from modal import App, Image
-
-cicd_path = pathlib.Path(__file__).parent.resolve()
-
-template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
-template_env = jinja2.Environment(
-    loader=template_loader, autoescape=select_autoescape()
-)
-df_template = template_env.get_template("Dockerfile.jinja")
-
-df_args = {
-    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
-    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
-    "CUDA": os.environ.get("CUDA", "121"),
-    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
-    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
-    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
-    "HF_HOME": "/workspace/data/huggingface-cache/hub",
-}
-
-dockerfile_contents = df_template.render(**df_args)
-
-temp_dir = tempfile.mkdtemp()
-with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
-    f.write(dockerfile_contents)
-
-cicd_image = Image.from_dockerfile(
-    pathlib.Path(temp_dir) / "Dockerfile",
-    context_mount=None,
-    force_build=True,
-    gpu="A10G",
-).env(df_args)
-
-app = App("Axolotl CI/CD", secrets=[])
-
-hf_cache_volume = modal.Volume.from_name(
-    "axolotl-ci-hf-hub-cache", create_if_missing=True
-)
-VOLUME_CONFIG = {
-    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
-}
-
-N_GPUS = int(os.environ.get("N_GPUS", 1))
-GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
-
-
-def run_cmd(cmd: str, run_folder: str):
-    import subprocess  # nosec
-
-    # Propagate errors from subprocess.
-    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
-        exit(exit_code)  # pylint: disable=consider-using-sys-exit
+from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd


@app.function(
    image=cicd_image,
    gpu=GPU_CONFIG,
-    timeout=60 * 60,
+    timeout=120 * 60,  # 120 min
    cpu=8.0,
    memory=131072,
    volumes=VOLUME_CONFIG,
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -24,11 +24,12 @@ df_template = template_env.get_template("Dockerfile.jinja")
 df_args = {
    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
-    "CUDA": os.environ.get("CUDA", "121"),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"),
+    "CUDA": os.environ.get("CUDA", "126"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
+    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
    "HF_HOME": "/workspace/data/huggingface-cache/hub",
 }

@@ -54,7 +55,7 @@ VOLUME_CONFIG = {
 }

 N_GPUS = int(os.environ.get("N_GPUS", 2))
-GPU_CONFIG = modal.gpu.H100(count=N_GPUS)
+GPU_CONFIG = f"H100:{N_GPUS}"


 def run_cmd(cmd: str, run_folder: str):
@@ -68,8 +69,8 @@ def run_cmd(cmd: str, run_folder: str):
@app.function(
    image=cicd_image,
    gpu=GPU_CONFIG,
-    timeout=90 * 60,
-    cpu=8.0,
+    timeout=120 * 60,
+    cpu=16.0,
    memory=131072 * N_GPUS,
    volumes=VOLUME_CONFIG,
 )
--- a/cicd/multigpu.sh
+++ b/cicd/multigpu.sh
@@ -1,6 +1,23 @@
 #!/bin/bash
 set -e

-# only run one test at a time so as not to OOM the GPU
-pytest -v  --durations=10 -n2 /workspace/axolotl/tests/e2e/multigpu/ --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/
-pytest -v  --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/
+# Only run two tests at a time to avoid OOM on GPU (with coverage collection)
+pytest -v -n2 \
+  --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
+  --ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
+  /workspace/axolotl/tests/e2e/multigpu/ \
+  --cov=axolotl
+
+# Run solo tests with coverage append
+pytest -v --durations=10 -n1 \
+  /workspace/axolotl/tests/e2e/multigpu/solo/ \
+  --cov=axolotl \
+  --cov-append
+
+pytest -v  --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \
+  --cov=axolotl \
+  --cov-append \
+  --cov-report=xml:multigpu-coverage.xml
+
+# Upload coverage to Codecov
+codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true
--- a/cicd/single_gpu.py
+++ b/cicd/single_gpu.py
@@ -0,0 +1,70 @@
+"""Modal app to run axolotl GPU tests"""
+
+# pylint: disable=duplicate-code
+
+import os
+import pathlib
+import tempfile
+
+import jinja2
+import modal
+import modal.experimental
+from jinja2 import select_autoescape
+from modal import App
+
+cicd_path = pathlib.Path(__file__).parent.resolve()
+
+template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
+template_env = jinja2.Environment(
+    loader=template_loader, autoescape=select_autoescape()
+)
+dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
+df_template = template_env.get_template(dockerfile)
+
+df_args = {
+    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
+    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"),
+    "CUDA": os.environ.get("CUDA", "126"),
+    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
+    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
+    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
+    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
+    "HF_HOME": "/workspace/data/huggingface-cache/hub",
+    "PYTHONUNBUFFERED": os.environ.get("PYTHONUNBUFFERED", "1"),
+    "DEEPSPEED_LOG_LEVEL": os.environ.get("DEEPSPEED_LOG_LEVEL", "WARNING"),
+}
+
+dockerfile_contents = df_template.render(**df_args)
+
+temp_dir = tempfile.mkdtemp()
+with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
+    f.write(dockerfile_contents)
+
+cicd_image = modal.experimental.raw_dockerfile_image(
+    pathlib.Path(temp_dir) / "Dockerfile",
+    # context_mount=None,
+    force_build=True,
+    # gpu="A10G",
+).env(df_args)
+
+app = App("Axolotl CI/CD", secrets=[])
+
+hf_cache_volume = modal.Volume.from_name(
+    "axolotl-ci-hf-hub-cache", create_if_missing=True
+)
+VOLUME_CONFIG = {
+    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
+}
+
+N_GPUS = int(os.environ.get("N_GPUS", 1))
+GPU_CONFIG = f"L40S:{N_GPUS}"
+
+
+def run_cmd(cmd: str, run_folder: str):
+    import subprocess  # nosec
+
+    # Propagate errors from subprocess.
+    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
+        exit(exit_code)  # pylint: disable=consider-using-sys-exit
--- a/codecov.yml
+++ b/codecov.yml
@@ -0,0 +1,56 @@
+codecov:
+  require_ci_to_pass: true
+  notify:
+    wait_for_ci: true
+
+coverage:
+  precision: 2
+  round: down
+  range: "70...100"
+  status:
+    project:
+      default:
+        # basic
+        target: auto
+        threshold: 0%
+        base: auto
+        # advanced
+        branches: null
+        if_no_uploads: error
+        if_not_found: success
+        if_ci_failed: error
+        only_pulls: true
+        flags: null
+        paths: null
+    patch:
+      default:
+        # basic
+        target: auto
+        threshold: 0%
+        base: auto
+        # advanced
+        branches: null
+        if_no_uploads: error
+        if_not_found: success
+        if_ci_failed: error
+        only_pulls: false
+        flags: null
+        paths: null
+
+parsers:
+  gcov:
+    branch_detection:
+      conditional: true
+      loop: true
+      method: false
+      macro: false
+
+comment:
+  layout: "reach,diff,flags,files,footer"
+  behavior: default
+  require_changes: false
+  require_base: false
+  require_head: true
+
+github_checks:
+  annotations: false
--- a/deepspeed_configs/zero2_torch_compile.json
+++ b/deepspeed_configs/zero2_torch_compile.json
@@ -0,0 +1,31 @@
+{
+  "compile": {
+    "disable": false,
+    "backend": "inductor"
+  },
+  "zero_optimization": {
+    "stage": 2,
+    "offload_optimizer": {
+      "device": "cpu"
+    },
+    "contiguous_gradients": true,
+    "overlap_comm": true
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "fp16": {
+    "enabled": "auto",
+    "auto_cast": false,
+    "loss_scale": 0,
+    "initial_scale_power": 32,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
+}
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -37,3 +37,7 @@ RUN git lfs install --skip-repo && \
    pip3 install awscli && \
    # The base image ships with `pydantic==1.8.2` which is not working
    pip3 install -U --no-cache-dir pydantic==1.10.10
+
+RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \
+        FLASH_ATTENTION_FORCE_BUILD="TRUE" pip3 install --no-build-isolation flash-attn==2.8.0.post2; \
+    fi
--- a/docker/Dockerfile-base-next
+++ b/docker/Dockerfile-base-next
@@ -29,7 +29,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
 WORKDIR /workspace

 RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
-    python3 -m pip install --no-cache-dir -U torch==2.7.0 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
+    python3 -m pip install --no-cache-dir -U torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"

--- a/docker/Dockerfile-uv-base
+++ b/docker/Dockerfile-uv-base
@@ -0,0 +1,36 @@
+ARG CUDA_VERSION="12.6.3"
+ARG CUDNN_VERSION=""
+ARG UBUNTU_VERSION="22.04"
+ARG MAX_JOBS=4
+
+FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
+
+ARG PYTHON_VERSION="3.11"
+ARG PYTORCH_VERSION="2.6.0"
+ARG CUDA="126"
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
+
+ENV PYTHON_VERSION=$PYTHON_VERSION
+ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
+ENV UV_TORCH_BACKEND="cu${CUDA}"
+
+RUN apt-get update \
+    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config curl && rm -rf /var/lib/apt/lists/* \
+    && git lfs install --skip-repo \
+    && curl -LsSf https://astral.sh/uv/install.sh | sh
+
+ENV PATH="/root/.local/bin:${PATH}"
+
+RUN uv python install ${PYTHON_VERSION}
+
+WORKDIR /workspace
+
+RUN uv venv --no-project --relocatable axolotl-venv
+
+ENV PATH="/workspace/axolotl-venv/bin:${PATH}"
+
+RUN uv pip install packaging setuptools wheel psutil \
+    && uv pip install torch==${PYTORCH_VERSION} \
+    && uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \
+    && uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \
+    && uv pip install awscli pydantic
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -2,3 +2,4 @@
 _site/
 /api/*.qmd
 /api/*.html
+config-reference.qmd
--- a/docs/cli.qmd
+++ b/docs/cli.qmd
@@ -199,6 +199,27 @@ output_dir: # Directory to save evaluation results

 See [LM Eval Harness](https://github.com/EleutherAI/lm-evaluation-harness) for more details.

+### delinearize-llama4
+
+Delinearizes a Llama 4 linearized model into a regular HuggingFace Llama 4 model. This only works with the non-quantized linearized model.
+
+```bash
+axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir
+```
+
+This is necessary if you want to use the model with other frameworks. If you have an adapter, merge it with the non-quantized linearized model before delinearizing.
+
+### quantize
+
+Quantizes a model using the quantization configuration specified in your YAML file.
+
+```bash
+axolotl quantize config.yml
+```
+
+See [Quantization](./quantize.qmd) for more details.
+
+
 ## Legacy CLI Usage

 While the new Click-based CLI is preferred, Axolotl still supports the legacy module-based CLI:
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -1,711 +0,0 @@
---
-title: Config Reference
-description: A complete list of all configuration options.
---
-
-```yaml
-# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
-# This can also be a relative path to a model on disk
-base_model: ./llama-7b-hf
-# You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
-base_model_ignore_patterns:
-# If the base_model repo on hf hub doesn't include configuration .json files,
-# You can set that here, or leave this empty to default to base_model
-base_model_config: ./llama-7b-hf
-# You can specify to choose a specific model revision from huggingface hub
-revision_of_model:
-# Optional tokenizer configuration path in case you want to use a different tokenizer
-# than the one defined in the base model
-tokenizer_config:
-# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
-model_type: AutoModelForCausalLM
-# Corresponding tokenizer for the model AutoTokenizer is a good choice
-tokenizer_type: AutoTokenizer
-# Trust remote code for untrusted source
-trust_remote_code:
-# use_fast option for tokenizer loading from_pretrained, default to True
-tokenizer_use_fast:
-# Whether to use the legacy tokenizer setting, defaults to True
-tokenizer_legacy:
-# Resize the model embeddings when new tokens are added to multiples of 32
-# This is reported to improve training speed on some models
-resize_token_embeddings_to_32x:
-# Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.
-shrink_embeddings:
-# Whether to load the model with randomly initialized weights. Useful for
-# pre-training a model from scratch or debugging purposes.
-random_init_weights:
-
-# (Internal use only)
-# Used to identify which the model is based on
-is_falcon_derived_model:
-is_llama_derived_model:
-is_qwen_derived_model:
-# Please note that if you set this to true, `padding_side` will be set to "left" by default
-is_mistral_derived_model:
-
-# optional overrides to the base model configuration
-overrides_of_model_config:
-  # RoPE Scaling https://github.com/huggingface/transformers/pull/24653
-  rope_scaling:
-    type: # linear | dynamic
-    factor: # float
-
-# optional overrides the base model loading from_pretrained
-overrides_of_model_kwargs:
-  # use_cache: False
-
-# optional overrides to the bnb 4bit quantization configuration
-# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig
-bnb_config_kwargs:
-  # These are default values
-  llm_int8_has_fp16_weight: false
-  bnb_4bit_quant_type: nf4
-  bnb_4bit_use_double_quant: true
-
-
-# Whether you are training a 4-bit GPTQ quantized model
-gptq: true
-
-# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
-load_in_8bit: true
-# Use bitsandbytes 4 bit
-load_in_4bit:
-
-# Use CUDA bf16
-bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
-# Use CUDA fp16
-fp16: true
-# Use CUDA tf32
-tf32: true # require >=ampere
-
-# No AMP (automatic mixed precision)
-bfloat16: true # require >=ampere
-float16: true
-
-# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset
-gpu_memory_limit: 20GiB
-# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
-lora_on_cpu: true
-
-# List[str]. Add plugins to extend the pipeline.
-# See `src/axolotl/integrations` for the available plugins or doc below for more details.
-# https://axolotl-ai-cloud.github.io/axolotl/docs/custom_integrations.html
-plugins:
-  # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
-# A list of one or more datasets to finetune the model with
-datasets:
-  # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
-  - path: vicgalle/alpaca-gpt4
-    # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
-    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
-    ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
-    data_files: # Optional[str] path to source data files
-
-    shards: # Optional[int] split dataset into N pieces (use with shards_idx)
-    shards_idx: # Optional[int] = 0 the index of sharded dataset to use
-
-    preprocess_shards: # Optional[int] process dataset in N sequential chunks for memory efficiency (exclusive with `shards`)
-
-    name: # Optional[str] name of dataset configuration to load
-    split: train # Optional[str] name of dataset split to load from
-    revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.
-    trust_remote_code: # Optional[bool] Trust remote code for untrusted source
-
-  # Custom user instruction prompt
-  - path: repo
-    type:
-      # The below are defaults. only set what's needed if you use a different column name.
-      system_prompt: ""
-      system_format: "{system}"
-      field_system: system
-      field_instruction: instruction
-      field_input: input
-      field_output: output
-
-      # Customizable to be single line or multi-line
-      # Use {instruction}/{input} as key to be replaced
-      # 'format' can include {input}
-      format: |-
-        User: {instruction} {input}
-        Assistant:
-      # 'no_input_format' cannot include {input}
-      no_input_format: "{instruction} "
-
-      # For `completion` datsets only, uses the provided field instead of `text` column
-      field:
-
-  # Using chat template
-  - path: ...
-    # Set type to `chat_template` to use this strategy
-    type: chat_template
-    # Specify the name of the chat template to use
-    # The name of the chat template to use for training, following values are supported:
-    # - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default.
-    # - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py
-    # - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to if the tokenizer does not have a chat template else default to tokenizer. E.g. tokenizer_default_fallback_chatml.
-    # - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.
-    chat_template: tokenizer_default
-
-    # Custom jinja chat template. Used only if `chat_template: jinja` or empty.
-    chat_template_jinja:
-
-    # Key containing the messages (default: "messages")
-    field_messages: messages
-
-    # Mapping of properties from the input dataset to the chat template.
-    # (default: message_property_mappings={'role':'role', 'content':'content'})
-    # If a property exists in the template but not in this mapping, the system will attempt
-    # to load it directly from the message using the property name as the key.
-    # Example: In the mapping below, 'from' is loaded from input dataset and used as 'role',
-    # while 'value' is loaded and used as 'content' in the chat template.
-    message_property_mappings:
-      role: from
-      content: value
-      # ...
-
-    # Optional[Dict[str, List]]. Roles mapping in the messages.
-    # The format is {target_role: [source_roles]}. All source roles will be mapped to the target role.
-    # The default is:
-    roles:
-      user: ["human", "user"]
-      assistant: ["gpt", "assistant"]
-      system: ["system"]
-      tool: ["tool"]
-
-    # Optional[bool]. Whether to drop the system turn from the dataset. Only works with chat_template.
-    # This does not drop the default system message from chat_template if it exists. If you wish to,
-    # we recommend using a custom jinja template with the default system message removed or
-    # adding a system turn with empty content.
-    drop_system_message:
-
-    # IMPORTANT: The following fields determine which parts of the conversation to train on.
-    # Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train
-    # See examples at `docs/dataset-formats/conversation.qmd`
-    # Note: If the below 4 fields are set to empty, defaults to training only on the last message.
-
-    # Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.
-    roles_to_train: ["assistant"]  # default
-    # Optional[str]. Which EOS tokens to train on in the conversation. Possible values are:
-    # - all: train on all EOS tokens
-    # - turn (default): train on the EOS token at the end of each trainable turn
-    # - last: train on the last EOS token in the conversation
-    # TIP: Please make sure that your `tokenizer.eos_token` is same as EOS/EOT token in template. Otherwise, set `eos_token` under `special_tokens`.
-    train_on_eos: last
-    # The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`.
-    message_field_training: training
-    # The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn.
-    # The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train).
-    message_field_training_detail: train_detail
-
-
-# If false, the datasets will not be shuffled and will keep their original order in `datasets`.
-# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
-shuffle_merged_datasets: true
-
-Deduplicates datasets and test_datasets with identical entries.
-dataset_exact_deduplication: true
-
-# A list of one or more datasets to eval the model with.
-# You can use either test_datasets, or val_set_size, but not both.
-test_datasets:
-  - path: /workspace/data/eval.jsonl
-    ds_type: json
-    # You need to specify a split. For "json" datasets the default split is called "train".
-    split: train
-    type: completion
-    data_files:
-      - /workspace/data/eval.jsonl
-
-# use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'
-rl:
-rl_beta:  # Optional[float]. The beta parameter for the RL training.
-
-# dpo
-dpo_use_weighting:  # Optional[bool]. Whether to perform weighting.
-rpo_alpha: # Optional[float]. Weighting of NLL term in loss from RPO paper.
-
-# orpo
-orpo_alpha: 0.1  # Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to `beta` in `ORPOConfig` due to trl mapping.
-
-# kto
-kto_desirable_weight: # Optional[float]. Factor for desirable loss term in KTO loss.
-kto_undesirable_weight: # Optional[float]. Factor for undesirable loss term in KTO loss.
-
-# simpo
-cpo_alpha: 1.0  # Weight of the BC regularizer
-simpo_gamma: 0.5  # Target reward margin for the SimPO loss
-
-# grpo
-trl:
-  use_vllm: # Optional[bool]. Whether to use VLLM for RL training.
-  vllm_server_host: # Optional[str]. Host of the vLLM server to connect to.
-  vllm_server_port: # Optional[int]. Port of the vLLM server to connect to.
-  vllm_server_timeout: # Optional[int]. Total timeout (in seconds) to wait for the vLLM server to respond.
-  vllm_guided_decoding_regex: # Optional[str]. Regex for vLLM guided decoding.
-
-  beta: # Optional[float]. Beta parameter for the RL training. Same as `rl_beta`. Use
-  max_completion_length: # Optional[int]. Maximum length of the completion for RL training.
-
-  reward_funcs: # Optional[list[str]]. List of reward functions to load. Paths must be importable from current dir.
-  reward_weights: # Optional[list[float]]. List of reward weights for the reward functions.
-
-  num_generations: # Optional[int]. Number of generations to sample.
-  log_completions: # Optional[bool]. Whether to log completions.
-
-  sync_ref_model: # Optional[bool]. Whether to sync the reference model.
-  ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.
-  ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.
-
-
-# reward modelling: `True` or `False`
-reward_model:
-
-# process reward modelling: `True` or `False`
-process_reward_model:
-
-# The name of the chat template to use for training, following values are supported:
-# - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default value.
-# - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py
-# - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not available in the tokenizer.
-# - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.
-# The selected chat template will be saved to the tokenizer_config.json for easier inferencing
-# Note: It is recommended to set train_on_inputs to true when using a chat template that is different from the model's default chat template.
-chat_template: tokenizer_default
-# custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.
-chat_template_jinja: null
-# Changes the default system message. Currently only supports chatml.
-default_system_message: You are a helpful assistant. Please give a long and detailed answer.
-# Axolotl attempts to save the dataset as an arrow after packing the data together so
-# subsequent training attempts load faster, relative path
-dataset_prepared_path: data/last_run_prepared
-# Push prepared dataset to hub
-push_dataset_to_hub: # Optional[str] repo_org/repo_name
-# The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
-# if not set.
-dataset_processes: # defaults to os.cpu_count() if not set
-# Keep dataset in memory while preprocessing
-# Only needed if cached dataset is taking too much storage
-dataset_keep_in_memory:
-# push checkpoints to hub
-hub_model_id: # private repo path to push finetuned model
-# how to push checkpoints to hub
-# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
-hub_strategy:
-# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
-# Required to be true when used in combination with `push_dataset_to_hub`
-hf_use_auth_token: # boolean
-# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
-val_set_size: 0.04
-# Num shards for whole dataset
-dataset_shard_num:
-# Index of shard to use for whole dataset
-dataset_shard_idx:
-
-# The maximum length of an input to train with, this should typically be less than 2048
-# as most models have a token/context limit of 2048
-sequence_len: 2048
-# Pad inputs so each step uses constant sized buffers
-# This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
-pad_to_sequence_len:
-# Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
-sample_packing:
-# Set to 'false' if getting errors during eval with sample_packing on.
-eval_sample_packing:
-# You can set these packing optimizations AFTER starting a training at least once.
-# The trainer will provide recommended values for these values.
-sample_packing_eff_est:
-total_num_tokens:
-# Increasing the following values helps with packing, but usually only slightly (<%1.)
-# The number of samples packed at a time.
-sample_packing_group_size: 100000
-# The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples.
-sample_packing_bin_size: 200
-sample_pack_sequentially: # Optional[bool]. Whether to pack samples sequentially.
-
-# whether to concatenate samples during pretraining
-pretraining_sample_concatenation:
-
-curriculum_sampling: # Optional[bool]. Whether to use sequential sampling for curriculum learning
-
-# Use batch flattening for speedups when not using sample_packing
-batch_flattening:
-
-# Passed through to transformers when loading the model when launched without accelerate
-# Use `sequential` when training w/ model parallelism to limit memory
-device_map:
-# Defines the max memory usage per gpu on the system. Passed through to transformers when loading the model.
-max_memory:
-
-# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
-adapter: lora
-# If you already have a lora model trained that you want to load, put that here.
-# This means after training, if you want to test the model, you should set this to the value of `output_dir`.
-# Note that if you merge an adapter to the base model, a new subdirectory `merged` will be created under the `output_dir`.
-lora_model_dir:
-
-# LoRA hyperparameters
-# For more details about the following options, see:
-# https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-#  - k_proj
-#  - o_proj
-#  - gate_proj
-#  - down_proj
-#  - up_proj
-lora_target_linear: # If true, will target all linear modules
-
-# List[int] | int. # The layer indices to transform, otherwise, apply to all layers
-# https://huggingface.co/docs/peft/v0.15.0/en/package_reference/lora#peft.LoraConfig.layers_to_transform
-peft_layers_to_transform:
-
-# Optional[bool]. Whether to use DoRA.
-# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#weight-decomposed-low-rank-adaptation-dora
-peft_use_dora:
-
-# Optional[bool]. Whether to use RSLoRA.
-# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#rank-stabilized-lora
-peft_use_rslora:
-
-# Optional[list[tuple[int, int]]]. List of layer indices to replicate.
-# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#memory-efficient-layer-replication-with-lora
-peft_layer_replication:
-
-# bool | Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "corda", "loftq"]
-# How to initialize LoRA weights. Default to True which is MS original implementation.
-# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#initialization
-peft_init_lora_weights:
-
-# If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
-# For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
-# `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
-# https://github.com/huggingface/peft/issues/334#issuecomment-1561727994
-lora_modules_to_save:
-#  - embed_tokens
-#  - lm_head
-
-lora_fan_in_fan_out: false
-
-# Apply custom LoRA autograd functions and activation function Triton kernels for
-# speed and memory savings
-# See: https://axolotl-ai-cloud.github.io/axolotl/docs/lora_optims.html
-lora_mlp_kernel: true
-lora_qkv_kernel: true
-lora_o_kernel: true
-
-# LoRA+ hyperparameters
-# For more details about the following options, see:
-# https://arxiv.org/abs/2402.12354  and `src/axolotl/core/train_builder.py`
-loraplus_lr_ratio: # loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.
-loraplus_lr_embedding: #  loraplus learning rate for lora embedding layers. Default value is 1e-6.
-
-peft:
-  # Configuration options for loftq initialization for LoRA
-  # https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization
-  loftq_config:
-    loftq_bits:  # typically 4 bits
-
-# ReLoRA configuration
-# Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
-relora_steps: # Number of steps per ReLoRA restart
-relora_warmup_steps: # Number of per-restart warmup steps
-relora_anneal_steps: # Number of anneal steps for each relora cycle
-relora_prune_ratio: # threshold for optimizer magnitude when pruning
-relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings
-
-# wandb configuration if you're using it
-# Make sure your `WANDB_API_KEY` environment variable is set (recommended) or you login to wandb with `wandb login`.
-wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
-wandb_project: # Your wandb project name
-wandb_entity: # A wandb Team name if using a Team
-wandb_watch:
-wandb_name: # Set the name of your wandb run
-wandb_run_id: # Set the ID of your wandb run
-wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training
-
-# mlflow configuration if you're using it
-mlflow_tracking_uri: # URI to mlflow
-mlflow_experiment_name: # Your experiment name
-mlflow_run_name: # Your run name
-hf_mlflow_log_artifacts:  # set to true to copy each saved checkpoint on each save to mlflow artifact registry
-
-# Comet configuration if you're using it
-# Make sure your `COMET_API_KEY` environment variable is set (recommended) or you login to Comet with `comet login`.
-# Check out our documentation for more details https://www.comet.com/docs/v2/api-and-sdk/python-sdk/reference/Experiment-Creation/#comet_ml.start
-use_comet: # Enable or disable Comet integration.
-comet_api_key: # API key for Comet. Recommended to set via `comet login`.
-comet_workspace: # Workspace name in Comet. Defaults to the user's default workspace.
-comet_project_name: # Project name in Comet. Defaults to Uncategorized.
-comet_experiment_key: # Identifier for the experiment. Used to append data to an existing experiment or control the key of new experiments. Default to a random key.
-comet_mode: # Create a new experiment ("create") or log to an existing one ("get"). Default ("get_or_create") auto-selects based on configuration.
-comet_online: # Set to True to log data to Comet server, or False for offline storage. Default is True.
-comet_experiment_config: # Dictionary for additional configuration settings, see the doc for more details.
-
-# Tensorboard
-use_tensorboard: # Optional[bool]
-
-# Where to save the full-finetuned model to
-output_dir: ./completed-model
-
-# Whether to use torch.compile and which backend to use
-# setting to `auto` will enable torch compile when torch>=2.5.1
-torch_compile:  # Optional[Union[Literal["auto"], bool]]
-torch_compile_backend:  # Optional[str]
-
-# Training hyperparameters
-
-# If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.
-gradient_accumulation_steps: 1
-# The number of samples to include in each batch. This is the number of samples sent to each GPU.
-# Batch size per gpu = micro_batch_size * gradient_accumulation_steps
-micro_batch_size: 2
-eval_batch_size:
-num_epochs: 4
-warmup_steps: 100  # cannot use with warmup_ratio
-warmup_ratio: 0.05  # cannot use with warmup_steps
-learning_rate: 0.00003
-lr_quadratic_warmup:
-logging_steps:
-eval_steps: # Leave empty to eval at each epoch, integer for every N steps. float for fraction of total steps
-evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
-eval_strategy: # Set to `"no"` to skip evaluation, `"epoch"` at end of each epoch, leave empty to infer from `eval_steps`.
-save_strategy: # Set to `"no"` to skip checkpoint saves, `"epoch"` at end of each epoch, `"best"` when better result is achieved, leave empty to infer from `save_steps`.
-save_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps
-saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
-save_total_limit: # Checkpoints saved at a time
-# Maximum number of iterations to train for. It precedes num_epochs which means that
-# if both are set, num_epochs will not be guaranteed.
-# e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
-max_steps:
-
-# bool of whether to include tokens trainer per second in the training metrics. This iterates over the entire dataset once, so it takes some time.
-include_tokens_per_second: # Optional[bool]
-
-# whether to find batch size that fits in memory. Passed to underlying transformers Trainer
-auto_find_batch_size: # Optional[bool]
-
-eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
-eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
-do_causal_lm_eval: # Whether to run causal language model evaluation for metrics in `eval_causal_lm_metrics`.
-eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", "chrf", "perplexity"]
-
-profiler_steps: # enable the pytorch profiler to capture the first N steps of training to the output_dir.
-                # see https://pytorch.org/blog/understanding-gpu-memory-1/ for more information
-                # snapshots can be visualized @ https://pytorch.org/memory_viz
-
-loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
-loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)
-
-# Save model as safetensors (require safetensors package)
-save_safetensors:
-
-# Whether to mask out or include the human's prompt from the training labels
-train_on_inputs: false
-# Group similarly sized data to minimize padding.
-# May be slower to start, as it must download and sort the entire dataset.
-# Note that training loss may have an oscillating pattern with this enabled.
-group_by_length: false
-
-# Whether to use gradient checkpointing. Available options are: true, false, "offload".
-# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
-gradient_checkpointing: false
-# additional kwargs to pass to the trainer for gradient checkpointing
-# gradient_checkpointing_kwargs:
-#   use_reentrant: true
-
-# Stop training after this many evaluation losses have increased in a row
-# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
-early_stopping_patience: 3
-
-# Specify a scheduler and kwargs to use with the optimizer
-lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | empty for cosine
-lr_scheduler_kwargs:
-cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
-cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
-
-# For one_cycle optim
-lr_div_factor: # Learning rate div factor
-
-# Specify optimizer
-# Valid values are driven by the Transformers OptimizerNames class, see:
-# https://github.com/huggingface/transformers/blob/cbf924b76c03828101a34069a96d209314114fd5/src/transformers/training_args.py#L144-L189
-#
-# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
-# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
-# in the examples/ for your model and fine-tuning use case.
-#
-# Valid values for 'optimizer' include:
-# - adamw_torch
-# - adamw_torch_fused
-# - adamw_torch_xla
-# - adamw_torch_npu_fused
-# - adamw_apex_fused
-# - adopt_adamw  (an EXPERIMENTAL optimizer, only for torch version >= 2.5.1)
-# - adafactor
-# - adamw_anyprecision
-# - adamw_torch_4bit
-# - ademamix
-# - sgd
-# - adagrad
-# - adamw_bnb_8bit
-# - adamw_8bit   # alias for adamw_bnb_8bit
-# - ademamix_8bit
-# - lion_8bit
-# - lion_32bit
-# - paged_adamw_32bit
-# - paged_adamw_8bit
-# - paged_ademamix_32bit
-# - paged_ademamix_8bit
-# - paged_lion_32bit
-# - paged_lion_8bit
-# - rmsprop
-# - rmsprop_bnb
-# - rmsprop_bnb_8bit
-# - rmsprop_bnb_32bit
-# - galore_adamw
-# - galore_adamw_8bit
-# - galore_adafactor
-# - galore_adamw_layerwise
-# - galore_adamw_8bit_layerwise
-# - galore_adafactor_layerwise
-# - lomo
-# - adalomo
-# - grokadamw
-# - schedule_free_adamw
-# - schedule_free_sgd
-# - apollo_adamw
-# - apollo_adamw_layerwise
-#
-# Additional custom optimizers include:
-# - optimi_adamw
-# - ao_adamw_8bit
-# - ao_adamw_fp8
-optimizer:
-# Dictionary of arguments to pass to the optimizer
-optim_args:
-# For Galore Optimizers the following optim_args are available
-# rank:  # type: int
-# update_proj_gap  # type: int
-# scale  # type: float
-# proj_type:  # type: str, default = std
-
-# The target modules to optimize, i.e. the module names that you would like to train, right now this is used only for GaLore algorithm
-optim_target_modules:
-# - self_attn  # for llama
-# - mlp
-
-# Specify weight decay
-weight_decay:
-# adamw hyperparams
-adam_beta1:
-adam_beta2:
-adam_epsilon:
-# Gradient clipping max norm
-max_grad_norm:
-
-# Augmentation techniques
-# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings
-# currently only supported on Llama and Mistral
-neftune_noise_alpha:
-
-# Optional[bool]. Whether to bettertransformers
-flash_optimum:
-
-# Note: Only one of the following attention patches can be used at a time.
-# For example, if you set `xformers_attention` to `true`, do not set `flash_attention` to `true`.
-
-# Optional[bool]. Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
-xformers_attention:
-# Optional[bool]. Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
-flash_attention:
-flash_attn_cross_entropy:  # Optional[bool]. Whether to use flash-attention cross entropy implementation - advanced use only
-flash_attn_rms_norm:  # Optional[bool]. Whether to use flash-attention rms norm implementation - advanced use only
-flash_attn_fuse_qkv: # Optional[bool]. Whether to fuse QKV into a single operation
-flash_attn_fuse_mlp: # Optional[bool]. Whether to fuse part of the MLP into a single operation
-# Optional[bool]. Whether to use scaled-dot-product attention
-# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
-sdp_attention:
-# Optional[bool]. Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
-s2_attention:
-
-# Optional[bool]. Whether to use low_cpu_mem_usage
-low_cpu_mem_usage:
-# Optional[str]. Resume from a specific checkpoint dir
-resume_from_checkpoint:
-# Optional[bool]. If resume_from_checkpoint isn't set and you simply want it to start where it left off.
-# Be careful with this being turned on between different models.
-auto_resume_from_checkpoints: false
-
-## Multimodal section
-# int | tuple[int, int] | None . Size to resize images to, width x height.
-# Will read from model/processor config if not set.
-image_size:
-# str. Algorithm to use for image resizing. "bilinear", "bicubic", "lanczos". Default is "bilinear".
-image_resize_algorithm: 'bilinear'
-## End of multimodal section
-
-# Don't mess with this, it's here for accelerate and torchrun
-local_rank:
-
-# Add or change special tokens.
-# If you add tokens here, you don't need to add them to the `tokens` list.
-special_tokens:
-  # bos_token: "<s>"
-  # eos_token: "</s>"
-  # unk_token: "<unk>"
-  # pad_token: "[PAD]"
-
-# Add extra tokens.
-tokens:
-
-# Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer.
-# Only works for tokens that are not part of the base vocab (aka are added_tokens).
-# Can be checked if they exist in tokenizer.json added_tokens.
-added_tokens_overrides:  # Dict[int, str]
-#  128041: "<|im_start|>"
-#  128042: "<|im_end|>"
-
-# FSDP
-fsdp:
-fsdp_config:
-
-# Deepspeed config path. e.g., deepspeed_configs/zero3.json
-deepspeed:
-
-# Advanced DDP Arguments
-ddp_timeout:
-ddp_bucket_cap_mb:
-ddp_broadcast_buffers:
-
-# Sequence parallelism
-# Set to a divisor of the number of GPUs available to split sequences into chunks of equal size.
-# Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM.
-# E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized
-# subsequences, or set to 4 to split into four equal-sized subsequences.
-# See https://axolotl-ai-cloud.github.io/axolotl/docs/sequence_parallelism.html for more details.
-sequence_parallel_degree:
-# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
-# Must evenly divide the number of KV heads in your model.
-heads_k_stride: 1
-
-# Path to torch distx for optim 'adamw_anyprecision'
-torchdistx_path:
-
-# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
-pretraining_dataset:
-
-# Debug mode
-debug:
-
-# Seed
-seed:
-
-# Allow overwrite yml config using from cli
-strict:
-```
--- a/docs/custom_integrations.qmd
+++ b/docs/custom_integrations.qmd
@@ -7,6 +7,7 @@ toc-depth: 3
 ```{python}
 #| echo: false

+import os
 import re

 def process_readme(integration_name):
@@ -49,9 +50,28 @@ sections = [
    ("Knowledge Distillation (KD)", "kd"),
    ("Liger Kernels", "liger"),
    ("Language Model Evaluation Harness (LM Eval)", "lm_eval"),
-    ("Spectrum", "spectrum")
+    ("Spectrum", "spectrum"),
+    ("LLMCompressor", "llm_compressor")
 ]

+for folder_name in os.listdir("../src/axolotl/integrations/"):
+    if folder_name in [path for name, path in sections]:
+        # skip if already in sections
+        continue
+    if os.path.exists(f"../src/axolotl/integrations/{folder_name}/README.md"):
+        # grab the first heading in README.md as the section name
+        with open(f"../src/axolotl/integrations/{folder_name}/README.md", "r") as f:
+            txt = f.read()
+            matches = re.search(r'^# (.*)\n?', txt, flags=re.MULTILINE)
+            if matches:
+                name = matches.group(1)
+            else:
+                continue
+            sections.append((name, folder_name))
+
+# sort sections by name
+sections = sorted(sections, key=lambda x: x[0])
+
 for section_name, folder_name in sections:
    print(print_section(section_name, folder_name))
 ```
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -4,27 +4,15 @@ description: Conversation format for supervised fine-tuning.
 order: 3
 ---

-## sharegpt
-
-::: {.callout-important}
-ShareGPT is deprecated!. Please see [chat_template](#chat_template) section below.
-:::
-
-## pygmalion
-
-```{.json filename="data.jsonl"}
-{"conversations": [{"role": "...", "value": "..."}]}
-```
-
 ## chat_template

 Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. Support using tokenizer's template, a supported template, or custom jinja2.

 ```{.json filename="data.jsonl"}
-{"conversations": [{"role": "...", "content": "..."}]}
+{"messages": [{"role": "...", "content": "..."}, {"role": "...", "content": "..."}, ...]}
 ```

-See [configs](../config.qmd) for full configs and supported templates.
+See [configs](../config-reference.qmd) for full configs and supported templates.

 ### Migrating from sharegpt

@@ -64,7 +52,9 @@ We recommend checking the below examples for other usecases.

 ### Examples

-1. Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
+#### Training on last message
+
+(Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.

 ```yaml
 datasets:
@@ -78,7 +68,9 @@ datasets:
 If you receive an error like "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null.", it means the tokenizer does not have a default `chat_template`. Follow the examples below instead to set a custom `chat_template`.
 :::

-2. Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.
+#### Overriding default chat template
+
+Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.

 ```yaml
 chat_template: gemma # this overwrites the tokenizer's chat_template
@@ -88,7 +80,13 @@ datasets:
    roles_to_train: ["assistant"]  # default value
 ```

-3. Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.
+::: {.callout-note}
+If you want to use built-in chat_template, use `chat_template: tokenizer_default` (this is set by default).
+:::
+
+#### Using default chat template with fallback
+
+Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.

 ```yaml
 chat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer's chat_template
@@ -97,7 +95,9 @@ datasets:
    type: chat_template
 ```

-4. Using a custom jinja template on OpenAI messages format, training on all assistant messages.
+#### Custom Jinja template
+
+Using a custom jinja template on OpenAI messages format, training on all assistant messages.

 ```yaml
 # chat_template: jinja # `jinja` will be implied if the `chat_template_jinja` is set and this field is empty
@@ -109,10 +109,125 @@ datasets:
 ```

 ::: {.callout-important}
-Please make sure that your `tokenizer.eos_token` is same as EOS/EOT token in template. Otherwise, set `eos_token` under `special_tokens`.
+Please make sure that your `tokenizer.eos_token` is the same as the EOS (End-of-Sequence) token in the template. Otherwise, set `eos_token` under `special_tokens: `.
 :::

-5. (Advanced) Using fine-grained control over tokens and turns to train in a conversation
+#### Using template with different token for EOT and EOS
+
+- If you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: ` which defaults to turn.
+
+```yaml
+eot_tokens:
+  - "[/INST]"
+  # - "[/SYSTEM_PROMPT]"
+
+datasets:
+  - path: ...
+    type: chat_template
+
+    # optional
+    train_on_eot: turn  # defaults read from train_on_eos (which defaults to turn)
+```
+
+::: {.callout-tip}
+See [config documentation](../config-reference.qmd) for detailed explanations of "turn", "last", and "all" options for training on tokens.
+:::
+
+::: {.callout-note}
+Using `eot_tokens` requires each token that exists in `chat_template` to be a single token in the tokenizer. Otherwise, the tokenizer will split the token and cause unexpected behavior.
+
+You can add those tokens as new tokens under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `. See [config](../config-reference.qmd) for more details.
+:::
+
+- Continuing from the previous example, if you want to train on the EOT tokens of all trainable turns but only on the last EOS token, set `train_on_eos: last`.
+
+```yaml
+eot_tokens:
+  - "[/INST]"
+  # ...
+
+datasets:
+  - path: ...
+    type: chat_template
+
+    train_on_eos: last
+    train_on_eot: turn
+```
+
+::: {.callout-tip}
+If EOS token only appears at the end of a prompt, `train_on_eos: last` is equivalent to `train_on_eos: turn`. Therefore, generally, you can leave them to their defaults and omit them.
+:::
+
+
+#### Using tool use
+
+Instead of passing `tools` via the system prompt, an alternative method would be to have the `tools` in a separate column and loaded via `chat_template` to let the template dynamically build it.
+
+```json
+{
+    "tools": [
+        {
+            "type": "...",
+            "function": {
+                "name": "...",
+                "description": "...",
+                "parameters": {
+                    "type": "...",
+                    "properties": {
+                        // ...
+                    },
+                    "required": ["..."],
+                },
+            },
+        },
+    ],
+    "messages": [
+        // ...
+        {
+            "role": "assistant", // call the function via assistant
+            "tool_calls": [
+                {
+                    "id": "...",  // required only for mistral
+                    "type": "function",
+                    "function": {
+                        "name": "...",
+                        "arguments": {
+                            "...": "...",
+                        }
+                    }
+                }
+            ]
+        },
+        {
+            "role": "tool",
+            "tool_call_id": "...",  // required only for mistral
+            "name": "...",
+            "content": "..."
+        },
+    ],
+}
+```
+
+::: {.callout-note}
+Tools need to follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
+:::
+
+```yaml
+chat_template: llama4
+datasets:
+  - path: ...
+    type: chat_template
+    # field_tools: tools # default is `tools`
+```
+
+::: {.callout-tip}
+Look into the `chat_template` you are using to see if it supports `tools` and what the expected role is for the tool answer. In the example above, the tool answer is expected to be in the `tool` or `ipython` role for `llama4` template.
+:::
+
+
+#### Using fine-grained control over token masking
+
+(Advanced) Using fine-grained control over tokens and turns to train in a conversation

 For a data sample that looks like:

@@ -162,3 +277,45 @@ datasets:
 ::: {.callout-tip}
 It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
 :::
+
+#### Reasoning split
+
+(For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
+
+```yaml
+datasets:
+  - path: ...
+    type: chat_template
+    chat_template: qwen3
+    split_thinking: true
+```
+
+For example, a content can look like:
+
+```json
+{
+  "content": "<think>Some thinking outputs</think>Output after thinking."
+}
+```
+
+After split, it will look like:
+
+```json
+{
+  "reasoning_content": "Some thinking outputs",
+  "content": "Output after thinking."
+}
+```
+
+
+## sharegpt
+
+::: {.callout-important}
+ShareGPT is deprecated! Please see the [chat_template](#chat_template) section.
+:::
+
+## pygmalion
+
+```{.json filename="data.jsonl"}
+{"conversations": [{"role": "...", "value": "..."}]}
+```
--- a/docs/dataset-formats/index.qmd
+++ b/docs/dataset-formats/index.qmd
@@ -36,10 +36,6 @@ It is typically recommended to save your dataset as `.jsonl` due to its flexibil

 Axolotl supports loading from a Hugging Face hub repo or from local files.

-::: {.callout-important}
-For pre-training only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts.
-:::
-
 ### Pre-training from Hugging Face hub datasets

 As an example, to train using a Hugging Face dataset `hf_org/name`, you can pass the following config:
@@ -77,18 +73,21 @@ datasets:
    type: completion
 ```

-From local files (either example works):
+From local files:

 ```yaml
 datasets:
  - path: A.jsonl
    type: completion

-  - path: json
-    data_files: ["A.jsonl", "B.jsonl", "C.jsonl"]
+  - path: B.jsonl
    type: completion
 ```

+::: {.callout-important}
+For `completion` only, Axolotl splits texts that exceed the context length into multiple smaller prompts. If you are interested in having this for `pretraining_dataset` too, please let us know or help make a PR!
+:::
+
 ### Pre-training dataset configuration tips

 #### Setting max_steps
@@ -457,10 +456,7 @@ datasets:
    type: alpaca
 ```

-Axolotl supports many kinds of instruction dataset. All of them can be found here (https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/inst_tune.html) with their respective type and sample row format.
-
-
-Reference: [Instruction Dataset Documentation](inst_tune.qmd).
+Axolotl supports many kinds of instruction datasets. All of them can be found in the [Instruction Dataset Documentation](inst_tune.qmd) with their respective type and sample row format.

 #### Custom Instruct Prompt Format

--- a/docs/dataset-formats/inst_tune.qmd
+++ b/docs/dataset-formats/inst_tune.qmd
@@ -186,4 +186,4 @@ datasets:
      no_input_format: "[INST] {instruction} [/INST]"
 ```

-See full config options under [here](../config.qmd).
+See full config options under [here](../config-reference.qmd).
--- a/docs/dataset_loading.qmd
+++ b/docs/dataset_loading.qmd
@@ -36,7 +36,7 @@ This matches the API of [`datasets.load_dataset`](https://github.com/huggingface

 For HuggingFace's guide to load different dataset types, see [here](https://huggingface.co/docs/datasets/loading).

-For full details on the config, see [config.qmd](config.qmd).
+For full details on the config, see [config-reference.qmd](config-reference.qmd).

 ::: {.callout-note}

@@ -54,7 +54,7 @@ datasets:

 #### Files

-Usually, to load a JSON file, you would do something like this:
+To load a JSON file, you would do something like this:

 ```python
 from datasets import load_dataset
@@ -66,20 +66,12 @@ Which translates to the following config:

 ```yaml
 datasets:
-  - path: json
-    data_files: /path/to/your/file.jsonl
-```
-
-However, to make things easier, we have added a few shortcuts for loading local dataset files.
-
-You can just point the `path` to the file or directory along with the `ds_type` to load the dataset. The below example shows for a JSON file:
-
-```yaml
-datasets:
-  - path: /path/to/your/file.jsonl
+  - path: data.json
    ds_type: json
 ```

+As the example above shows, you can simply point `path` to the file or directory and set `ds_type` to load the dataset.
+
 This works for CSV, JSON, Parquet, and Arrow files.

 ::: {.callout-tip}
--- a/docs/docker.qmd
+++ b/docs/docker.qmd
@@ -8,6 +8,10 @@ format:

 This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).

+::: {.callout-important}
+For Blackwell GPUs, please use the tags with PyTorch 2.7.1 and CUDA 12.8.
+:::
+
 ## Base

 The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.
@@ -28,9 +32,11 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}

 Tags examples:

+- `main-base-py3.11-cu128-2.7.1`
+- `main-base-py3.11-cu126-2.7.1`
+- `main-base-py3.11-cu126-2.7.0`
+- `main-base-py3.11-cu126-2.6.0`
 - `main-base-py3.11-cu124-2.6.0`
- `main-base-py3.11-cu124-2.5.1`
- `main-base-py3.11-cu124-2.4.1`

 ## Main

@@ -50,7 +56,7 @@ Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl)
 # on push to main
 main-py{python_version}-cu{cuda_version}-{pytorch_version}

-# latest main (currently torch 2.5.1, python 3.11, cuda 12.4)
+# latest main (currently torch 2.6.0, python 3.11, cuda 12.4)
 main-latest

 # nightly build
@@ -68,14 +74,15 @@ There may be some extra tags appended to the image, like `-vllm` which installs

 Tags examples:

+- `main-py3.11-cu128-2.7.1`
+- `main-py3.11-cu126-2.7.1`
+- `main-py3.11-cu126-2.7.0`
+- `main-py3.11-cu126-2.6.0`
 - `main-py3.11-cu124-2.6.0`
- `main-py3.11-cu124-2.5.1`
- `main-py3.11-cu124-2.4.1`
 - `main-latest`
 - `main-20250303-py3.11-cu124-2.6.0`
- `main-20250303-py3.11-cu124-2.5.1`
- `main-20250303-py3.11-cu124-2.4.1`
- `0.7.1`
+- `main-20250303-py3.11-cu126-2.6.0`
+- `0.10.1`

 ## Cloud

--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -9,11 +9,11 @@ description: Frequently asked questions

 > A: Usually an issue with the GPUs communicating with each other. See the [NCCL doc](nccl.qmd)

-**Q: Exitcode -9**
+**Q: exitcode: -9**

 > A: This usually happens when you run out of system RAM.

-**Q: Exitcode -7 while using deepspeed**
+**Q: exitcode: -7 while using deepspeed**

 > A: Try upgrading deepspeed w: `pip install -U deepspeed`

@@ -51,6 +51,18 @@ description: Frequently asked questions
 >   pad_token: "..."
 > ```

+**Q: `IterableDataset error` or `KeyError: 'input_ids'` when using `preprocess` CLI**
+
+> A: This is because you may be using `preprocess` CLI with `pretraining_dataset:` or `skip_prepare_dataset: true` respectively. Please use `axolotl train` CLI directly instead as these datasets are prepared on demand.
+
+**Q: vLLM is not working with Axolotl**
+
+> A: We currently recommend torch 2.6.0 for use with `vllm`. Please ensure you use the right version. For Docker, please use the `main-py3.11-cu124-2.6.0` tag.
+
+**Q: FA2 2.8.0 `undefined symbol` runtime error on CUDA 12.4**
+
+> A: There seems to be a wheel issue with FA2 2.8.0 on CUDA 12.4. Try CUDA 12.6 instead or downgrade to FA2 2.7.4. Please refer to the upstream issue: https://github.com/Dao-AILab/flash-attention/issues/1717.
+
 ### Chat templates

 **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
@@ -73,10 +85,54 @@ description: Frequently asked questions

 > A: This is likely an empty turn.

-**Q: The EOS/EOT token is incorrectly being masked or not being masked.**
+**Q: The EOS token is incorrectly being masked or not being masked / `EOS token __ not found in chat template`.**

-> A: This is because of the mismatch between `tokenizer.eos_token` and EOS/EOT token in template. Please make sure to set `eos_token` under `special_tokens` to the same EOS/EOT token as in template.
+> A: There can be two reasons:
+
+> 1. This is because of the mismatch between `tokenizer.eos_token` and EOS token in template. Please make sure to set `eos_token: ` under `special_tokens: ` to the same EOS token as in template.
+
+> 2. The EOS token is not in the template. Please check if your template is correct. As an example, `phi_35` template does not use its dedicated EOS token `<|endoftext|>` at the end.

 **Q: "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null. Please add a `chat_template` in tokenizer config"**

 > A: This is because the tokenizer does not have a chat template. Please add a chat template in the tokenizer config. See [chat_template](dataset-formats/conversation.qmd#chat-template) for more details.
+
+**Q: The EOT token(s) are incorrectly being masked or not being masked / `EOT token __ not found in chat template`.**
+
+> A: There can be two reasons:
+
+> 1. The EOT token is different from the EOS token and was not specified under `eot_tokens: `. Please set `eot_tokens: ` to the same EOT token(s) as in template.
+
+> 2. There is more than one EOT token per turn in the template. Please raise an issue with examples as we recognize this as an edge case.
+
+**Q: `EOT token encoding failed. Please check if the token is valid and can be encoded.`**
+
+> A: There could be some issue with the tokenizer or unicode encoding. Please raise an issue with examples with the EOT token & tokenizer causing the issue.
+
+**Q: `EOT token __ is encoded as multiple tokens.`**
+
+> A: This is because the EOT token is encoded as multiple tokens which can cause unexpected behavior. Please add it under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `.
+
+**Q: `Conflict between train_on_eos and train_on_eot. eos_token is in eot_tokens and train_on_eos != train_on_eot`**
+
+> A: This is because the EOS token is in `eot_tokens: ` while there is a mismatch between `train_on_eos: ` and `train_on_eot: `. This will cause one to override the other. Please ensure that `train_on_eos: ` and `train_on_eot: ` are the same or remove the EOS token from `eot_tokens: `.
+
+**Q: If `eot_tokens: ` is not provided, what happens?**
+
+> A: If `eot_tokens: ` is not provided, the default behavior is the same as before. EOS tokens used to delimit turns are masked/unmasked depending on whether the turn is trainable.
+
+> Internally, `eot_tokens` is set to `tokenizer.eos_token` and `train_on_eot` is set to `train_on_eos` (which defaults to `turn`). This transition helps clarify the naming and behavior of EOT/EOS tokens.
+
+**Q: `Data processing error: CAS service error`**
+
+> A: Try disabling XET with `export HF_HUB_DISABLE_XET=1`
+
+**Q: `torch._inductor.exc.LoweringException: NoValidChoicesError: No choices to select, please consider adding ATEN into max_autotune_gemm_backends config (defined in torch/_inductor/config.py) to allow at least one choice. `**
+
+> A: Depending on the version of torch, you may need to include this in your YAML:
+
+> ```yaml
+> flex_attn_compile_kwargs:
+>   dynamic: false
+>   mode: max-autotune-no-cudagraphs
+> ```
--- a/docs/fsdp_qlora.qmd
+++ b/docs/fsdp_qlora.qmd
@@ -20,7 +20,7 @@ To enable `QLoRA` with `FSDP`, you need to perform the following steps:
 > See the [example config](#example-config) file in addition to reading these instructions.

 1. Set `adapter: qlora` in your axolotl config file.
-2. Enable FSDP in your axolotl config, as [described here](https://github.com/axolotl-ai-cloud/axolotl?tab=readme-ov-file#fsdp).
+2. Enable FSDP in your axolotl config, as [described here](multi-gpu.qmd#sec-fsdp).
 3. Use one of the supported model types: `llama`, `mistral` or `mixtral`.

 ## Example Config
--- a/docs/getting-started.qmd
+++ b/docs/getting-started.qmd
@@ -55,7 +55,7 @@ output_dir: ./outputs/lora-out
 - To perform QLoRA finetuning, replace with `load_in_4bit: true` and `adapter: qlora`.
 :::

-See our [Config options](config.qmd) for more details.
+See our [config options](config-reference.qmd) for more details.

 ### Training {#sec-training}

@@ -104,7 +104,7 @@ the `alpaca` dataset format, which has the following format:
 Please see our [Dataset Formats](dataset-formats) for more dataset formats and how to
 format them.

-2. Prepare your JSONL data in the specified format (in this case, the expected `alpaca
+2. Prepare your JSONL data in the specified format (in this case, the expected `alpaca`
 format):

 ```json
@@ -120,6 +120,12 @@ axolotl train my_training.yml

 ## Common Tasks {#sec-common-tasks}

+::: {.callout-tip}
+
+The same yaml file is used for training, inference, and merging.
+
+:::
+
 ### Testing Your Model {#sec-testing}

 After training, test your model:
@@ -128,6 +134,16 @@ After training, test your model:
 axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out"
 ```

+More details can be found in [Inference](inference.qmd).
+
+### Using a UI {#sec-ui}
+
+Launch a Gradio interface:
+
+```bash
+axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out" --gradio
+```
+
 ### Preprocessing Data {#sec-preprocessing}

 For large datasets, preprocess first:
@@ -136,14 +152,22 @@ For large datasets, preprocess first:
 axolotl preprocess my_training.yml
 ```

-### Using a UI {#sec-ui}
+Please make sure to set `dataset_prepared_path: ` in your config to set the path to save the prepared dataset.

-Launch a Gradio interface:
+More details can be found in [Dataset Preprocessing](dataset_preprocessing.qmd).
+
+### Merging LoRA weights {#sec-merging-lora}
+
+To merge the LoRA weights back into the base model, run:

 ```bash
-axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out" --gradio
+axolotl merge-lora my_training.yml --lora-model-dir="./outputs/lora-out"
 ```

+The merged model will be saved in the `{output_dir}/merged` directory.
+
+More details can be found in [Merging LoRA weights](inference.qmd#sec-merging).
+
 ## Next Steps {#sec-next-steps}

 Now that you have the basics, you might want to:
@@ -155,7 +179,8 @@ Now that you have the basics, you might want to:

 Check our other guides for details on these topics:

- [Configuration Guide](config.qmd) - Full configuration options
+- [Configuration Guide](config-reference.qmd) - Full configuration options
+- [Dataset Loading](dataset_loading.qmd) - Loading datasets from various sources
 - [Dataset Formats](dataset-formats) - Working with different data formats
 - [Multi-GPU Training](multi-gpu.qmd)
 - [Multi-Node Training](multi-node.qmd)
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -14,11 +14,21 @@ This guide covers all the ways you can install and set up Axolotl for your envir
 ## Requirements {#sec-requirements}

 - NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
- Python ≥3.10
- PyTorch ≥2.4.1
+- Python ≥3.11
+- PyTorch ≥2.6.0

 ## Installation Methods {#sec-installation-methods}

+::: {.callout-important}
+Please make sure to have PyTorch installed before installing Axolotl in your local environment.
+
+Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
+:::
+
+::: {.callout-important}
+For Blackwell GPUs, please use PyTorch 2.7.0 and CUDA 12.8.
+:::
+
 ### PyPI Installation (Recommended) {#sec-pypi}

 ```{.bash}
@@ -31,6 +41,40 @@ installed) in order not to clobber it, and so that we set the correct version of
 dependencies that are specific to the PyTorch version or other installed
 co-dependencies.

+### uv Installation {#sec-uv}
+
+uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments.
+
+Install uv if not already installed
+```{.bash}
+curl -LsSf https://astral.sh/uv/install.sh | sh
+source $HOME/.local/bin/env
+```
+
+Choose your CUDA version to use with PyTorch; e.g. `cu124`, `cu126`, `cu128`,
+then create the venv and activate
+```{.bash}
+export UV_TORCH_BACKEND=cu126
+uv venv --no-project --relocatable
+source .venv/bin/activate
+```
+
+Install PyTorch
+- PyTorch 2.6.0 recommended
+```{.bash}
+uv pip install packaging setuptools wheel
+uv pip install torch==2.6.0
+uv pip install awscli pydantic
+```
+
+Install axolotl from PyPI
+```{.bash}
+uv pip install --no-build-isolation axolotl[deepspeed,flash-attn]
+
+# optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO
+uv pip install --no-build-isolation axolotl[deepspeed,flash-attn,vllm]
+```
+
 ### Edge/Development Build {#sec-edge-build}

 For the latest features between releases:
@@ -66,6 +110,10 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
 ```
 :::

+::: {.callout-important}
+For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.7.0` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.7.0`.
+:::
+
 Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.

 ## Cloud Environments {#sec-cloud}
@@ -105,7 +153,7 @@ We recommend using WSL2 (Windows Subsystem for Linux) or Docker.

 ### Conda/Pip venv {#sec-conda}

-1. Install Python ≥3.10
+1. Install Python ≥3.11
 2. Install PyTorch: https://pytorch.org/get-started/locally/
 3. Install Axolotl:
   ```{.bash}
--- a/docs/lora_optims.qmd
+++ b/docs/lora_optims.qmd
@@ -84,6 +84,10 @@ lora_qkv_kernel: true
 lora_o_kernel: true
 ```

+::: {.callout-note}
+Currently, LoRA kernels are not supported for RLHF training, only SFT.
+:::
+
 ## Requirements

 - One or more NVIDIA or AMD GPUs (in order to use the Triton kernels)
--- a/docs/multi-gpu.qmd
+++ b/docs/multi-gpu.qmd
@@ -23,8 +23,6 @@ Axolotl supports several methods for multi-GPU training:

 ## DeepSpeed {#sec-deepspeed}

-DeepSpeed is the recommended approach for multi-GPU training due to its stability and performance. It provides various optimization levels through ZeRO stages.
-
 ### Configuration {#sec-deepspeed-config}

 Add to your YAML config:
@@ -32,10 +30,12 @@ Add to your YAML config:
 ```{.yaml}
 deepspeed: deepspeed_configs/zero1.json
 ```
-
 ### Usage {#sec-deepspeed-usage}

 ```{.bash}
+# Fetch deepspeed configs (if not already present)
+axolotl fetch deepspeed_configs
+
 # Passing arg via config
 axolotl train config.yml

@@ -48,14 +48,90 @@ axolotl train config.yml --deepspeed deepspeed_configs/zero1.json
 We provide default configurations for:

 - ZeRO Stage 1 (`zero1.json`)
+- ZeRO Stage 1 with torch compile (`zero1_torch_compile.json`)
 - ZeRO Stage 2 (`zero2.json`)
 - ZeRO Stage 3 (`zero3.json`)
+- ZeRO Stage 3 with bf16 (`zero3_bf16.json`)
+- ZeRO Stage 3 with bf16 and CPU offload params (`zero3_bf16_cpuoffload_params.json`)
+- ZeRO Stage 3 with bf16 and CPU offload params and optimizer (`zero3_bf16_cpuoffload_all.json`)

-Choose based on your memory requirements and performance needs.
+::: {.callout-tip}

-## FSDP {#sec-fsdp}
+Choose the configuration that offloads the least amount to memory while still being able to fit on VRAM for best performance.

-### Basic FSDP Configuration {#sec-fsdp-config}
+Start from Stage 1 -> Stage 2 -> Stage 3.
+
+:::
+
+::: {.callout-tip}
+
+Using ZeRO Stage 3 with Single-GPU training
+
+ZeRO Stage 3 can be used for training on a single GPU by manually setting the environment variables:
+`WORLD_SIZE=1 LOCAL_RANK=0 MASTER_ADDR=0.0.0.0 MASTER_PORT=29500`
+
+:::
+
+## Fully Sharded Data Parallel (FSDP) {#sec-fsdp}
+
+::: {.callout-note}
+
+FSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl.
+
+:::
+
+### Migrating from FSDP1 to FSDP2 {#sec-migrate-fsdp1-fsdp2}
+
+To migrate your config from FSDP1 to FSDP2, you must use the `fsdp_version` top-level config field to specify the FSDP version, and
+also follow the config field mapping below to update field names.
+
+#### Config mapping
+
+FSDP1 | FSDP2
+-------- | --------
+fsdp_sharding_strategy | reshard_after_forward
+fsdp_backward_prefetch_policy | **REMOVED**
+fsdp_backward_prefetch | **REMOVED**
+fsdp_forward_prefetch | **REMOVED**
+fsdp_sync_module_states | **REMOVED**
+fsdp_cpu_ram_efficient_loading | cpu_ram_efficient_loading
+fsdp_state_dict_type | state_dict_type
+fsdp_use_orig_params | **REMOVED**
+
+
+For example, if you were using the following FSDP1 config:
+
+```{.yaml}
+fsdp_version: 1
+fsdp_config:
+  fsdp_offload_params: false
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_sharding_strategy: FULL_SHARD
+```
+
+You can migrate to the following FSDP2 config:
+
+```{.yaml}
+fsdp_version: 2
+fsdp_config:
+  offload_params: false
+  cpu_ram_efficient_loading: true
+  auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  transformer_layer_cls_to_wrap: Qwen3DecoderLayer
+  state_dict_type: FULL_STATE_DICT
+  reshard_after_forward: true
+```
+
+### FSDP1 (deprecated) {#sec-fsdp-config}
+
+::: {.callout-note}
+
+Using `fsdp` to configure FSDP is deprecated and will be removed in an upcoming release of Axolotl. Please use `fsdp_config` as above instead.
+
+:::

 ```{.yaml}
 fsdp:
@@ -67,6 +143,7 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
 ```

+
 ## Sequence parallelism {#sec-sequence-parallelism}

 We support sequence parallelism (SP) via the
@@ -74,20 +151,7 @@ We support sequence parallelism (SP) via the
 allows one to split up sequences across GPUs, which is useful in the event that a
 single sequence causes OOM errors during model training.

-First, install `ring-flash-attn`, recommended via `pip install axolotl[ring-flash-attn]`,
-or from source with `pip install .[ring-flash-attn]`.
-
-Your Axolotl YAML config should contain the following lines:
-
-```{.yaml}
-sequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU
-flash_attention: true  # Required with sequence parallelism
-
-# Optional; strides across the key dimension. Larger values use more memory but will make training faster.
-heads_k_stride: 1
-```
-
-See our [dedicated guide](sequence_parallelism.qmd) for more details.
+See our [dedicated guide](sequence_parallelism.qmd) for more information.

 ### FSDP + QLoRA {#sec-fsdp-qlora}

--- a/docs/multi-node.qmd
+++ b/docs/multi-node.qmd
@@ -40,13 +40,13 @@ use_cpu: false

 Configure your model to use FSDP in the Axolotl yaml. For example:
 ```yaml
-fsdp:
-  - full_shard
-  - auto_wrap
+fsdp_version: 2
 fsdp_config:
-  fsdp_offload_params: true
-  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
+  offload_params: true
+  state_dict_type: FULL_STATE_DICT
+  auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  transformer_layer_cls_to_wrap: LlamaDecoderLayer
+  reshard_after_forward: true
 ```

 All you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine.
--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -43,7 +43,7 @@ datasets:
 # leave the vision model and vision tower frozen
 # load_in_8bit: true
 adapter: lora
-lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 # (optional) if you want to resize images to a set size
 image_size: 512
@@ -164,7 +164,7 @@ Here is an example of a multi-modal dataset:
        {
            "role": "user",
            "content": [
-                {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
+                {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
                {"type": "text", "text": "Describe this image in detail."}
            ]
        },
--- a/docs/qat.qmd
+++ b/docs/qat.qmd
@@ -0,0 +1,32 @@
+---
+title: "Quantization Aware Training (QAT)"
+back-to-top-navigation: true
+toc: true
+toc-expand: 2
+toc-depth: 4
+---
+
+## Overview
+
+[Quantization Aware Training](https://pytorch.org/blog/introduction-to-quantization-on-pytorch/#quantization-aware-training) (QAT) is a technique for improving the accuracy of quantized models
+by applying "fake" quantization to the model's weights (and, optionally, activations) during training. This fake
+quantization allows for the model to adjust for noise introduced by the quantization, so when the model is eventually
+quantized, the accuracy loss is minimized. We use the quantization techniques implemented in [torchao](https://github.com/pytorch/ao) to provide
+support for QAT and post-training quantization (PTQ) in axolotl.
+
+We recommend reviewing the excellent QAT tutorial in the [torchtune library](https://pytorch.org/torchtune/main/tutorials/qat_finetune.html#quantizing-the-qat-model),
+and the QAT documentation in the [torchao library](https://github.com/pytorch/ao/tree/main/torchao/quantization/qat), for more details.
+
+## Configuring QAT in Axolotl
+
+To enable QAT in axolotl, add the following to your configuration file:
+
+```yaml
+qat:
+  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
+  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8"
+  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
+  fake_quant_after_n_steps: # Optional[int] = None. The number of training steps after which fake quantization is applied
+```
+
+Once you have finished training, you must quantize your model using the same quantization configuration that you used during training. You can use the [`quantize`](./quantize.qmd) command to do this.
--- a/docs/quantize.qmd
+++ b/docs/quantize.qmd
@@ -0,0 +1,53 @@
+---
+title: "Quantization with torchao"
+back-to-top-navigation: true
+toc: true
+toc-expand: 2
+toc-depth: 4
+---
+
+Quantization is a technique to lower the memory footprint of your model, potentially at the cost of accuracy or model performance. We support quantizing your model using the [torchao](https://github.com/pytorch/ao) library. Quantization is supported for both post-training quantization (PTQ) and quantization-aware training (QAT).
+
+
+::: {.callout-note}
+
+We do not currently support other quantization formats such as GGUF, GPTQ, or EXL2.
+
+:::
+
+## Configuring Quantization in Axolotl
+
+Quantization is configured using the `quantization` key in your configuration file.
+
+```yaml
+base_model: # The path to the model to quantize.
+quantization:
+  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8
+  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
+  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
+  quantize_embedding: # Optional[bool] = False. Whether to quantize the embedding layer.
+
+output_dir:  # The path to the output directory.
+```
+
+Once quantization is complete, your quantized model will be saved in the `{output_dir}/quantized` directory.
+
+You may also use the `quantize` command to quantize a model which has been trained with [QAT](./qat.qmd) - you can do this by using the existing QAT configuration file which
+you used to train the model:
+
+```yaml
+# qat.yml
+qat:
+  activation_dtype: int8
+  weight_dtype: int8
+  group_size: 256
+  quantize_embedding: true
+
+output_dir: # The path to the output directory used during training where the final checkpoint has been saved.
+```
+
+```bash
+axolotl quantize qat.yml
+```
+
+This ensures that an identical quantization configuration is used to quantize the model as was used to train it.
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -16,7 +16,7 @@ feedback. Various methods include, but not limited to:
 - [Identity Preference Optimization (IPO)](#ipo)
 - [Kahneman-Tversky Optimization (KTO)](#kto)
 - [Odds Ratio Preference Optimization (ORPO)](#orpo)
- Proximal Policy Optimization (PPO) (not yet supported in axolotl)
+- [Group Relative Policy Optimization (GRPO)](#grpo)


 ## RLHF using Axolotl
@@ -274,15 +274,14 @@ rl: dpo
 datasets:
  - path: ...
    split: train
-    type: user_defined.default
-
-    field_prompt: "prompt"
-    field_system: "system"
-    field_chosen: "chosen"
-    field_rejected: "rejected"
-    prompt_format: "{prompt}"
-    chosen_format: "{chosen}"
-    rejected_format: "{rejected}"
+    type:
+      field_prompt: "prompt"
+      field_system: "system"
+      field_chosen: "chosen"
+      field_rejected: "rejected"
+      prompt_format: "{prompt}"
+      chosen_format: "{chosen}"
+      rejected_format: "{rejected}"
 ```

 The input format is a simple JSON input with customizable fields based on the above config.
@@ -475,14 +474,13 @@ rl: kto
 datasets:
  - path: ...
    split: train
-    type: user_defined.default
-
-    field_prompt: "prompt"
-    field_system: "system"
-    field_completion: "completion"
-    field_label: "label"
-    prompt_format: "{prompt}"
-    completion_format: "{completion}"
+    type:
+      field_prompt: "prompt"
+      field_system: "system"
+      field_completion: "completion"
+      field_label: "label"
+      prompt_format: "{prompt}"
+      completion_format: "{completion}"
 ```

 The input format is a simple JSON input with customizable fields based on the above config.
@@ -499,12 +497,10 @@ The input format is a simple JSON input with customizable fields based on the ab
 ### GRPO

 ::: {.callout-tip}
-Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
+Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/grpo_code).
 :::

-If you have multiple GPUs available, we reccomend using `vLLM` with the `GRPOTrainer` to significantly speedup trajectory generation during training.
-First, launch a `vLLM` server using `trl vllm-serve` - you may use a config file or CLI overrides to configure your vLLM server. In this example, we're
-using 4 GPUs - 2 for training, and 2 for vLLM:
+In the latest GRPO implementation, `vLLM` is used to significantly speed up trajectory generation during training. In this example, we're using 4 GPUs - 2 for training, and 2 for vLLM:

 ::: {.callout-important}
 Make sure you've installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. `pip install axolotl[vllm]`.
@@ -530,7 +526,7 @@ trl:
 ```

 ```bash
-CUDA_VISIBLE_DEVICES=2,3 axolotl vllm_serve grpo.yaml
+CUDA_VISIBLE_DEVICES=2,3 axolotl vllm-serve grpo.yaml
 ```

 Your `vLLM` instance will now attempt to spin up, and it's time to kick off training utilizing our remaining two GPUs. In another terminal, execute:
@@ -539,6 +535,10 @@ Your `vLLM` instance will now attempt to spin up, and it's time to kick off trai
 CUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2
 ```

+::: {.callout-note}
+Due to TRL's implementation with vLLM, the vLLM instance must use the last N GPUs instead of the first N GPUs. This is why in the example above, we use `CUDA_VISIBLE_DEVICES=2,3` for the vLLM instance.
+:::
+
 #### Reward functions

 GRPO uses custom reward functions and transformations. Please have them ready locally.
@@ -580,7 +580,20 @@ datasets:

 To see other examples of custom reward functions, please see [TRL GRPO Docs](https://github.com/huggingface/trl/blob/main/docs/source/grpo_trainer.md#using-a-custom-reward-function).

-To see description of the configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/config/models/input/v0_4_1/trl.py).
+To see all configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/v0.9.2/src/axolotl/utils/schemas/trl.py).
+
+#### GRPO with DAPO/Dr. GRPO loss
+
+The DAPO paper, and subsequently the Dr. GRPO paper, proposed alternative loss functions for GRPO to remediate the penalty on longer responses.
+
+```yaml
+trl:
+  loss_type: dr_grpo
+  # Normalizes loss based on max completion length (default: 256)
+  max_completion_length:
+```
+
+For more information, see [GRPO docs](https://huggingface.co/docs/trl/v0.17.0/en/grpo_trainer#loss-types).

 ### SimPO

--- a/docs/scripts/generate_config_docs.py
+++ b/docs/scripts/generate_config_docs.py
@@ -0,0 +1,752 @@
+# type: ignore
+
+"""
+Quarto documentation generation from Pydantic models. Uses Pydantic model source code
+to automatically group fields, including inherited fields from parent classes.
+"""
+
+import ast
+import inspect
+import textwrap
+import types
+import typing
+from typing import Any, FrozenSet, Type, Union
+
+from pydantic import BaseModel
+
+from axolotl.utils.schemas.config import AxolotlInputConfig
+
+
+class QuartoGenerator:
+    """Generate Quarto documentation from Pydantic models."""
+
+    def __init__(self):
+        self._class_fields_cache = {}
+        self._inheritance_map_cache = {}
+        self._nested_models_cache = {}
+
+    def _get_direct_fields(self, cls: Type[BaseModel]) -> FrozenSet[str]:
+        """Get fields defined directly in a single class (not inherited)."""
+        if cls in self._class_fields_cache:
+            return self._class_fields_cache[cls]
+
+        fields = set()
+
+        # Get annotated fields
+        if hasattr(cls, "__annotations__"):
+            fields.update(cls.__annotations__.keys())
+
+        # Filter out private/special methods
+        fields = {f for f in fields if not f.startswith("_")}
+
+        result = frozenset(fields)
+        self._class_fields_cache[cls] = result
+        return result
+
+    def _is_pydantic_model(self, type_obj) -> bool:
+        """Check if a type is a Pydantic BaseModel."""
+        return inspect.isclass(type_obj) and issubclass(type_obj, BaseModel)
+
+    # pylint: disable=too-many-return-statements
+    def _extract_nested_type(self, field_type) -> Any:
+        """Extract the actual type from complex type annotations."""
+        # Handle Annotated types (Python 3.9+)
+        if hasattr(typing, "get_origin") and hasattr(typing, "get_args"):
+            origin = typing.get_origin(field_type)
+            args = typing.get_args(field_type)
+
+            if origin is not None:
+                # Handle Annotated[SomeType, ...] - extract the first argument
+                if hasattr(typing, "Annotated") and origin is typing.Annotated:
+                    if args:
+                        return self._extract_nested_type(
+                            args[0]
+                        )  # Recursively process the actual type
+
+                # Handle list[SomeType], List[SomeType], etc.
+                elif origin in (list, typing.List):
+                    if args:
+                        return self._extract_nested_type(
+                            args[0]
+                        )  # Extract element type
+
+                # Handle Union types (including | syntax)
+                elif origin is typing.Union:
+                    # Get non-None types from the Union
+                    non_none_types = [arg for arg in args if arg is not type(None)]
+                    if len(non_none_types) >= 1:
+                        # Prioritize Pydantic models over primitive types
+                        pydantic_models = [
+                            arg
+                            for arg in non_none_types
+                            if self._is_pydantic_model(arg)
+                        ]
+                        if pydantic_models:
+                            # Return the first Pydantic model found
+                            return self._extract_nested_type(pydantic_models[0])
+
+                        # No Pydantic models, return the first non-None type
+                        return self._extract_nested_type(non_none_types[0])
+
+        # Handle new Python 3.10+ union syntax (PeftConfig | None)
+        if hasattr(field_type, "__class__") and field_type.__class__ is types.UnionType:
+            # Get non-None types from the Union
+            non_none_types = [
+                arg for arg in field_type.__args__ if arg is not type(None)
+            ]
+            if len(non_none_types) >= 1:
+                # Prioritize Pydantic models over primitive types
+                pydantic_models = [
+                    arg for arg in non_none_types if self._is_pydantic_model(arg)
+                ]
+                if pydantic_models:
+                    return self._extract_nested_type(pydantic_models[0])
+                return self._extract_nested_type(non_none_types[0])
+
+        # Handle old typing.Union syntax (fallback)
+        if hasattr(field_type, "__origin__"):
+            if field_type.__origin__ is Union:
+                # Get non-None types from the Union
+                non_none_types = [
+                    arg for arg in field_type.__args__ if arg is not type(None)
+                ]
+                if len(non_none_types) >= 1:
+                    # Prioritize Pydantic models over primitive types
+                    pydantic_models = [
+                        arg for arg in non_none_types if self._is_pydantic_model(arg)
+                    ]
+                    if pydantic_models:
+                        return self._extract_nested_type(pydantic_models[0])
+                    return self._extract_nested_type(non_none_types[0])
+            # Handle other generic types like dict[str, Any], etc.
+            elif hasattr(field_type, "__args__"):
+                return field_type
+
+        return field_type
+
+    # pylint: disable=too-many-return-statements
+    def _extract_all_pydantic_models_from_type(
+        self, field_type
+    ) -> list[type[BaseModel]]:
+        """Extract all Pydantic models from a type annotation, including from Unions."""
+        models = []
+
+        if field_type is None:
+            return models
+
+        # Handle Annotated types
+        if hasattr(typing, "get_origin") and hasattr(typing, "get_args"):
+            origin = typing.get_origin(field_type)
+            args = typing.get_args(field_type)
+
+            if origin is not None:
+                # Handle Annotated[SomeType, ...] - extract from the first argument
+                if hasattr(typing, "Annotated") and origin is typing.Annotated:
+                    if args:
+                        models.extend(
+                            self._extract_all_pydantic_models_from_type(args[0])
+                        )
+                    return models
+
+                # Handle list[SomeType], List[SomeType], etc.
+                if origin in (list, typing.List):
+                    if args:
+                        models.extend(
+                            self._extract_all_pydantic_models_from_type(args[0])
+                        )
+                    return models
+
+                # Handle Union types
+                if origin is typing.Union:
+                    for arg in args:
+                        if arg is not type(None):  # Skip None type
+                            models.extend(
+                                self._extract_all_pydantic_models_from_type(arg)
+                            )
+                    return models
+
+        # Handle new Python 3.10+ union syntax
+        if hasattr(field_type, "__class__") and field_type.__class__ is types.UnionType:
+            for arg in field_type.__args__:
+                if arg is not type(None):  # Skip None type
+                    models.extend(self._extract_all_pydantic_models_from_type(arg))
+            return models
+
+        # Handle old typing.Union syntax (fallback)
+        if hasattr(field_type, "__origin__") and field_type.__origin__ is Union:
+            for arg in field_type.__args__:
+                if arg is not type(None):  # Skip None type
+                    models.extend(self._extract_all_pydantic_models_from_type(arg))
+            return models
+
+        # Check if this type itself is a Pydantic model
+        if self._is_pydantic_model(field_type):
+            models.append(field_type)
+
+        return models
+
+    def _get_nested_models(
+        self, model_class: type[BaseModel], visited=None
+    ) -> dict[str, type[BaseModel]]:
+        """Get all nested Pydantic models from a model class."""
+        if visited is None:
+            visited = set()
+
+        # Avoid infinite recursion
+        if model_class in visited:
+            return {}
+
+        if model_class in self._nested_models_cache:
+            return self._nested_models_cache[model_class]
+
+        visited.add(model_class)
+        nested_models = {}
+
+        # Check all fields in the model
+        for field_info in model_class.model_fields.values():
+            field_type = self._extract_nested_type(field_info.annotation)
+
+            if self._is_pydantic_model(field_type):
+                nested_models[field_type.__name__] = field_type
+                # Recursively get nested models from this nested model
+                deeper_nested = self._get_nested_models(field_type, visited.copy())
+                nested_models.update(deeper_nested)
+
+        self._nested_models_cache[model_class] = nested_models
+        return nested_models
+
+    def _build_inheritance_map(self, child_class: Type[BaseModel]):
+        """Build inheritance map for a class and all its parents."""
+        if child_class in self._inheritance_map_cache:
+            return self._inheritance_map_cache[child_class]
+
+        inheritance_map = {}
+
+        # Get MRO and filter out BaseModel and object
+        mro_classes = [
+            cls
+            for cls in child_class.__mro__
+            if cls not in (BaseModel, object) and hasattr(cls, "__annotations__")
+        ]
+
+        # Process each class in the MRO
+        for cls in mro_classes:
+            inheritance_map[cls] = self._get_direct_fields(cls)
+
+        self._inheritance_map_cache[child_class] = inheritance_map
+        return inheritance_map
+
+    def _wrap_comment(self, text: str, width: int = 88) -> list[str]:
+        """Wrap a comment to specified width, accounting for '# ' prefix."""
+        if not text.strip():
+            return ["#"]
+
+        # Account for "# " prefix (2 characters)
+        content_width = width - 2
+        wrapped_lines = textwrap.wrap(text, width=content_width)
+        return [f"# {line}" for line in wrapped_lines]
+
+    def _extract_type_from_source(
+        self, model_class: type[BaseModel], field_name: str
+    ) -> str:
+        """Extract the actual type annotation text from source code, checking inheritance chain."""
+        # Use inheritance map to check classes efficiently
+        inheritance_map = self._build_inheritance_map(model_class)
+
+        # Check classes in MRO order
+        for cls in model_class.__mro__:
+            if cls in inheritance_map and field_name in inheritance_map[cls]:
+                type_annotation = self._get_type_from_class_source(cls, field_name)
+                if type_annotation != "unknown":
+                    return type_annotation
+
+        return "unknown"
+
+    def _get_type_from_class_source(self, class_obj: type, field_name: str) -> str:
+        """Extract type annotation from a specific class's source code."""
+        try:
+            source = inspect.getsource(class_obj)
+            tree = ast.parse(source)
+        except (OSError, TypeError):
+            return "unknown"
+
+        # Find the class definition
+        for node in tree.body:
+            if isinstance(node, ast.ClassDef) and node.name == class_obj.__name__:
+                # Find the field assignment
+                for body_node in node.body:
+                    if isinstance(body_node, ast.AnnAssign) and isinstance(
+                        body_node.target, ast.Name
+                    ):
+                        if body_node.target.id == field_name and body_node.annotation:
+                            return ast.unparse(body_node.annotation)
+                break
+
+        return "unknown"
+
+    def _extract_field_groups_from_all_classes(
+        self, model_class: type[BaseModel]
+    ) -> list[dict]:
+        """Extract field groups from all classes in the inheritance hierarchy."""
+        all_groups = []
+        inheritance_map = self._build_inheritance_map(model_class)
+
+        # Get all Pydantic base classes in MRO order (most specific first)
+        # This puts AxolotlInputConfig fields first, then parent class fields
+        pydantic_classes = [
+            cls
+            for cls in model_class.__mro__
+            if cls in inheritance_map and inheritance_map[cls]
+        ]
+
+        # Extract groups from each class
+        for cls in pydantic_classes:
+            class_groups = self._extract_field_groups_from_source(cls)
+            for group in class_groups:
+                all_groups.append(group)
+
+        # If no groups found, create a default grouping by class
+        if not all_groups:
+            for cls in pydantic_classes:
+                fields_in_class = inheritance_map[cls]
+                if fields_in_class:
+                    all_groups.append(
+                        {
+                            "fields": list(fields_in_class),
+                        }
+                    )
+
+        return all_groups
+
+    # pylint: disable=too-many-return-statements
+    def _extract_field_groups_from_source(
+        self, model_class: type[BaseModel]
+    ) -> list[dict]:
+        """Extract field groups from source code based on blank lines and comments."""
+        try:
+            source = inspect.getsource(model_class)
+            tree = ast.parse(source)
+        except (OSError, TypeError):
+            # Fallback if we can't get source code
+            fields_in_class = self._get_direct_fields(model_class)
+            if fields_in_class:
+                return [
+                    {
+                        "fields": list(fields_in_class),
+                    }
+                ]
+            return []
+
+        groups = []
+        current_group_fields = []
+        current_group_comment = None
+
+        # Find the class definition
+        class_node = None
+        for node in ast.walk(tree):
+            if isinstance(node, ast.ClassDef) and node.name == model_class.__name__:
+                class_node = node
+                break
+
+        if not class_node:
+            fields_in_class = self._get_direct_fields(model_class)
+            if fields_in_class:
+                return [
+                    {
+                        "fields": list(fields_in_class),
+                    }
+                ]
+            return []
+
+        # Parse the source lines to detect groupings
+        source_lines = source.split("\n")
+
+        # Get fields that are actually defined in this specific class
+        fields_in_class = self._get_direct_fields(model_class)
+
+        # Find assignments that correspond to model fields for THIS class only
+        field_assignments = []
+        for node in class_node.body:
+            if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
+                field_name = node.target.id
+                if field_name in fields_in_class:
+                    field_assignments.append(
+                        {
+                            "name": field_name,
+                            "lineno": node.lineno,
+                            "end_lineno": getattr(node, "end_lineno", node.lineno),
+                        }
+                    )
+
+        if not field_assignments:
+            if fields_in_class:
+                return [
+                    {
+                        "fields": list(fields_in_class),
+                    }
+                ]
+            return []
+
+        # Sort by line number
+        field_assignments.sort(key=lambda x: x["lineno"])
+
+        # Group fields based on blank lines and comments
+        for i, field_info in enumerate(field_assignments):
+            field_name = field_info["name"]
+            current_line = field_info["lineno"]
+
+            # Check if this starts a new group (blank line before or significant gap)
+            is_new_group = False
+
+            if i == 0:
+                is_new_group = True
+            else:
+                prev_end_line = field_assignments[i - 1]["end_lineno"]
+
+                # Check for blank lines or comments between fields
+                lines_between = source_lines[prev_end_line : current_line - 1]
+                has_blank_line = any(line.strip() == "" for line in lines_between)
+                has_comment = any(
+                    line.strip().startswith("#") for line in lines_between
+                )
+
+                # Start new group if there's a blank line or comment, or significant gap
+                if has_blank_line or has_comment or (current_line - prev_end_line > 3):
+                    is_new_group = True
+
+            if is_new_group and current_group_fields:
+                # Save the previous group
+                groups.append(
+                    {
+                        "fields": current_group_fields.copy(),
+                        "description": current_group_comment,
+                    }
+                )
+                current_group_fields = []
+                current_group_comment = None
+
+            current_group_fields.append(field_name)
+
+        # Add the final group
+        if current_group_fields:
+            groups.append(
+                {
+                    "fields": current_group_fields,
+                    "description": current_group_comment,
+                }
+            )
+
+        return groups
+
+    def _generate_field_documentation(
+        self,
+        model_class: type[BaseModel],
+        field_name: str,
+        field_info: dict,
+        field_type_str: str,
+        is_required: bool,
+        indent_level: int = 0,
+        visited_models: set = None,
+    ) -> list[str]:
+        """Generate documentation for a single field, expanding nested models inline.
+
+        Args:
+            model_class: Model that owns the field; its ``model_fields`` entry
+                supplies the annotation used to detect nested models.
+            field_name: Name of the field to document.
+            field_info: Mapping read for its ``"description"`` and ``"default"``
+                keys.
+            field_type_str: Type text printed after the field name.
+            is_required: Whether to append the `` (required)`` marker.
+            indent_level: Nesting depth; every level indents by two spaces.
+            visited_models: Models already expanded on the current path, used to
+                stop recursive expansion. A new set is created when ``None``.
+
+        Returns:
+            The output lines: the wrapped description comment (if any), the
+            ``name: type`` line, and for each nested model not yet visited a
+            ``# For <Model>:`` header followed by its fields one level deeper.
+        """
+        if visited_models is None:
+            visited_models = set()
+
+        lines = []
+        indent = "  " * indent_level
+
+        # Get the actual field type for nested model detection
+        if field_name in model_class.model_fields:
+            pydantic_field_info = model_class.model_fields[field_name]
+            actual_field_type = pydantic_field_info.annotation
+        else:
+            actual_field_type = None
+
+        # Add description comment if available
+        description = field_info.get("description", "")
+        if description:
+            # Shrink the wrap width by the indent so indented lines stay within 88
+            wrapped_lines = self._wrap_comment(description, width=88 - len(indent))
+            for line in wrapped_lines:
+                lines.append(f"{indent}{line}")
+
+        # Extract nested Pydantic models from the type annotation
+        nested_models = self._extract_all_pydantic_models_from_type(actual_field_type)
+
+        # Filter out already visited models to prevent infinite recursion
+        expandable_models = [
+            model for model in nested_models if model not in visited_models
+        ]
+
+        if expandable_models:
+            # This field contains Pydantic models that can be expanded
+
+            # Show the field with its full type annotation
+            field_line = f"{indent}{field_name}: {field_type_str}"
+            if field_info.get("default") is not None:
+                field_line += f" = {field_info['default']}"
+            if is_required:
+                field_line += " (required)"
+            lines.append(field_line)
+
+            # Add to visited to prevent infinite recursion
+            new_visited = visited_models.copy()
+            new_visited.update(expandable_models)
+
+            # Expand each nested Pydantic model
+            for i, nested_model in enumerate(expandable_models):
+                if i > 0:
+                    # NOTE(review): this separator is the string "\n", whereas the
+                    # group separator further down appends ""; if the caller joins
+                    # the lines with newlines this yields a wider gap - confirm
+                    # that the difference is intended.
+                    lines.append("\n")
+                lines.append(f"{indent}  # For {nested_model.__name__}:")
+
+                # Get nested model schema
+                try:
+                    nested_schema = nested_model.model_json_schema()
+                    nested_properties = nested_schema.get("properties", {})
+                    nested_required = nested_schema.get("required", [])
+                except Exception:  # pylint: disable=broad-exception-caught
+                    # Fallback: use model fields directly
+                    nested_properties = {}
+                    nested_required = []
+                    for (
+                        nested_field_name,
+                        nested_field_info,
+                    ) in nested_model.model_fields.items():
+                        # Prefer a description stored in json_schema_extra, then
+                        # the field's own description attribute
+                        nested_description = ""
+                        if (
+                            hasattr(nested_field_info, "json_schema_extra")
+                            and nested_field_info.json_schema_extra
+                        ):
+                            nested_description = (
+                                nested_field_info.json_schema_extra.get(
+                                    "description", ""
+                                )
+                            )
+                        elif (
+                            hasattr(nested_field_info, "description")
+                            and nested_field_info.description
+                        ):
+                            nested_description = nested_field_info.description
+
+                        # Keep the default only when it is set and is not the
+                        # "PydanticUndefined" placeholder
+                        nested_default_val = None
+                        if (
+                            hasattr(nested_field_info, "default")
+                            and nested_field_info.default is not None
+                        ):
+                            if str(nested_field_info.default) != "PydanticUndefined":
+                                nested_default_val = nested_field_info.default
+
+                        nested_properties[nested_field_name] = {
+                            "type": "unknown",
+                            "description": nested_description,
+                            "default": nested_default_val,
+                        }
+
+                        if nested_field_info.is_required():
+                            nested_required.append(nested_field_name)
+
+                # Get field groups for the nested model
+                nested_field_groups = self._extract_field_groups_from_all_classes(
+                    nested_model
+                )
+
+                # Generate nested fields with increased indentation
+                # (this loop reuses the name ``i``; the outer loop's ``i`` is not
+                # read again before enumerate() rebinds it)
+                for i, group in enumerate(nested_field_groups):
+                    if not group["fields"]:
+                        continue
+
+                    # Add blank line between groups (except before first group)
+                    if i > 0:
+                        lines.append("")
+
+                    # Process nested fields
+                    for nested_field_name in group["fields"]:
+                        # Fields missing from the schema properties are skipped
+                        if nested_field_name not in nested_properties:
+                            continue
+
+                        nested_field_info = nested_properties[nested_field_name]
+                        nested_field_type = self._extract_type_from_source(
+                            nested_model, nested_field_name
+                        )
+                        nested_is_required = nested_field_name in nested_required
+
+                        # Recursively generate documentation for nested field
+                        nested_lines = self._generate_field_documentation(
+                            nested_model,
+                            nested_field_name,
+                            nested_field_info,
+                            nested_field_type,
+                            nested_is_required,
+                            indent_level + 1,
+                            new_visited,
+                        )
+                        lines.extend(nested_lines)
+        else:
+            # Regular field (no expandable nested models)
+            field_line = f"{indent}{field_name}: {field_type_str}"
+            if field_info.get("default") is not None:
+                field_line += f" = {field_info['default']}"
+            if is_required:
+                field_line += " (required)"
+            lines.append(field_line)
+
+        return lines
+
+    def generate_qmd(
+        self,
+        model_class: type[BaseModel],
+        title: str | None = None,
+        expand_nested: bool = True,
+    ) -> str:
+        """Auto-generate config reference documentation including inherited fields.
+
+        Args:
+            model_class: Pydantic model whose fields are documented.
+            title: Title written to the QMD front matter. Defaults to
+                ``"<model class name> Reference"``.
+            expand_nested: When true, every field is rendered through
+                ``_generate_field_documentation`` (nested models are expanded
+                inline); otherwise a flat "comment + one line per field" listing
+                is produced.
+
+        Returns:
+            The QMD document: front matter followed by one fenced ``yaml`` code
+            block, with runs of blank lines collapsed and exactly one trailing
+            newline.
+        """
+
+        if title is None:
+            title = f"{model_class.__name__} Reference"
+
+        # Try to get JSON schema, with fallback for serialization issues
+        try:
+            schema = model_class.model_json_schema()
+            properties = schema.get("properties", {})
+            required = schema.get("required", [])
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            print(
+                f"Warning: Could not generate JSON schema ({e}). Using model fields instead."
+            )
+            # Fallback: build a minimal schema-like mapping from the model fields
+            properties = {}
+            required = []
+            for field_name, field_info in model_class.model_fields.items():
+                # Prefer a description stored in json_schema_extra, but fall back
+                # to the field's own description when the extra is absent, is not
+                # a mapping (pydantic also accepts a callable there), or carries
+                # no "description" entry. This path must not raise: it is the
+                # recovery route for schema generation failures.
+                extra = getattr(field_info, "json_schema_extra", None)
+                description = ""
+                if isinstance(extra, dict):
+                    description = extra.get("description") or ""
+                if not description:
+                    description = getattr(field_info, "description", None) or ""
+
+                # Get default value, treating Pydantic's "undefined" marker as
+                # "no default"
+                default_val = getattr(field_info, "default", None)
+                if str(default_val) == "PydanticUndefined":
+                    default_val = None
+
+                properties[field_name] = {
+                    "type": "unknown",
+                    "description": description,
+                    "default": default_val,
+                }
+
+                if field_info.is_required():
+                    required.append(field_name)
+
+        # Extract field groups from all classes in inheritance hierarchy
+        field_groups = self._extract_field_groups_from_all_classes(model_class)
+
+        # Start building QMD content
+        qmd_lines = [
+            "---",
+            f"title: {title}",
+            "description: A complete list of all configuration options.",
+            "---",
+            "",
+        ]
+
+        # Generate one big code block with all fields (inline nested expansion)
+        qmd_lines.append("```yaml")
+
+        for i, group in enumerate(field_groups):
+            if not group["fields"]:
+                continue
+
+            # Add blank line between groups (except before first group)
+            if i > 0:
+                qmd_lines.append("")
+
+            # Process fields in the order they appear in source
+            for field_name in group["fields"]:
+                # Groups come from the class sources; skip names that have no
+                # corresponding schema property.
+                if field_name not in properties:
+                    continue
+
+                field_info = properties[field_name]
+                field_type = self._extract_type_from_source(model_class, field_name)
+                is_required = field_name in required
+
+                if expand_nested:
+                    # Check if this field has nested models
+                    if field_name in model_class.model_fields:
+                        pydantic_field_info = model_class.model_fields[field_name]
+                        nested_models = self._extract_all_pydantic_models_from_type(
+                            pydantic_field_info.annotation
+                        )
+                        has_nested = bool(nested_models)
+                    else:
+                        has_nested = False
+
+                    # Add blank line before nested config
+                    if has_nested:
+                        qmd_lines.append("")
+
+                    # Use the inline generation method; every top-level field
+                    # starts with a fresh visited set.
+                    field_lines = self._generate_field_documentation(
+                        model_class,
+                        field_name,
+                        field_info,
+                        field_type,
+                        is_required,
+                        indent_level=0,
+                        visited_models=set(),
+                    )
+                    qmd_lines.extend(field_lines)
+
+                    # Add blank line after nested config
+                    if has_nested:
+                        qmd_lines.append("")
+                else:
+                    # Original simple approach
+                    description = field_info.get("description", "")
+                    default = field_info.get("default")
+
+                    # Add wrapped comment for description
+                    if description:
+                        wrapped_lines = self._wrap_comment(description)
+                        qmd_lines.extend(wrapped_lines)
+
+                    line = f"{field_name}: {field_type}"
+                    if default is not None:
+                        line += f" = {default}"
+                    if is_required:
+                        line += " (required)"
+                    qmd_lines.append(line)
+
+        qmd_lines.append("```")
+
+        # Join all lines and clean up any double newlines
+        content = "\n".join(qmd_lines)
+
+        # Replace multiple consecutive newlines with just two newlines (one blank
+        # line); the group/nested spacing above may emit adjacent blank lines.
+        import re
+
+        content = re.sub(r"\n{3,}", "\n\n", content)
+
+        # Ensure single newline at the very end
+        content = content.rstrip("\n") + "\n"
+
+        return content
+
+
+def main():
+    """Render the AxolotlInputConfig reference and save it as docs/config-reference.qmd."""
+    doc_generator = QuartoGenerator()
+
+    print("Generating config reference content...")
+    rendered_reference = doc_generator.generate_qmd(
+        AxolotlInputConfig,
+        title="Config Reference",
+        expand_nested=True,
+    )
+
+    print("Writing to file...")
+    # Path is relative to the current working directory.
+    with open("docs/config-reference.qmd", "w", encoding="utf-8") as qmd_file:
+        qmd_file.write(rendered_reference)
+    print("Done!")
+
+
+# Script entry point: regenerate the config reference only when this module is
+# executed directly, not when it is imported.
+if __name__ == "__main__":
+    main()
--- a/docs/sequence_parallelism.qmd
+++ b/docs/sequence_parallelism.qmd
@@ -3,8 +3,6 @@ title: Sequence Parallelism
 description: Train with long sequences split across multiple GPUs.
 ---

-# Sequence Parallelism
-
 Sequence parallelism is a technique that splits sequences across multiple GPUs,
 allowing you to train with very long sequences that wouldn't fit on a single GPU. Each
 GPU processes a different portion of the sequence, and the results are aggregated
@@ -27,6 +25,9 @@ To enable sequence parallelism, add the following to your configuration file:
 sequence_parallel_degree: 4  # Split sequences across 4 GPUs
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 heads_k_stride: 1
+# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
+# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
+ring_attn_func:
 ```

 The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:
@@ -40,7 +41,7 @@ When sequence parallelism is enabled:

 1. Each sequence is divided into equal chunks across the GPUs in a sequence parallel group
 2. The data collator handles the chunking of input_ids, attention_mask, labels, and position_ids
-3. Position IDs are adjusted to maintain proper relative positions, especially for packed sequences
+3. Position IDs are adjusted to maintain proper relative positions
 4. The trainer uses special ring communication patterns for attention operations

 ## Requirements
@@ -66,9 +67,11 @@ sequence_len: 8192
 ...

 sequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU
-flash_attention: true  # Required with sequence parallelism
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 heads_k_stride: 1
+# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
+# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
+ring_attn_func:

 ...
 ```
--- a/examples/archived/README.md
+++ b/examples/archived/README.md
@@ -0,0 +1,5 @@
+# Archived Examples
+
+This directory contains examples that are no longer maintained and may no longer be functional.
+
+We keep them around for archival purposes in case they are useful to others.
--- a/examples/archived/cerebras/btlm-ft.yml
+++ b/examples/archived/cerebras/btlm-ft.yml
@@ -8,7 +8,6 @@ tokenizer_type: GPT2Tokenizer
 trust_remote_code: true
 tokenizer_use_fast: true
 tokenizer_legacy: true
-strict: false
 push_dataset_to_hub:
 hf_use_auth_token: true
 datasets:
--- a/examples/archived/cerebras/qlora.yml
+++ b/examples/archived/cerebras/qlora.yml
@@ -4,7 +4,6 @@ base_model: cerebras/Cerebras-GPT-1.3B

 load_in_8bit: false
 load_in_4bit: true
-strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/archived/code-llama/13b/lora.yml
+++ b/examples/archived/code-llama/13b/lora.yml
@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer

 load_in_8bit: true
 load_in_4bit: false
-strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/archived/code-llama/13b/qlora.yml
+++ b/examples/archived/code-llama/13b/qlora.yml
@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer

 load_in_8bit: false
 load_in_4bit: true
-strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/archived/code-llama/34b/lora.yml
+++ b/examples/archived/code-llama/34b/lora.yml
@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer

 load_in_8bit: true
 load_in_4bit: false
-strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/archived/code-llama/34b/qlora.yml
+++ b/examples/archived/code-llama/34b/qlora.yml
@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer

 load_in_8bit: false
 load_in_4bit: true
-strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/archived/code-llama/7b/lora.yml
+++ b/examples/archived/code-llama/7b/lora.yml
@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer

 load_in_8bit: true
 load_in_4bit: false
-strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/archived/code-llama/7b/qlora.yml
+++ b/examples/archived/code-llama/7b/qlora.yml
@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer

 load_in_8bit: false
 load_in_4bit: true
-strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/archived/code-llama/README.md
+++ b/examples/archived/code-llama/README.md
--- a/examples/archived/dbrx/16bit-lora.yaml
+++ b/examples/archived/dbrx/16bit-lora.yaml
@@ -3,7 +3,6 @@ base_model: LnL-AI/dbrx-base-converted-v2
 # hub_model_id: username/custom_model_name

 trust_remote_code: true
-strict: false

 datasets:
  - path: tatsu-lab/alpaca
--- a/examples/archived/dbrx/8bit-lora.yaml
+++ b/examples/archived/dbrx/8bit-lora.yaml
@@ -6,7 +6,6 @@ trust_remote_code: true

 load_in_8bit: true
 load_in_4bit: false
-strict: false

 datasets:
  - path: tatsu-lab/alpaca
--- a/examples/archived/dbrx/README.md
+++ b/examples/archived/dbrx/README.md
--- a/examples/archived/dbrx/fft-ds-zero3.yaml
+++ b/examples/archived/dbrx/fft-ds-zero3.yaml
@@ -3,7 +3,6 @@ base_model: LnL-AI/dbrx-base-converted-v2
 # hub_model_id: username/custom_model_name

 trust_remote_code: true
-strict: false

 datasets:
  - path: tatsu-lab/alpaca
--- a/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml
+++ b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml
@@ -0,0 +1,58 @@
+base_model: agentica-org/DeepCoder-14B-Preview
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+
+datasets:
+  - path: fozziethebeat/alpaca_messages_2k_test
+    type: chat_template
+    field_messages: messages
+    message_property_mappings:
+      role: role
+      content: content
+
+dataset_prepared_path:
+val_set_size: 0.05
+output_dir: ./outputs/lora-out
+
+sequence_len: 4096
+sample_packing: true
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 2
+micro_batch_size: 2
+num_epochs: 4
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: true
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_steps: 10
+evals_per_epoch: 1
+saves_per_epoch: 1
+weight_decay: 0.0
+special_tokens:
--- a/examples/archived/falcon/config-7b-lora.yml
+++ b/examples/archived/falcon/config-7b-lora.yml
@@ -11,7 +11,6 @@ trust_remote_code: true
 load_in_8bit: true
 load_in_4bit: false
 gptq: false
-strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/archived/falcon/config-7b-qlora.yml
+++ b/examples/archived/falcon/config-7b-qlora.yml
@@ -15,7 +15,6 @@ load_in_8bit: false
 # enable 4bit for QLoRA
 load_in_4bit: true
 gptq: false
-strict: false
 push_dataset_to_hub:
 datasets:
  - path: QingyiSi/Alpaca-CoT
--- a/examples/archived/falcon/config-7b.yml
+++ b/examples/archived/falcon/config-7b.yml
@@ -8,7 +8,6 @@ tokenizer_type: AutoTokenizer
 # required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
 trust_remote_code: true
 gptq: false
-strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/archived/gemma/qlora.yml
+++ b/examples/archived/gemma/qlora.yml
@@ -8,7 +8,6 @@ tokenizer_type: AutoTokenizer

 load_in_8bit: false
 load_in_4bit: true
-strict: false

 # huggingface repo
 datasets:
--- a/examples/archived/gptj/qlora.yml
+++ b/examples/archived/gptj/qlora.yml
@@ -4,7 +4,6 @@ base_model: EleutherAI/gpt-j-6b

 load_in_8bit: false
 load_in_4bit: true
-strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/archived/jeopardy-bot/config.yml
+++ b/examples/archived/jeopardy-bot/config.yml
--- a/examples/archived/mpt-7b/README.md
+++ b/examples/archived/mpt-7b/README.md
--- a/examples/archived/mpt-7b/config.yml
+++ b/examples/archived/mpt-7b/config.yml
--- a/examples/archived/openllama-3b/README.md
+++ b/examples/archived/openllama-3b/README.md
--- a/examples/archived/openllama-3b/config.yml
+++ b/examples/archived/openllama-3b/config.yml
@@ -4,7 +4,6 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/archived/openllama-3b/lora.yml
+++ b/examples/archived/openllama-3b/lora.yml
@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer

 load_in_8bit: true
 load_in_4bit: false
-strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/archived/openllama-3b/qlora.yml
+++ b/examples/archived/openllama-3b/qlora.yml
@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer

 load_in_8bit: false
 load_in_4bit: true
-strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/archived/pythia-12b/README.md
+++ b/examples/archived/pythia-12b/README.md
--- a/examples/archived/pythia-12b/config.yml
+++ b/examples/archived/pythia-12b/config.yml
--- a/examples/archived/pythia/lora.yml
+++ b/examples/archived/pythia/lora.yml
--- a/examples/archived/qwen/README.md
+++ b/examples/archived/qwen/README.md
--- a/examples/archived/qwen/lora.yml
+++ b/examples/archived/qwen/lora.yml
@@ -9,7 +9,6 @@ trust_remote_code: true

 load_in_8bit: true
 load_in_4bit: false
-strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/archived/qwen/qlora.yml
+++ b/examples/archived/qwen/qlora.yml
@@ -9,7 +9,6 @@ trust_remote_code: true

 load_in_8bit: false
 load_in_4bit: true
-strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/archived/qwen/qwen2-moe-lora.yaml
+++ b/examples/archived/qwen/qwen2-moe-lora.yaml
@@ -3,7 +3,6 @@ base_model: Qwen/Qwen1.5-MoE-A2.7B
 # hub_model_id: username/custom_model_name

 trust_remote_code: true
-strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/Show More
+++ b/Show More