fix(test): replace jackfram llama with smollm

2025-02-28 16:40:49 +07:00
421 changed files with 4913 additions and 16336 deletions
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,14 +0,0 @@
-[run]
-source = axolotl
-omit =
-    */tests/*
-    setup.py
-
-[report]
-exclude_lines =
-    pragma: no cover
-    def __repr__
-    raise NotImplementedError
-    if __name__ == .__main__.:
-    pass
-    raise ImportError
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -40,36 +40,6 @@ jobs:
            python_version: "3.11"
            pytorch: 2.6.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-          - cuda: "126"
-            cuda_version: 12.6.3
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.6.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-          - cuda: "126"
-            cuda_version: 12.6.3
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.7.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-          - cuda: "128"
-            cuda_version: 12.6.3
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.7.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: nightly
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: next
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -91,7 +61,7 @@ jobs:
        uses: docker/build-push-action@v4
        with:
          context: .
-          file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || matrix.pytorch == 'next' && './docker/Dockerfile-base-next' || './docker/Dockerfile-base' }}
+          file: ./docker/Dockerfile-base
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -20,12 +20,9 @@ jobs:
          uses: actions/setup-python@v5
          with:
            python-version: '3.11'
-        - name: Install dependencies
+        - name: install dependencies
          run: |
-            python3 -m pip install jupyter quartodoc
-            python3 -m pip install -e . --no-deps
-        - name: Build autodoc
-          run: quartodoc build
+            python3 -m pip install jupyter
        - name: Publish to GitHub Pages (and render)
          uses: quarto-dev/quarto-actions/publish@v2
          with:
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -25,17 +25,12 @@ jobs:
            python_version: "3.11"
            pytorch: 2.5.1
            axolotl_extras: vllm
+            is_latest: true
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.6.0
-            axolotl_extras: vllm
-            is_latest: true
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.0
-            axolotl_extras: vllm
+            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
@@ -92,17 +87,7 @@ jobs:
            python_version: "3.11"
            pytorch: 2.5.1
            axolotl_extras:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            axolotl_extras:
            is_latest: true
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.0
-            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
@@ -148,7 +133,7 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.6.0
+            pytorch: 2.4.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -24,13 +24,6 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            axolotl_extras: vllm
-            num_gpus: 2
-            nightly_build: "true"
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
@@ -45,10 +38,11 @@ jobs:
            axolotl_extras: vllm
            num_gpus: 2
            nightly_build: "true"
-          - cuda: 126
-            cuda_version: 12.6.3
+          - cuda: 124
+            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.7.0
+            pytorch: 2.6.0
+            # awaiting vllm#12721
            axolotl_extras:
            num_gpus: 2
            nightly_build: "true"
@@ -74,7 +68,6 @@ jobs:
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
-          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
          modal run cicd.multigpu
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -80,11 +80,6 @@ jobs:
            python_version: "3.11"
            pytorch: 2.5.1
            axolotl_extras:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
--- a/.github/workflows/precommit-autoupdate.yml
+++ b/.github/workflows/precommit-autoupdate.yml
@@ -1,49 +0,0 @@
-name: Pre-commit auto-update
-
-on:
-  schedule:
-    - cron: '0 0 * * 0'  # Run weekly
-  workflow_dispatch:  # Manual kickoff
-
-jobs:
-  auto-update:
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-      pull-requests: write
-    steps:
-      - uses: actions/checkout@v4
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Update pre-commit hooks
-        id: update
-        run: |
-          pip install pre-commit
-          pre-commit autoupdate
-          if [[ -n $(git status --porcelain) ]]; then
-            echo "changes=true" >> $GITHUB_OUTPUT
-            git diff .pre-commit-config.yaml > pre-commit-update.diff
-          fi
-
-      - name: Create Pull Request
-        if: steps.update.outputs.changes == 'true'
-        uses: peter-evans/create-pull-request@v6
-        with:
-          token: ${{ secrets.GITHUB_TOKEN }}
-          branch: update/pre-commit-hooks
-          delete-branch: true
-          title: "chore: update pre-commit hooks"
-          commit-message: "chore: update pre-commit hooks"
-          body: |
-            Automated PR to update pre-commit hooks to their latest versions.
-
-            <details>
-            <summary>Changes:</summary>
-
-            ```diff
-            ${{ steps.update.outputs.diff }}
-            ```
-            </details>
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -40,7 +40,7 @@ jobs:

      - name: Install dependencies
        run: |
-          pip3 install wheel packaging==23.2
+          pip3 install wheel packaging
          pip3 install --no-build-isolation -e .
          pip3 install -r requirements-dev.txt -r requirements-tests.txt

--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -33,15 +33,6 @@ jobs:
      - name: Check out repository code
        uses: actions/checkout@v4

-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
-
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
@@ -51,11 +42,11 @@ jobs:
      - name: upgrade pip
        run: |
          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
+          pip3 install --upgrade packaging setuptools wheel

      - name: Install PyTorch
        run: |
-          pip3 install torch==${{ matrix.pytorch_version }}
+          pip3 install torch==${{ matrix.pytorch_version }} --index-url https://download.pytorch.org/whl/cpu

      - name: Update requirements.txt
        run: |
@@ -67,7 +58,8 @@ jobs:

      - name: Install dependencies
        run: |
-          pip3 show torch
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging
          pip3 install --no-build-isolation -U -e .
          python scripts/unsloth_install.py | sh
          python scripts/cutcrossentropy_install.py | sh
@@ -81,15 +73,10 @@ jobs:
        run: |
          axolotl --help

-      - name: Pre-Download dataset fixture
-        run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-
      - name: Run tests
        run: |
-          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
-          pytest -v tests/patched/
-          pytest -v tests/cli/
+          pytest -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ tests/
+          pytest tests/patched/

      - name: cleanup pip cache
        run: |
@@ -147,7 +134,6 @@ jobs:
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
-          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
-          modal run cicd.e2e_tests
+          modal run cicd.tests
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -49,7 +49,7 @@ jobs:
      max-parallel: 2
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.4.1", "2.5.1", "2.6.0", "2.7.0"]
+        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
    timeout-minutes: 20

    steps:
@@ -63,7 +63,7 @@ jobs:
          path: |
            /home/runner/.cache/huggingface/hub/datasets--*
            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
+          key: ${{ runner.os }}-hf-hub-cache-${{ hashFiles('**/conftest.py') }}

      - name: Setup Python
        uses: actions/setup-python@v5
@@ -74,7 +74,7 @@ jobs:
      - name: upgrade pip
        run: |
          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
+          pip3 install --upgrade packaging setuptools wheel

      - name: Install PyTorch
        run: |
@@ -96,23 +96,10 @@ jobs:
        run: |
          axolotl --help

-      - name: Pre-Download dataset fixture
-        run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-
      - name: Run tests
        run: |
-          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ --cov=axolotl --cov-report=xml
-          pytest -v tests/patched/ --cov=axolotl --cov-append --cov-report=xml
-          pytest -v tests/cli/ --cov=axolotl --cov-append --cov-report=xml
-
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
-        with:
-          token: ${{ secrets.CODECOV_TOKEN }}
-          files: ./coverage.xml
-          flags: unittests,pytorch-${{ matrix.pytorch_version }}
-          fail_ci_if_error: false
+          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ tests/
+          pytest -v tests/patched/

      - name: cleanup pip cache
        run: |
@@ -149,7 +136,7 @@ jobs:
          path: |
            /home/runner/.cache/huggingface/hub/datasets--*
            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
+          key: ${{ runner.os }}-hf-hub-cache-${{ hashFiles('**/conftest.py') }}

      - name: Setup Python
        uses: actions/setup-python@v5
@@ -160,7 +147,7 @@ jobs:
      - name: upgrade pip
        run: |
          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 setuptools_scm build wheel
+          pip3 install --upgrade packaging setuptools setuptools_scm build wheel

      - name: Install PyTorch
        run: |
@@ -183,14 +170,10 @@ jobs:
        run: |
          axolotl --help

-      - name: Show HF cache
-        run: huggingface-cli scan-cache
-
      - name: Run tests
        run: |
-          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
+          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ tests/
          pytest -v tests/patched/
-          pytest -v tests/cli/

      - name: cleanup pip cache
        run: |
@@ -219,7 +202,7 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.6.0
+            pytorch: 2.5.1
            num_gpus: 1
            axolotl_extras: vllm
    steps:
@@ -242,10 +225,9 @@ jobs:
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
-          modal run cicd.e2e_tests
+          modal run cicd.tests

  docker-e2e-tests:
    if: github.repository_owner == 'axolotl-ai-cloud'
@@ -267,13 +249,7 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.5.1
-            num_gpus: 1
-            axolotl_extras: vllm
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.0
+            pytorch: 2.6.0
            num_gpus: 1
            axolotl_extras:
    steps:
@@ -296,7 +272,6 @@ jobs:
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
-          modal run cicd.e2e_tests
+          modal run cicd.tests
--- a/.gitignore
+++ b/.gitignore
@@ -181,10 +181,6 @@ prepared-datasets/
 submit.sh
 *.out*

-# Quartodoc generated files
-objects.json
-site_libs/
-
 typings/
 out/

--- a/.isort.cfg
+++ b/.isort.cfg
@@ -1,4 +1,3 @@
 [settings]
 profile=black
 known_third_party=wandb,comet_ml
-known_local_folder=src,tests
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,7 +3,7 @@ default_language_version:

 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
+    rev: v4.4.0
    hooks:
    -   id: check-yaml
    -   id: end-of-file-fixer
@@ -11,23 +11,23 @@ repos:
    -   id: no-commit-to-branch
        args: ['--branch', 'main']
 -   repo: https://github.com/psf/black
-    rev: 25.1.0
+    rev: 23.3.0
    hooks:
    -   id: black
 -   repo: https://github.com/pycqa/isort
-    rev: 6.0.1
+    rev: 5.12.0
    hooks:
      - id: isort
 -   repo: https://github.com/PyCQA/flake8
-    rev: 7.1.2
+    rev: 6.1.0
    hooks:
    - id: flake8
--   repo: https://github.com/pylint-dev/pylint
-    rev: v3.3.6
+-   repo: https://github.com/PyCQA/pylint
+    rev: v3.3.0
    hooks:
    - id: pylint
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.15.0
+    rev: v1.3.0
    hooks:
    - id: mypy
      additional_dependencies:
@@ -36,7 +36,7 @@ repos:
            'pydantic>=2.5.3',
        ]
 -   repo: https://github.com/PyCQA/bandit
-    rev: 1.8.3
+    rev: 1.7.5
    hooks:
    -   id: bandit
        args: [
--- a/1
+++ b/1
@@ -1 +0,0 @@
-docs.axolotl.ai
--- a/README.md
+++ b/README.md
@@ -9,7 +9,6 @@
 <p align="center">
    <img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests">
-    <a href="https://codecov.io/gh/axolotl-ai-cloud/axolotl"><img src="https://codecov.io/gh/axolotl-ai-cloud/axolotl/branch/main/graph/badge.svg" alt="codecov"></a>
    <a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a>
    <br/>
    <a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors"><img src="https://img.shields.io/github/contributors-anon/axolotl-ai-cloud/axolotl?color=yellow&style=flat-square" alt="contributors" style="height: 20px;"></a>
@@ -20,6 +19,9 @@
    <br/>
    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg" alt="tests-nightly">
    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
+    <a href="https://www.phorm.ai/query?projectId=e315ba4a-4e14-421f-ab05-38a1f9076f25">
+    <img alt="phorm.ai" src="https://img.shields.io/badge/Phorm-Ask_AI-%23F2777A.svg?&logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iNSIgaGVpZ2h0PSI0IiBmaWxsPSJub25lIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPgogIDxwYXRoIGQ9Ik00LjQzIDEuODgyYTEuNDQgMS40NCAwIDAgMS0uMDk4LjQyNmMtLjA1LjEyMy0uMTE1LjIzLS4xOTIuMzIyLS4wNzUuMDktLjE2LjE2NS0uMjU1LjIyNmExLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxMmMtLjA5OS4wMTItLjE5Mi4wMTQtLjI3OS4wMDZsLTEuNTkzLS4xNHYtLjQwNmgxLjY1OGMuMDkuMDAxLjE3LS4xNjkuMjQ2LS4xOTFhLjYwMy42MDMgMCAwIDAgLjItLjEwNi41MjkuNTI5IDAgMCAwIC4xMzgtLjE3LjY1NC42NTQgMCAwIDAgLjA2NS0uMjRsLjAyOC0uMzJhLjkzLjkzIDAgMCAwLS4wMzYtLjI0OS41NjcuNTY3IDAgMCAwLS4xMDMtLjIuNTAyLjUwMiAwIDAgMC0uMTY4LS4xMzguNjA4LjYwOCAwIDAgMC0uMjQtLjA2N0wyLjQzNy43MjkgMS42MjUuNjcxYS4zMjIuMzIyIDAgMCAwLS4yMzIuMDU4LjM3NS4zNzUgMCAwIDAtLjExNi4yMzJsLS4xMTYgMS40NS0uMDU4LjY5Ny0uMDU4Ljc1NEwuNzA1IDRsLS4zNTctLjA3OUwuNjAyLjkwNkMuNjE3LjcyNi42NjMuNTc0LjczOS40NTRhLjk1OC45NTggMCAwIDEgLjI3NC0uMjg1Ljk3MS45NzEgMCAwIDEgLjMzNy0uMTRjLjExOS0uMDI2LjIyNy0uMDM0LjMyNS0uMDI2TDMuMjMyLjE2Yy4xNTkuMDE0LjMzNi4wMy40NTkuMDgyYTEuMTczIDEuMTczIDAgMCAxIC41NDUuNDQ3Yy4wNi4wOTQuMTA5LjE5Mi4xNDQuMjkzYTEuMzkyIDEuMzkyIDAgMCAxIC4wNzguNThsLS4wMjkuMzJaIiBmaWxsPSIjRjI3NzdBIi8+CiAgPHBhdGggZD0iTTQuMDgyIDIuMDA3YTEuNDU1IDEuNDU1IDAgMCAxLS4wOTguNDI3Yy0uMDUuMTI0LS4xMTQuMjMyLS4xOTIuMzI0YTEuMTMgMS4xMyAwIDAgMS0uMjU0LjIyNyAxLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxNGMtLjEuMDEyLS4xOTMuMDE0LS4yOC4wMDZsLTEuNTYtLjEwOC4wMzQtLjQwNi4wMy0uMzQ4IDEuNTU5LjE1NGMuMDkgMCAuMTczLS4wMS4yNDgtLjAzM2EuNjAzLjYwMyAwIDAgMCAuMi0uMTA2LjUzMi41MzIgMCAwIDAgLjEzOS0uMTcyLjY2LjY2IDAgMCAwIC4wNjQtLjI0MWwuMDI5LS4zMjFhLjk0Ljk0IDAgMCAwLS4wMzYtLjI1LjU3LjU3IDAgMCAwLS4xMDMtLjIwMi41MDIuNTAyIDAgMCAwLS4xNjgtLjEzOC42MDUuNjA1IDAgMCAwLS4yNC0uMDY3TDEuMjczLjgyN2MtLjA5NC0uMDA4LS4xNjguMDEtLjIyMS4wNTUtLjA1My4wNDUtLjA4NC4xMTQtLjA5Mi4yMDZMLjcwNSA0IDAgMy45MzhsLjI1NS0yLjkxMUExLjAxIDEuMDEgMCAwIDEgLjM5My41NzIuOTYyLjk2MiAwIDAgMSAuNjY2LjI4NmEuOTcuOTcgMCAwIDEgLjMzOC0uMTRDMS4xMjIuMTIgMS4yMy4xMSAxLjMyOC4xMTlsMS41OTMuMTRjLjE2LjAxNC4zLjA0Ny40MjMuMW
ExLjE3IDEuMTcgMCAwIDEgLjU0NS40NDhjLjA2MS4wOTUuMTA5LjE5My4xNDQuMjk1YTEuNDA2IDEuNDA2IDAgMCAxIC4wNzcuNTgzbC0uMDI4LjMyMloiIGZpbGw9IndoaXRlIi8+CiAgPHBhdGggZD0iTTQuMDgyIDIuMDA3YTEuNDU1IDEuNDU1IDAgMCAxLS4wOTguNDI3Yy0uMDUuMTI0LS4xMTQuMjMyLS4xOTIuMzI0YTEuMTMgMS4xMyAwIDAgMS0uMjU0LjIyNyAxLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxNGMtLjEuMDEyLS4xOTMuMDE0LS4yOC4wMDZsLTEuNTYtLjEwOC4wMzQtLjQwNi4wMy0uMzQ4IDEuNTU5LjE1NGMuMDkgMCAuMTczLS4wMS4yNDgtLjAzM2EuNjAzLjYwMyAwIDAgMCAuMi0uMTA2LjUzMi41MzIgMCAwIDAgLjEzOS0uMTcyLjY2LjY2IDAgMCAwIC4wNjQtLjI0MWwuMDI5LS4zMjFhLjk0Ljk0IDAgMCAwLS4wMzYtLjI1LjU3LjU3IDAgMCAwLS4xMDMtLjIwMi41MDIuNTAyIDAgMCAwLS4xNjgtLjEzOC42MDUuNjA1IDAgMCAwLS4yNC0uMDY3TDEuMjczLjgyN2MtLjA5NC0uMDA4LS4xNjguMDEtLjIyMS4wNTUtLjA1My4wNDUtLjA4NC4xMTQtLjA5Mi4yMDZMLjcwNSA0IDAgMy45MzhsLjI1NS0yLjkxMUExLjAxIDEuMDEgMCAwIDEgLjM5My41NzIuOTYyLjk2MiAwIDAgMSAuNjY2LjI4NmEuOTcuOTcgMCAwIDEgLjMzOC0uMTRDMS4xMjIuMTIgMS4yMy4xMSAxLjMyOC4xMTlsMS41OTMuMTRjLjE2LjAxNC4zLjA0Ny40MjMuMWExLjE3IDEuMTcgMCAwIDEgLjU0NS40NDhjLjA2MS4wOTUuMTA5LjE5My4xNDQuMjk1YTEuNDA2IDEuNDA2IDAgMCAxIC4wNzcuNTgzbC0uMDI4LjMyMloiIGZpbGw9IndoaXRlIi8+Cjwvc3ZnPgo=">
+  </a>
 </p>

 Axolotl is a tool designed to streamline post-training for various AI models.
@@ -56,7 +58,6 @@ Features:
 ### Installation

 ```bash
-pip3 install -U packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]

 # Download example axolotl configs, deepspeed configs
@@ -64,7 +65,7 @@ axolotl fetch examples
 axolotl fetch deepspeed_configs  # OPTIONAL
 ```

-Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).
+Other installation approaches are described [here](https://axolotl-ai-cloud.github.io/axolotl/docs/installation.html).

 ### Your First Fine-tune

@@ -79,7 +80,7 @@ axolotl fetch examples --dest path/to/folder
 axolotl train examples/llama-3/lora-1b.yml
 ```

-That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough.
+That's it! Check out our [Getting Started Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/getting-started.html) for a more detailed walkthrough.

 ## ✨ Key Features

@@ -92,20 +93,19 @@ That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/ge

 ## 📚 Documentation

-- [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments
-- [Configuration Guide](https://docs.axolotl.ai/docs/config.html) - Full configuration options and examples
-- [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them
-- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
-- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
-- [Multipacking](https://docs.axolotl.ai/docs/multipack.html)
-- [API Reference](https://docs.axolotl.ai/docs/api/) - Auto-generated code documentation
-- [FAQ](https://docs.axolotl.ai/docs/faq.html) - Frequently asked questions
+- [Installation Options](https://axolotl-ai-cloud.github.io/axolotl/docs/installation.html) - Detailed setup instructions for different environments
+- [Configuration Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html) - Full configuration options and examples
+- [Dataset Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/) - Supported formats and how to use them
+- [Multi-GPU Training](https://axolotl-ai-cloud.github.io/axolotl/docs/multi-gpu.html)
+- [Multi-Node Training](https://axolotl-ai-cloud.github.io/axolotl/docs/multi-node.html)
+- [Multipacking](https://axolotl-ai-cloud.github.io/axolotl/docs/multipack.html)
+- [FAQ](https://axolotl-ai-cloud.github.io/axolotl/docs/faq.html) - Frequently asked questions

 ## 🤝 Getting Help

 - Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support
 - Check out our [Examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/) directory
-- Read our [Debugging Guide](https://docs.axolotl.ai/docs/debugging.html)
+- Read our [Debugging Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/debugging.html)
 - Need dedicated support? Please contact [✉️wing@axolotl.ai](mailto:wing@axolotl.ai) for options

 ## 🌟 Contributing
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -1,180 +1,6 @@
 project:
  type: website

-quartodoc:
-  dir: docs/api
-  package: axolotl
-  title: API Reference
-  parser: google
-
-  sections:
-    - title: Core
-      desc: Core functionality for training
-      contents:
-        - train
-        - evaluate
-        - datasets
-        - convert
-        - prompt_tokenizers
-        - logging_config
-        - core.trainer_builder
-        - core.training_args
-        - core.chat.messages
-        - core.chat.format.chatml
-        - core.chat.format.llama3x
-        - core.chat.format.shared
-        - core.datasets.chat
-        - core.datasets.transforms.chat_builder
-    - title: CLI
-      desc: Command-line interface
-      contents:
-        - cli.main
-        - cli.train
-        - cli.evaluate
-        - cli.args
-        - cli.checks
-        - cli.config
-        - cli.inference
-        - cli.merge_lora
-        - cli.merge_sharded_fsdp_weights
-        - cli.preprocess
-        - cli.sweeps
-        - cli.utils
-        - cli.vllm_serve
-        - cli.cloud.base
-        - cli.cloud.modal_
-    - title: Trainers
-      desc: Training implementations
-      contents:
-        - core.trainers.base
-        - core.trainers.trl
-        - core.trainers.dpo.trainer
-        - core.trainers.grpo.trainer
-    - title: Prompt Strategies
-      desc: Prompt formatting strategies
-      contents:
-        - prompt_strategies.base
-        - prompt_strategies.chat_template
-        - prompt_strategies.alpaca_chat
-        - prompt_strategies.alpaca_instruct
-        - prompt_strategies.alpaca_w_system
-        - prompt_strategies.user_defined
-        - prompt_strategies.llama2_chat
-        - prompt_strategies.completion
-        - prompt_strategies.input_output
-        - prompt_strategies.stepwise_supervised
-        - prompt_strategies.metharme
-        - prompt_strategies.orcamini
-        - prompt_strategies.pygmalion
-        - prompt_strategies.messages.chat
-        - prompt_strategies.dpo.chat_template
-        - prompt_strategies.dpo.llama3
-        - prompt_strategies.dpo.chatml
-        - prompt_strategies.dpo.zephyr
-        - prompt_strategies.dpo.user_defined
-        - prompt_strategies.dpo.passthrough
-        - prompt_strategies.kto.llama3
-        - prompt_strategies.kto.chatml
-        - prompt_strategies.kto.user_defined
-        - prompt_strategies.orpo.chat_template
-        - prompt_strategies.bradley_terry.llama3
-    - title: Kernels
-      desc: Low-level performance optimizations
-      contents:
-        - kernels.lora
-        - kernels.geglu
-        - kernels.swiglu
-        - kernels.quantize
-        - kernels.utils
-    - title: MonkeyPatches
-      desc: Runtime patches for model optimizations
-      contents:
-        - monkeypatch.llama_attn_hijack_flash
-        - monkeypatch.llama_attn_hijack_xformers
-        - monkeypatch.mistral_attn_hijack_flash
-        - monkeypatch.multipack
-        - monkeypatch.relora
-        - monkeypatch.llama_expand_mask
-        - monkeypatch.lora_kernels
-        - monkeypatch.utils
-        - monkeypatch.btlm_attn_hijack_flash
-        - monkeypatch.llama_patch_multipack
-        - monkeypatch.stablelm_attn_hijack_flash
-        - monkeypatch.trainer_fsdp_optim
-        - monkeypatch.transformers_fa_utils
-        - monkeypatch.unsloth_
-        - monkeypatch.attention.mllama
-        - monkeypatch.data.batch_dataset_fetcher
-        - monkeypatch.mixtral
-    - title: Utils
-      desc: Utility functions
-      contents:
-        - utils.models
-        - utils.tokenization
-        - utils.chat_templates
-        - utils.lora
-        - utils.lora_embeddings
-        - utils.model_shard_quant
-        - utils.bench
-        - utils.freeze
-        - utils.trainer
-        - utils.schedulers
-        - utils.distributed
-        - utils.dict
-        - utils.optimizers.adopt
-        - utils.data.pretraining
-        - utils.data.sft
-        - utils.gradient_checkpointing.unsloth
-    - title: Schemas
-      desc: Pydantic data models for Axolotl config
-      contents:
-        - utils.schemas.config
-        - utils.schemas.model
-        - utils.schemas.training
-        - utils.schemas.datasets
-        - utils.schemas.peft
-        - utils.schemas.trl
-        - utils.schemas.multimodal
-        - utils.schemas.integrations
-        - utils.schemas.enums
-        - utils.schemas.utils
-    - title: Integrations
-      desc: Third-party integrations and extensions
-      contents:
-        - integrations.base
-        - integrations.cut_cross_entropy.args
-        - integrations.grokfast.optimizer
-        - integrations.kd.trainer
-        - integrations.liger.args
-        - integrations.lm_eval.args
-        - integrations.spectrum.args
-    - title: Common
-      desc: Common utilities and shared functionality
-      contents:
-        - common.architectures
-        - common.const
-        - common.datasets
-    - title: Models
-      desc: Custom model implementations
-      contents:
-        - models.mamba.modeling_mamba
-    - title: Data Processing
-      desc: Data processing utilities
-      contents:
-        - utils.collators.core
-        - utils.collators.batching
-        - utils.collators.mamba
-        - utils.collators.mm_chat
-        - utils.samplers.multipack
-    - title: Callbacks
-      desc: Training callbacks
-      contents:
-        - utils.callbacks.perplexity
-        - utils.callbacks.profiler
-        - utils.callbacks.lisa
-        - utils.callbacks.mlflow_
-        - utils.callbacks.comet_
-
 website:
  title: "Axolotl"
  description: "We make fine-tuning accessible, scalable, and fun"
@@ -206,18 +32,14 @@ website:
          contents:
            - docs/getting-started.qmd
            - docs/installation.qmd
-            - docs/inference.qmd
            - docs/cli.qmd
-            - docs/config.qmd
-            - text: "API Reference"
-              href: docs/api
+            - docs/inference.qmd

        - section: "Dataset Formats"
          contents: docs/dataset-formats/*

        - section: "Deployments"
          contents:
-            - docs/docker.qmd
            - docs/multi-gpu.qmd
            - docs/multi-node.qmd
            - docs/ray-integration.qmd
@@ -231,7 +53,6 @@ website:
            - docs/reward_modelling.qmd
            - docs/lr_groups.qmd
            - docs/lora_optims.qmd
-            - docs/dataset_loading.qmd

        - section: "Core Concepts"
          contents:
@@ -245,7 +66,6 @@ website:
            - docs/unsloth.qmd
            - docs/torchao.qmd
            - docs/custom_integrations.qmd
-            - docs/sequence_parallelism.qmd

        - section: "Troubleshooting"
          contents:
@@ -253,27 +73,12 @@ website:
            - docs/debugging.qmd
            - docs/nccl.qmd

+        - section: "Reference"
+          contents:
+            - docs/config.qmd
+
 format:
  html:
    theme: darkly
    css: styles.css
    toc: true
-    # Enable better handling of line breaks in markdown
-    preserve-tabs: true
-    html-math-method: mathjax
-    # Improved markdown processing options
-    md-extensions:
-      - markdown_it
-      - def_list
-      - attr_list
-      - fenced_divs
-      - tables
-      - html_admonition
-      - lineblocks
-      - fancy_lists
-    # Control whitespace handling
-    whitespace: preserve
-    # Process newlines in paragraphs
-    wrap: preserve
-    # Better line break handling
-    preserve-linebreaks: true
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -31,11 +31,10 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
    fi

-RUN pip install packaging==23.2 setuptools==75.8.0
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
    fi

 RUN python scripts/unsloth_install.py | sh
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -3,53 +3,9 @@ set -e

 python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"

-# Run unit tests with initial coverage report
-pytest -v --durations=10 -n8 \
-  --ignore=tests/e2e/ \
-  --ignore=tests/patched/ \
-  --ignore=tests/cli \
-  /workspace/axolotl/tests/ \
-  --cov=axolotl
-
-# Run lora kernels tests with coverage append
-pytest -v --durations=10 \
-  /workspace/axolotl/tests/e2e/patched/lora_kernels \
-  --cov=axolotl \
-  --cov-append
-
-# Run patched tests excluding lora kernels with coverage append
-pytest -v --durations=10 \
-  --ignore=tests/e2e/patched/lora_kernels \
-  /workspace/axolotl/tests/e2e/patched \
-  --cov=axolotl \
-  --cov-append
-
-# Run solo tests with coverage append
-pytest -v --durations=10 -n1 \
-  /workspace/axolotl/tests/e2e/solo/ \
-  --cov=axolotl \
-  --cov-append
-
-# Run integration tests with coverage append
-pytest -v --durations=10 \
-  /workspace/axolotl/tests/e2e/integrations/ \
-  --cov=axolotl \
-  --cov-append
-
-pytest -v --durations=10 /workspace/axolotl/tests/cli \
-  --cov=axolotl \
-  --cov-append
-
-# Run remaining e2e tests with coverage append and final report
-pytest -v --durations=10 \
-  --ignore=tests/e2e/solo/ \
-  --ignore=tests/e2e/patched/ \
-  --ignore=tests/e2e/multigpu/ \
-  --ignore=tests/e2e/integrations/ \
-  --ignore=tests/cli \
-  /workspace/axolotl/tests/e2e/ \
-  --cov=axolotl \
-  --cov-append \
-  --cov-report=xml:e2e-coverage.xml
-
-codecov upload-process -t $CODECOV_TOKEN -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION}
+pytest -v --durations=10 -n8 --ignore=tests/e2e/ --ignore=tests/patched/ /workspace/axolotl/tests/
+pytest -v --durations=10 /workspace/axolotl/tests/e2e/patched/lora_kernels  # running these with the other patches causes a failure
+pytest -v --durations=10 --ignore=tests/e2e/patched/lora_kernels /workspace/axolotl/tests/e2e/patched
+pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/solo/
+pytest -v --durations=10 /workspace/axolotl/tests/e2e/integrations/
+pytest -v --durations=10 --ignore=tests/e2e/solo/ --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -1,7 +1,6 @@
 """
-modal application to run axolotl gpu tests in Modal
-"""
-
+ modal application to run axolotl gpu tests in Modal
+ """
 # pylint: disable=duplicate-code

 import os
@@ -29,7 +28,6 @@ df_args = {
    "CUDA": os.environ.get("CUDA", "121"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
-    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
    "HF_HOME": "/workspace/data/huggingface-cache/hub",
 }

@@ -69,7 +67,7 @@ def run_cmd(cmd: str, run_folder: str):
@app.function(
    image=cicd_image,
    gpu=GPU_CONFIG,
-    timeout=90 * 60,
+    timeout=60 * 60,
    cpu=8.0,
    memory=131072 * N_GPUS,
    volumes=VOLUME_CONFIG,
--- a/cicd/multigpu.sh
+++ b/cicd/multigpu.sh
@@ -1,23 +1,5 @@
 #!/bin/bash
 set -e

-# Only run two tests at a time to avoid OOM on GPU (with coverage collection)
-pytest -v -n2 \
-  --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
-  --ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
-  /workspace/axolotl/tests/e2e/multigpu/ \
-  --cov=axolotl
-
-# Run solo tests with coverage append
-pytest -v --durations=10 -n1 \
-  /workspace/axolotl/tests/e2e/multigpu/solo/ \
-  --cov=axolotl \
-  --cov-append
-
-pytest -v  --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \
-  --cov=axolotl \
-  --cov-append \
-  --cov-report=xml:multigpu-coverage.xml
-
-# Upload coverage to Codecov
-codecov upload-process -t $CODECOV_TOKEN -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION}
+# only run two tests at a time (-n2) so as not to OOM the GPU
+pytest -v -n2 /workspace/axolotl/tests/e2e/multigpu/
--- a/cicd/e2e_tests.py
+++ b/cicd/e2e_tests.py
@@ -1,5 +1,4 @@
 """Modal app to run axolotl GPU tests"""
-
 # pylint: disable=duplicate-code

 import os
@@ -28,7 +27,6 @@ df_args = {
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
-    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
    "HF_HOME": "/workspace/data/huggingface-cache/hub",
 }

--- a/codecov.yml
+++ b/codecov.yml
@@ -1,56 +0,0 @@
-codecov:
-  require_ci_to_pass: yes
-  notify:
-    wait_for_ci: true
-
-coverage:
-  precision: 2
-  round: down
-  range: "70...100"
-  status:
-    project:
-      default:
-        # basic
-        target: auto
-        threshold: 0%
-        base: auto
-        # advanced
-        branches: null
-        if_no_uploads: error
-        if_not_found: success
-        if_ci_failed: error
-        only_pulls: false
-        flags: null
-        paths: null
-    patch:
-      default:
-        # basic
-        target: auto
-        threshold: 0%
-        base: auto
-        # advanced
-        branches: null
-        if_no_uploads: error
-        if_not_found: success
-        if_ci_failed: error
-        only_pulls: false
-        flags: null
-        paths: null
-
-parsers:
-  gcov:
-    branch_detection:
-      conditional: yes
-      loop: yes
-      method: no
-      macro: no
-
-comment:
-  layout: "reach,diff,flags,files,footer"
-  behavior: default
-  require_changes: no
-  require_base: no
-  require_head: yes
-
-github_checks:
-  annotations: false
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -20,9 +20,9 @@ WORKDIR /workspace/axolotl

 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
    fi

 RUN python scripts/unsloth_install.py | sh
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -28,8 +28,8 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

 WORKDIR /workspace

-RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
-    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
+RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
+    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"

@@ -37,7 +37,3 @@ RUN git lfs install --skip-repo && \
    pip3 install awscli && \
    # The base image ships with `pydantic==1.8.2` which is not working
    pip3 install -U --no-cache-dir pydantic==1.10.10
-
-RUN if [ "$PYTORCH_VERSION" = "2.7.0" ] ; then \
-        pip3 install flash-attn==2.7.4.post1; \
-    fi
--- a/docker/Dockerfile-base-next
+++ b/docker/Dockerfile-base-next
@@ -1,38 +0,0 @@
-ARG CUDA_VERSION="12.8.1"
-ARG CUDNN_VERSION="8"
-ARG UBUNTU_VERSION="22.04"
-ARG MAX_JOBS=4
-
-FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
-
-ENV PATH="/root/miniconda3/bin:${PATH}"
-
-ARG PYTHON_VERSION="3.11"
-ARG PYTORCH_VERSION="next"
-ARG CUDA="128"
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
-
-ENV PYTHON_VERSION=$PYTHON_VERSION
-ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
-
-RUN apt-get update \
-    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
-    && wget \
-    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
-    && mkdir /root/.conda \
-    && bash Miniconda3-latest-Linux-x86_64.sh -b \
-    && rm -f Miniconda3-latest-Linux-x86_64.sh \
-    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
-
-ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
-
-WORKDIR /workspace
-
-RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
-    python3 -m pip install --no-cache-dir -U torch==2.7.0 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
-    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
-    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
-
-RUN git lfs install --skip-repo && \
-    pip3 install awscli && \
-    pip3 install -U --no-cache-dir pydantic==2.10.6
--- a/docker/Dockerfile-base-nightly
+++ b/docker/Dockerfile-base-nightly
@@ -1,39 +0,0 @@
-ARG CUDA_VERSION="12.8.1"
-ARG CUDNN_VERSION="8"
-ARG UBUNTU_VERSION="22.04"
-ARG MAX_JOBS=4
-
-FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
-
-ENV PATH="/root/miniconda3/bin:${PATH}"
-
-ARG PYTHON_VERSION="3.11"
-ARG PYTORCH_VERSION="nightly"
-ARG CUDA="128"
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
-
-ENV PYTHON_VERSION=$PYTHON_VERSION
-ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
-
-RUN apt-get update \
-    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
-    && wget \
-    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
-    && mkdir /root/.conda \
-    && bash Miniconda3-latest-Linux-x86_64.sh -b \
-    && rm -f Miniconda3-latest-Linux-x86_64.sh \
-    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
-
-ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
-
-WORKDIR /workspace
-
-RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
-    python3 -m pip install --no-cache-dir -U torch --extra-index-url https://download.pytorch.org/whl/nightly/cu$CUDA && \
-    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
-    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
-
-RUN git lfs install --skip-repo && \
-    pip3 install awscli && \
-    # The base image ships with `pydantic==1.8.2` which is not working
-    pip3 install -U --no-cache-dir pydantic==1.10.10
--- a/docker/Dockerfile-cloud
+++ b/docker/Dockerfile-cloud
@@ -14,7 +14,7 @@ COPY scripts/motd /etc/motd

 RUN pip install jupyterlab notebook ipywidgets && \
    jupyter lab clean
-RUN apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \
+RUN apt install --yes --no-install-recommends openssh-server tmux && \
    mkdir -p ~/.ssh && \
    chmod 700 ~/.ssh && \
    printf "\n[[ -z \"\$TMUX\"  ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1,4 +1,2 @@
 /.quarto/
 _site/
-/api/*.qmd
-/api/*.html
--- a/docs/cli.qmd
+++ b/docs/cli.qmd
@@ -1,5 +1,5 @@
 ---
-title: "Command Line Interface (CLI)"
+title: "CLI Reference"
 format:
  html:
    toc: true
@@ -170,7 +170,7 @@ axolotl merge-sharded-fsdp-weights config.yml

 ### evaluate

-Evaluates a model's performance (loss etc) on the train and eval datasets.
+Evaluates a model's performance using metrics specified in the config.

 ```bash
 # Basic evaluation
@@ -197,19 +197,6 @@ lm_eval_batch_size: # Batch size for evaluation
 output_dir: # Directory to save evaluation results
 ```

-See [LM Eval Harness](https://github.com/EleutherAI/lm-evaluation-harness) for more details.
-
-### delinearize-llama4
-
-Delinearizes a Llama 4 linearized model into a regular HuggingFace Llama 4 model. This only works with the non-quantized linearized model.
-
-```bash
-axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir
-```
-
-This would be necessary to use with other frameworks. If you have an adapter, merge it with the non-quantized linearized model before delinearizing.
-
-
 ## Legacy CLI Usage

 While the new Click-based CLI is preferred, Axolotl still supports the legacy module-based CLI:
@@ -248,7 +235,7 @@ Create a cloud config YAML with your Modal settings:
 ```yaml
 # cloud_config.yml
 provider: modal
-gpu: a100       # Supported: l40s, a100-40gb, a100-80gb, a10g, h100, t4, l4
+gpu: a100  # Supported: l40s, a100-40gb, a100-80gb, a10g, h100, t4, l4
 gpu_count: 1    # Number of GPUs to use
 timeout: 86400  # Maximum runtime in seconds (24 hours)
 branch: main    # Git branch to use (optional)
@@ -261,7 +248,7 @@ volumes:        # Persistent storage volumes
  - name: axolotl-artifacts
    mount: /workspace/artifacts

-secrets:        # Secrets to inject
+env:            # Environment variables
  - WANDB_API_KEY
  - HF_TOKEN
 ```
@@ -287,27 +274,15 @@ axolotl lm-eval config.yml --cloud cloud_config.yml
 ### Cloud Configuration Options

 ```yaml
-provider:    # compute provider, currently only `modal` is supported
-gpu:         # GPU type to use
-gpu_count:   # Number of GPUs (default: 1)
-memory:      # RAM in GB (default: 128)
-timeout:     # Maximum runtime in seconds
+provider: # compute provider, currently only `modal` is supported
+gpu: # GPU type to use
+gpu_count: # Number of GPUs (default: 1)
+memory: # RAM in GB (default: 128)
+timeout: # Maximum runtime in seconds
 timeout_preprocess: # Preprocessing timeout
-branch:      # Git branch to use
-docker_tag:  # Custom Docker image tag
-volumes:     # List of persistent storage volumes
-
-# Environment variables to pass. Can be specified in two ways:
-# 1. As a string: Will load the value from the host computer's environment variables
-# 2. As a key-value pair: Will use the specified value directly
-# Example:
-# env:
-#   - CUSTOM_VAR  # Loads from host's $CUSTOM_VAR
-#   - {CUSTOM_VAR: "value"}  # Uses "value" directly
-env:
-
-# Secrets to inject. Same input format as `env` but for sensitive data.
-secrets:
-  # - HF_TOKEN
-  # - WANDB_API_KEY
+branch: # Git branch to use
+docker_tag: # Custom Docker image tag
+volumes: # List of persistent storage volumes
+env: # Environment variables to pass
+secrets: # Secrets to inject
 ```
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -1,5 +1,5 @@
 ---
-title: Config Reference
+title: Config options
 description: A complete list of all configuration options.
 ---

@@ -30,11 +30,6 @@ tokenizer_legacy:
 # Resize the model embeddings when new tokens are added to multiples of 32
 # This is reported to improve training speed on some models
 resize_token_embeddings_to_32x:
-# Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.
-shrink_embeddings:
-# Whether to load the model with randomly initialized weights. Useful for
-# pre-training a model from scratch or debugging purposes.
-random_init_weights:

 # (Internal use only)
 # Used to identify which the model is based on
@@ -88,12 +83,6 @@ gpu_memory_limit: 20GiB
 # Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
 lora_on_cpu: true

-# List[str]. Add plugins to extend the pipeline.
-# See `src/axolotl/integrations` for the available plugins or doc below for more details.
-# https://docs.axolotl.ai/docs/custom_integrations.html
-plugins:
-  # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
 # A list of one or more datasets to finetune the model with
 datasets:
  # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
@@ -109,7 +98,7 @@ datasets:
    preprocess_shards: # Optional[int] process dataset in N sequential chunks for memory efficiency (exclusive with `shards`)

    name: # Optional[str] name of dataset configuration to load
-    split: train # Optional[str] name of dataset split to load from
+    train_on_split: train # Optional[str] name of dataset split to load from
    revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.
    trust_remote_code: # Optional[bool] Trust remote code for untrusted source

@@ -165,21 +154,15 @@ datasets:
      content: value
      # ...

-    # Optional[Dict[str, List]]. Roles mapping in the messages.
-    # The format is {target_role: [source_roles]}. All source roles will be mapped to the target role.
-    # The default is:
+    message_property_mappings:
+
+    # Optional[Dict[str, List]]. Roles mapping in the messages. The default is:
    roles:
      user: ["human", "user"]
      assistant: ["gpt", "assistant"]
      system: ["system"]
      tool: ["tool"]

-    # Optional[bool]. Whether to drop the system turn from the dataset. Only works with chat_template.
-    # This does not drop the default system message from chat_template if it exists. If you wish to,
-    # we recommend using a custom jinja template with the default system message removed or
-    # adding a system turn with empty content.
-    drop_system_message:
-
    # IMPORTANT: The following fields determine which parts of the conversation to train on.
    # Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train
    # See examples at `docs/dataset-formats/conversation.qmd`
@@ -218,46 +201,10 @@ test_datasets:
    data_files:
      - /workspace/data/eval.jsonl

-# use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'
+# use RL training: 'dpo', 'ipo', 'kto'
 rl:
-rl_beta:  # Optional[float]. The beta parameter for the RL training.
-
-# dpo
-dpo_use_weighting:  # Optional[bool]. Whether to perform weighting.
-rpo_alpha: # Optional[float]. Weighting of NLL term in loss from RPO paper.
-
-# orpo
-orpo_alpha: 0.1  # Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to `beta` in `ORPOConfig` due to trl mapping.
-
-# kto
-kto_desirable_weight: # Optional[float]. Factor for desirable loss term in KTO loss.
-kto_undesirable_weight: # Optional[float]. Factor for undesirable loss term in KTO loss.
-
-# simpo
-cpo_alpha: 1.0  # Weight of the BC regularizer
-simpo_gamma: 0.5  # Target reward margin for the SimPO loss
-
-# grpo
-trl:
-  use_vllm: # Optional[bool]. Whether to use VLLM for RL training.
-  vllm_server_host: # Optional[str]. Host of the vLLM server to connect to.
-  vllm_server_port: # Optional[int]. Port of the vLLM server to connect to.
-  vllm_server_timeout: # Optional[int]. Total timeout (in seconds) to wait for the vLLM server to respond.
-  vllm_guided_decoding_regex: # Optional[str]. Regex for vLLM guided decoding.
-
-  beta: # Optional[float]. Beta parameter for the RL training. Same as `rl_beta`. Use
-  max_completion_length: # Optional[int]. Maximum length of the completion for RL training.
-
-  reward_funcs: # Optional[list[str]]. List of reward functions to load. Paths must be importable from current dir.
-  reward_weights: # Optional[list[float]]. List of reward weights for the reward functions.
-
-  num_generations: # Optional[int]. Number of generations to sample.
-  log_completions: # Optional[bool]. Whether to log completions.
-
-  sync_ref_model: # Optional[bool]. Whether to sync the reference model.
-  ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.
-  ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.
-
+# whether to perform weighting if doing DPO training. Boolean.
+dpo_use_weighting:

 # reward modelling: `True` or `False`
 reward_model:
@@ -275,13 +222,13 @@ process_reward_model:
 chat_template: tokenizer_default
 # custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.
 chat_template_jinja: null
-# Changes the default system message. Currently only supports chatml.
-default_system_message: You are a helpful assistant. Please give a long and detailed answer.
+# Changes the default system message
+default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
 # Axolotl attempts to save the dataset as an arrow after packing the data together so
 # subsequent training attempts load faster, relative path
 dataset_prepared_path: data/last_run_prepared
 # Push prepared dataset to hub
-push_dataset_to_hub: # Optional[str] repo_org/repo_name
+push_dataset_to_hub: # repo path
 # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
 # if not set.
 dataset_processes: # defaults to os.cpu_count() if not set
@@ -322,13 +269,9 @@ total_num_tokens:
 sample_packing_group_size: 100000
 # The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples.
 sample_packing_bin_size: 200
-sample_pack_sequentially: # Optional[bool]. Whether to pack samples sequentially.
-
 # whether to concatenate samples during pretraining
 pretraining_sample_concatenation:

-curriculum_sampling: # Optional[bool]. Whether to use sequential sampling for curriculum learning
-
 # Use batch flattening for speedups when not using sample_packing
 batch_flattening:

@@ -360,27 +303,7 @@ lora_target_modules:
 #  - down_proj
 #  - up_proj
 lora_target_linear: # If true, will target all linear modules
-
-# List[int] | int. # The layer indices to transform, otherwise, apply to all layers
-# https://huggingface.co/docs/peft/v0.15.0/en/package_reference/lora#peft.LoraConfig.layers_to_transform
-peft_layers_to_transform:
-
-# Optional[bool]. Whether to use DoRA.
-# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#weight-decomposed-low-rank-adaptation-dora
-peft_use_dora:
-
-# Optional[bool]. Whether to use RSLoRA.
-# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#rank-stabilized-lora
-peft_use_rslora:
-
-# Optional[list[tuple[int, int]]]. List of layer indices to replicate.
-# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#memory-efficient-layer-replication-with-lora
-peft_layer_replication:
-
-# bool | Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "corda", "loftq"]
-# How to initialize LoRA weights. Default to True which is MS original implementation.
-# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#initialization
-peft_init_lora_weights:
+peft_layers_to_transform: # The layer indices to transform; if unset, applies to all layers

 # If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
 # For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
@@ -394,7 +317,7 @@ lora_fan_in_fan_out: false

 # Apply custom LoRA autograd functions and activation function Triton kernels for
 # speed and memory savings
-# See: https://docs.axolotl.ai/docs/lora_optims.html
+# See: https://axolotl-ai-cloud.github.io/axolotl/docs/lora_optims.html
 lora_mlp_kernel: true
 lora_qkv_kernel: true
 lora_o_kernel: true
@@ -492,7 +415,6 @@ auto_find_batch_size: # Optional[bool]

 eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
 eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
-do_causal_lm_eval: # Whether to run causal language model evaluation for metrics in `eval_causal_lm_metrics`.
 eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", "chrf", "perplexity"]

 profiler_steps: # enable the pytorch profiler to capture the first N steps of training to the output_dir.
@@ -512,8 +434,7 @@ train_on_inputs: false
 # Note that training loss may have an oscillating pattern with this enabled.
 group_by_length: false

-# Whether to use gradient checkpointing. Available options are: true, false, "offload".
-# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
+# Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
 gradient_checkpointing: false
 # additional kwargs to pass to the trainer for gradient checkpointing
 # gradient_checkpointing_kwargs:
@@ -524,7 +445,7 @@ gradient_checkpointing: false
 early_stopping_patience: 3

 # Specify a scheduler and kwargs to use with the optimizer
-lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | empty for cosine
+lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
 lr_scheduler_kwargs:
 cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
 cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
@@ -534,58 +455,36 @@ lr_div_factor: # Learning rate div factor

 # Specify optimizer
 # Valid values are driven by the Transformers OptimizerNames class, see:
-# https://github.com/huggingface/transformers/blob/cbf924b76c03828101a34069a96d209314114fd5/src/transformers/training_args.py#L144-L189
+# https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
 #
 # Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
 # torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
 # in the examples/ for your model and fine-tuning use case.
 #
 # Valid values for 'optimizer' include:
+# - adamw_hf
 # - adamw_torch
 # - adamw_torch_fused
 # - adamw_torch_xla
-# - adamw_torch_npu_fused
 # - adamw_apex_fused
-# - adopt_adamw  (an EXPERIMENTAL optimizer, only for torch version >= 2.5.1)
+# - adopt_adamw (an EXPERIMENTAL optimizer, only for torch version >= 2.5.1)
 # - adafactor
 # - adamw_anyprecision
-# - adamw_torch_4bit
-# - ademamix
 # - sgd
 # - adagrad
 # - adamw_bnb_8bit
-# - adamw_8bit   # alias for adamw_bnb_8bit
-# - ademamix_8bit
 # - lion_8bit
 # - lion_32bit
 # - paged_adamw_32bit
 # - paged_adamw_8bit
-# - paged_ademamix_32bit
-# - paged_ademamix_8bit
 # - paged_lion_32bit
 # - paged_lion_8bit
-# - rmsprop
-# - rmsprop_bnb
-# - rmsprop_bnb_8bit
-# - rmsprop_bnb_32bit
 # - galore_adamw
 # - galore_adamw_8bit
 # - galore_adafactor
 # - galore_adamw_layerwise
 # - galore_adamw_8bit_layerwise
 # - galore_adafactor_layerwise
-# - lomo
-# - adalomo
-# - grokadamw
-# - schedule_free_adamw
-# - schedule_free_sgd
-# - apollo_adamw
-# - apollo_adamw_layerwise
-#
-# Additional custom optimizers include:
-# - optimi_adamw
-# - ao_adamw_8bit
-# - ao_adamw_fp8
 optimizer:
 # Dictionary of arguments to pass to the optimizer
 optim_args:
@@ -614,42 +513,27 @@ max_grad_norm:
 # currently only supported on Llama and Mistral
 neftune_noise_alpha:

-# Optional[bool]. Whether to bettertransformers
+# Whether to use bettertransformers
 flash_optimum:
-
-# Note: Only one of the following attention patches can be used at a time.
-# For example, if you set `xformers_attention` to `true`, do not set `flash_attention` to `true`.
-
-# Optional[bool]. Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
+# Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
 xformers_attention:
-# Optional[bool]. Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
+# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
 flash_attention:
-flash_attn_cross_entropy:  # Optional[bool]. Whether to use flash-attention cross entropy implementation - advanced use only
-flash_attn_rms_norm:  # Optional[bool]. Whether to use flash-attention rms norm implementation - advanced use only
-flash_attn_fuse_qkv: # Optional[bool]. Whether to fuse QKV into a single operation
-flash_attn_fuse_mlp: # Optional[bool]. Whether to fuse part of the MLP into a single operation
-# Optional[bool]. Whether to use scaled-dot-product attention
+flash_attn_cross_entropy:  # Whether to use flash-attention cross entropy implementation - advanced use only
+flash_attn_rms_norm:  # Whether to use flash-attention rms norm implementation - advanced use only
+flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
+flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
+# Whether to use scaled-dot-product attention
 # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
 sdp_attention:
-# Optional[bool]. Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
+# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
 s2_attention:
-
-# Optional[bool]. Whether to use low_cpu_mem_usage
-low_cpu_mem_usage:
-# Optional[str]. Resume from a specific checkpoint dir
+# Resume from a specific checkpoint dir
 resume_from_checkpoint:
-# Optional[bool]. If resume_from_checkpoint isn't set and you simply want it to start where it left off.
+# If resume_from_checkpoint isn't set and you simply want it to start where it left off.
 # Be careful with this being turned on between different models.
 auto_resume_from_checkpoints: false

-## Multimodal section
-# int | tuple[int, int] | None . Size to resize images to, width x height.
-# Will read from model/processor config if not set.
-image_size:
-# str. Algorithm to use for image resizing. "bilinear", "bicubic", "lanczos". Default is "bilinear".
-image_resize_algorithm: 'bilinear'
-## End of multimodal section
-
 # Don't mess with this, it's here for accelerate and torchrun
 local_rank:

@@ -664,13 +548,6 @@ special_tokens:
 # Add extra tokens.
 tokens:

-# Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer.
-# Only works for tokens that are not part of the base vocab (aka are added_tokens).
-# Can be checked if they exist in tokenizer.json added_tokens.
-added_tokens_overrides:  # Dict[int, str]
-#  128041: "<|im_start|>"
-#  128042: "<|im_end|>"
-
 # FSDP
 fsdp:
 fsdp_config:
@@ -683,20 +560,6 @@ ddp_timeout:
 ddp_bucket_cap_mb:
 ddp_broadcast_buffers:

-# Sequence parallelism
-# Set to a divisor of the number of GPUs available to split sequences into chunks of equal size.
-# Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM.
-# E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized
-# subsequences, or set to 4 to split into four equal-sized subsequences.
-# See https://docs.axolotl.ai/docs/sequence_parallelism.html for more details.
-sequence_parallel_degree:
-# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
-# Must evenly divide the number of KV heads in your model.
-heads_k_stride: 1
-# One of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to "varlen_llama3"
-# in the sample packing case, and "batch_ring" in the non-sample packing case.
-ring_attn_func:
-
 # Path to torch distx for optim 'adamw_anyprecision'
 torchdistx_path:

--- a/docs/custom_integrations.qmd
+++ b/docs/custom_integrations.qmd
@@ -55,47 +55,3 @@ sections = [
 for section_name, folder_name in sections:
    print(print_section(section_name, folder_name))
 ```
-
-## Adding a new integration
-
-Plugins can be used to customize the behavior of the training pipeline through [hooks](https://en.wikipedia.org/wiki/Hooking). See [`axolotl.integrations.BasePlugin`](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/integrations/base.py) for the possible hooks.
-
-To add a new integration, please follow these steps:
-
-1. Create a new folder in the `src/axolotl/integrations` directory.
-2. Add any relevant files (`LICENSE`, `README.md`, `ACKNOWLEDGEMENTS.md`, etc.) to the new folder.
-3. Add `__init__.py` and `args.py` files to the new folder.
-  - `__init__.py` should import the integration and hook into the appropriate functions.
-  - `args.py` should define the arguments for the integration.
-4. (If applicable) Add CPU tests under `tests/integrations` or GPU tests under `tests/e2e/integrations`.
-
-::: {.callout-tip}
-
-See [src/axolotl/integrations/cut_cross_entropy](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations/cut_cross_entropy) for a minimal integration example.
-
-:::
-
-::: {.callout-warning}
-
-If you could not load your integration, please ensure you are pip installing in editable mode.
-
-```bash
-pip install -e .
-```
-
-and correctly spelled the integration name in the config file.
-
-```yaml
-plugins:
-  - axolotl.integrations.your_integration_name.YourIntegrationPlugin
-```
-
-:::
-
-::: {.callout-note}
-
-It is not necessary to place your integration in the `integrations` folder. It can be in any location, so long as it's installed in a package in your python env.
-
-See this repo for an example: [https://github.com/axolotl-ai-cloud/diff-transformer](https://github.com/axolotl-ai-cloud/diff-transformer)
-
-:::
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -74,10 +74,6 @@ datasets:
    train_on_eos:
 ```

-::: {.callout-tip}
-If you receive an error like "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null.", it means the tokenizer does not have a default `chat_template`. Follow the examples below instead to set a custom `chat_template`.
-:::
-
 2. Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.

 ```yaml
--- a/docs/dataset-formats/index.qmd
+++ b/docs/dataset-formats/index.qmd
@@ -13,13 +13,6 @@ As there are a lot of available options in Axolotl, this guide aims to provide a

 Axolotl supports 3 kinds of training methods: pre-training, supervised fine-tuning, and preference-based post-training (e.g. DPO, ORPO, PRMs). Each method has their own dataset format which are described below.

-::: {.callout-tip}
-
-This guide will mainly use JSONL as an introduction. Please refer to the [dataset loading docs](../dataset_loading.qmd) to understand how to load datasets from other sources.
-
-For `pretraining_dataset:` specifically, please refer to the [Pre-training section](#pre-training).
-:::
-
 ## Pre-training

 When aiming to train on large corpora of text datasets, pre-training is your go-to choice. Due to the size of these datasets, downloading the entire-datasets before beginning training would be prohibitively time-consuming. Axolotl supports [streaming](https://huggingface.co/docs/datasets/en/stream) to only load batches into memory at a time.
@@ -136,7 +129,6 @@ You can mix and match within each approach or across approaches to train a model
 We suggest this approach when you want to bring your own tokenized dataset.

 Axolotl expects the dataset to have three keys:
-
 - `input_ids`: from tokenizing formatted prompt
 - `attention_mask`: for masking padding. If you don't add padding, it would be equal to `len(input_ids) * [1]`
 - `labels`: this is the same as `input_ids`, however, if you want to mask certain tokens, you would set those indices to `-100`.
@@ -457,7 +449,10 @@ datasets:
    type: alpaca
 ```

-Axolotl supports many kinds of instruction dataset. All of them can be found in the [Instruction Dataset Documentation](inst_tune.qmd) with their respective type and sample row format.
+Axolotl supports many kinds of instruction datasets. All of them are listed in the [instruction tuning formats documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/inst_tune.html) with their respective type and sample row format.
+
+
+Reference: [Instruction Dataset Documentation](inst_tune.qmd).

 #### Custom Instruct Prompt Format

--- a/docs/dataset_loading.qmd
+++ b/docs/dataset_loading.qmd
@@ -1,276 +0,0 @@
---
-title: Dataset Loading
-description: Understanding how to load datasets from different sources
-back-to-top-navigation: true
-toc: true
-toc-depth: 5
---
-
-## Overview
-
-Datasets can be loaded in a number of different ways depending on the how it is saved (the extension of the file) and where it is stored.
-
-## Loading Datasets
-
-We use the `datasets` library to load datasets and a mix of `load_dataset` and `load_from_disk` to load them.
-
-You may recognize the similar named configs between `load_dataset` and the `datasets` section of the config file.
-
-```yaml
-datasets:
-  - path:
-    name:
-    data_files:
-    split:
-    revision:
-    trust_remote_code:
-```
-
-::: {.callout-tip}
-
-Do not feel overwhelmed by the number of options here. A lot of them are optional. In fact, the most common config to use would be `path` and sometimes `data_files`.
-
-:::
-
-This matches the API of [`datasets.load_dataset`](https://github.com/huggingface/datasets/blob/0b5998ac62f08e358f8dcc17ec6e2f2a5e9450b6/src/datasets/load.py#L1838-L1858), so if you're familiar with that, you will feel right at home.
-
-For HuggingFace's guide to load different dataset types, see [here](https://huggingface.co/docs/datasets/loading).
-
-For full details on the config, see [config.qmd](config.qmd).
-
-::: {.callout-note}
-
-You can set multiple datasets in the config file by more than one entry under `datasets`.
-
-```yaml
-datasets:
-  - path: /path/to/your/dataset
-  - path: /path/to/your/other/dataset
-```
-
-:::
-
-### Local dataset
-
-#### Files
-
-Usually, to load a JSON file, you would do something like this:
-
-```python
-from datasets import load_dataset
-
-dataset = load_dataset("json", data_files="data.json")
-```
-
-Which translates to the following config:
-
-```yaml
-datasets:
-  - path: json
-    data_files: /path/to/your/file.jsonl
-```
-
-However, to make things easier, we have added a few shortcuts for loading local dataset files.
-
-You can just point the `path` to the file or directory along with the `ds_type` to load the dataset. The below example shows for a JSON file:
-
-```yaml
-datasets:
-  - path: /path/to/your/file.jsonl
-    ds_type: json
-```
-
-This works for CSV, JSON, Parquet, and Arrow files.
-
-::: {.callout-tip}
-
-If `path` points to a file and `ds_type` is not specified, we will automatically infer the dataset type from the file extension, so you could omit `ds_type` if you'd like.
-
-:::
-
-#### Directory
-
-If you're loading a directory, you can point the `path` to the directory.
-
-Then, you have two options:
-
-##### Loading entire directory
-
-You do not need any additional configs.
-
-We will attempt to load in the following order:
- datasets saved with `datasets.save_to_disk`
- loading entire directory of files (such as with parquet/arrow files)
-
-```yaml
-datasets:
-  - path: /path/to/your/directory
-```
-
-##### Loading specific files in directory
-
-Provide `data_files` with a list of files to load.
-
-```yaml
-datasets:
-    # single file
-  - path: /path/to/your/directory
-    ds_type: csv
-    data_files: file1.csv
-
-    # multiple files
-  - path: /path/to/your/directory
-    ds_type: json
-    data_files:
-      - file1.jsonl
-      - file2.jsonl
-
-    # multiple files for parquet
-  - path: /path/to/your/directory
-    ds_type: parquet
-    data_files:
-      - file1.parquet
-      - file2.parquet
-
-```
-
-### HuggingFace Hub
-
-The method you use to load the dataset depends on how the dataset was created, whether a folder was uploaded directly or a HuggingFace Dataset was pushed.
-
-::: {.callout-note}
-
-If you're using a private dataset, you will need to enable the `hf_use_auth_token` flag in the root-level of the config file.
-
-:::
-
-#### Folder uploaded
-
-This would mean that the dataset is a single file or file(s) uploaded to the Hub.
-
-```yaml
-datasets:
-  - path: org/dataset-name
-    data_files:
-      - file1.jsonl
-      - file2.jsonl
-```
-
-#### HuggingFace Dataset
-
-This means that the dataset is created as a HuggingFace Dataset and pushed to the Hub via `datasets.push_to_hub`.
-
-```yaml
-datasets:
-  - path: org/dataset-name
-```
-
-::: {.callout-note}
-
-There are some other configs which may be required like `name`, `split`, `revision`, `trust_remote_code`, etc depending on the dataset.
-
-:::
-
-### Remote Filesystems
-
-Via the `storage_options` config under `load_dataset`, you can load datasets from remote filesystems like S3, GCS, Azure, and OCI.
-
-::: {.callout-warning}
-
-This is currently experimental. Please let us know if you run into any issues!
-
-:::
-
-The only difference between the providers is that you need to prepend the path with the respective protocols.
-
-```yaml
-datasets:
-    # Single file
-  - path: s3://bucket-name/path/to/your/file.jsonl
-
-    # Directory
-  - path: s3://bucket-name/path/to/your/directory
-```
-
-For directory, we load via `load_from_disk`.
-
-#### S3
-
-Prepend the path with `s3://`.
-
-The credentials are pulled in the following order:
-
- `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_SESSION_TOKEN` environment variables
- from the `~/.aws/credentials` file
- for nodes on EC2, the IAM metadata provider
-
-::: {.callout-note}
-
-We assume you have credentials setup and not using anonymous access. If you want to use anonymous access, let us know! We may have to open a config option for this.
-
-:::
-
-Other environment variables that can be set can be found in [boto3 docs](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html#using-environment-variables)
-
-#### GCS
-
-Prepend the path with `gs://` or `gcs://`.
-
-The credentials are loaded in the following order:
-
- gcloud credentials
- for nodes on GCP, the google metadata service
- anonymous access
-
-#### Azure
-
-##### Gen 1
-
-Prepend the path with `adl://`.
-
-Ensure you have the following environment variables set:
-
- `AZURE_STORAGE_TENANT_ID`
- `AZURE_STORAGE_CLIENT_ID`
- `AZURE_STORAGE_CLIENT_SECRET`
-
-##### Gen 2
-
-Prepend the path with `abfs://` or `az://`.
-
-Ensure you have the following environment variables set:
-
- `AZURE_STORAGE_ACCOUNT_NAME`
- `AZURE_STORAGE_ACCOUNT_KEY`
-
-Other environment variables that can be set can be found in [adlfs docs](https://github.com/fsspec/adlfs?tab=readme-ov-file#setting-credentials)
-
-#### OCI
-
-Prepend the path with `oci://`.
-
-It would attempt to read in the following order:
-
- `OCIFS_IAM_TYPE`, `OCIFS_CONFIG_LOCATION`, and `OCIFS_CONFIG_PROFILE` environment variables
- when on OCI resource, resource principal
-
-Other environment variables:
-
- `OCI_REGION_METADATA`
-
-Please see the [ocifs docs](https://ocifs.readthedocs.io/en/latest/getting-connected.html#Using-Environment-Variables).
-
-### HTTPS
-
-The path should start with `https://`.
-
-```yaml
-datasets:
-  - path: https://path/to/your/dataset/file.jsonl
-```
-
-This must be publically accessible.
-
-## Next steps
-
-Now that you know how to load datasets, you can learn more on how to load your specific dataset format into your target output format [dataset formats docs](dataset-formats).
--- a/docs/dataset_preprocessing.qmd
+++ b/docs/dataset_preprocessing.qmd
@@ -6,7 +6,7 @@ description: How datasets are processed
 ## Overview

 Dataset pre-processing is the step where Axolotl takes each dataset you've configured alongside
-the [dataset format](dataset-formats) and prompt strategies to:
+the [dataset format](dataset-formats) and prompt strategies to:

 - parse the dataset based on the *dataset format*
 - transform the dataset to how you would interact with the model based on the *prompt strategy*
--- a/docs/docker.qmd
+++ b/docs/docker.qmd
@@ -1,139 +0,0 @@
---
-title: "Docker"
-format:
-  html:
-    toc: true
-    toc-depth: 4
---
-
-This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).
-
-## Base
-
-The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.
-
-#### Image
-
-```
-axolotlai/axolotl-base
-```
-
-Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-base)
-
-#### Tags format
-
-```bash
-main-base-py{python_version}-cu{cuda_version}-{pytorch_version}
-```
-
-Tags examples:
-
- `main-base-py3.11-cu124-2.6.0`
- `main-base-py3.11-cu124-2.5.1`
- `main-base-py3.11-cu124-2.4.1`
-
-## Main
-
-The main image is the image that is used to run Axolotl. It is based on the `axolotlai/axolotl-base` image and includes the Axolotl codebase, dependencies, and more.
-
-#### Image
-
-```
-axolotlai/axolotl
-```
-
-Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl)
-
-#### Tags format {#sec-main-tags}
-
-```bash
-# on push to main
-main-py{python_version}-cu{cuda_version}-{pytorch_version}
-
-# latest main (currently torch 2.5.1, python 3.11, cuda 12.4)
-main-latest
-
-# nightly build
-{branch}-{date_in_YYYYMMDD}-py{python_version}-cu{cuda_version}-{pytorch_version}
-
-# tagged release
-{version}
-```
-
-:::{.callout-tip}
-
-There may be some extra tags appended to the image, like `-vllm` which installs those packages.
-
-:::
-
-Tags examples:
-
- `main-py3.11-cu124-2.6.0`
- `main-py3.11-cu124-2.5.1`
- `main-py3.11-cu124-2.4.1`
- `main-latest`
- `main-20250303-py3.11-cu124-2.6.0`
- `main-20250303-py3.11-cu124-2.5.1`
- `main-20250303-py3.11-cu124-2.4.1`
- `0.7.1`
-
-## Cloud
-
-The cloud image is the image that is used to run Axolotl in the cloud. It is based on the `axolotlai/axolotl` image and sets ENV variables like HuggingFace cache directories for volume mounts, tmux, and more for different cloud providers.
-
-:::{.callout-tip}
-
-Jupyter lab is run by default. Set `JUPYTER_DISABLE=1` in the environment variables to disable it.
-
-:::
-
-#### Image
-
-```
-axolotlai/axolotl-cloud
-```
-
-Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-cloud)
-
-#### Tags format
-
-This uses the same tags as the [`main` image](#sec-main-tags).
-
-#### Environment variables
-
- `JUPYTER_DISABLE`: Disable Jupyter lab.
- `JUPYTER_PASSWORD`: Set a password for the Jupyter lab.
- `PUBLIC_KEY` / `SSH_KEY`: Add a public key for the SSH service.
-
-#### Volume mounts
-
-:::{.callout-tip}
-
-We recommend mounting volumes to `/workspace/data` for data persistence. `/workspace/axolotl` contains the source code and is ephemeral.
-
-:::
-
- `/workspace/data/axolotl-artifacts`: Directory to store Axolotl artifacts.
- `/workspace/data/huggingface-cache`: Directory to store HuggingFace cache.
-
-## Cloud-no-tmux
-
-This is the same as the [`cloud` image](#sec-cloud) but without tmux.
-
-#### Image
-
-```
-axolotlai/axolotl-cloud-term
-```
-
-Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-cloud-term)
-
-:::{.callout-note}
-
-The naming may be a bit confusing as it has `-term` appended to the end.
-
-:::
-
-#### Tags format
-
-This uses the same tags as the [`cloud` image](#sec-cloud-tags).
--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -19,38 +19,12 @@ description: Frequently asked questions

 **Q: AttributeError: 'DummyOptim' object has no attribute 'step'**

-**Q: ModuleNotFoundError: No module named 'mpi4py' using single GPU with deepspeed**
-
-> A: You may be using deepspeed with single gpu. Please remove the `deepspeed:` section in the yaml file or `--deepspeed` CLI flag.
+> A: You may be using DeepSpeed with a single GPU. Please remove the `deepspeed:` section from the YAML config, or drop the `--deepspeed` flag from the CLI command.

 **Q: The codes is stuck on saving preprocessed datasets.**

 > A: This is usually an issue with the GPU. This can be resolved through setting the os environment variable `CUDA_VISIBLE_DEVICES=0`. If you are on runpod, this is usually a pod issue. Starting a new pod should take care of it.

-**Q: Received mismatch error on merge adapters / loading adapters between torch.Size of checkpoint and model.**
-
-> A: This is likely due to vocab size mismatch. By default, Axolotl expands the model's embeddings if the tokenizer has more tokens than the model. Please use the `axolotl merge-lora` command to merge the adapters instead of using your own scripts.
-
-> On the other hand, if the model has more tokens than the tokenizer, Axolotl does not shrink the model's embeddings unless `shrink_embeddings: true` is set in the config.
-
-**Q: How to call Axolotl via custom python scripts?**
-
-> A: Since Axolotl is just Python, please see `src/axolotl/cli/main.py` on how each command is called.
-
-**Q: How to know the value to use for `fsdp_transformer_layer_cls_to_wrap`?**
-
-> A: This is the class name of the transformer layer to wrap with FSDP. For example, for `LlamaForCausalLM`, the value is `LlamaDecoderLayer`. To find this for a specific model, check the model's `PreTrainedModel` definition and look for `_no_split_modules` variable in the `modeling_<model_name>.py` file within `transformers` library.
-
-**Q: ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as pad_token**
-
-> A: This is because the tokenizer does not have a padding token. Please add a padding token to the tokenizer via:
-
-> ```yaml
-> special_tokens:
->   # str. If you're not sure, set to same as `eos_token`.
->   pad_token: "..."
-> ```
-
 ### Chat templates

 **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
@@ -76,7 +50,3 @@ description: Frequently asked questions
 **Q: The EOS/EOT token is incorrectly being masked or not being masked.**

 > A: This is because of the mismatch between `tokenizer.eos_token` and EOS/EOT token in template. Please make sure to set `eos_token` under `special_tokens` to the same EOS/EOT token as in template.
-
-**Q: "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null. Please add a `chat_template` in tokenizer config"**
-
-> A: This is because the tokenizer does not have a chat template. Please add a chat template in the tokenizer config. See [chat_template](dataset-formats/conversation.qmd#chat-template) for more details.
--- a/docs/getting-started.qmd
+++ b/docs/getting-started.qmd
@@ -36,9 +36,7 @@ The YAML configuration file controls everything about your training. Here's what

 ```yaml
 base_model: NousResearch/Llama-3.2-1B
-
-load_in_8bit: true
-adapter: lora
+# hub_model_id: username/custom_model_name

 datasets:
  - path: teknium/GPT4-LLM-Cleaned
@@ -46,15 +44,11 @@ datasets:
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.1
 output_dir: ./outputs/lora-out
+
+adapter: lora
+lora_model_dir:
 ```

-::: {.callout-tip}
-`load_in_8bit: true` and `adapter: lora` enables LoRA adapter finetuning.
-
- To perform Full finetuning, remove these two lines.
- To perform QLoRA finetuning, replace with `load_in_4bit: true` and `adapter: qlora`.
-:::
-
 See our [Config options](config.qmd) for more details.

 ### Training {#sec-training}
@@ -62,7 +56,7 @@ See our [Config options](config.qmd) for more details.
 When you run `axolotl train`, Axolotl:

 1. Downloads the base model
-2. (If specified) applies QLoRA/LoRA adapter layers
+2. (If specified) applies LoRA adapter layers
 3. Loads and processes the dataset
 4. Runs the training loop
 5. Saves the trained model and / or LoRA weights
@@ -75,8 +69,6 @@ Let's modify the example for your own data:

 ```yaml
 base_model: NousResearch/Nous-Hermes-llama-1b-v1
-
-load_in_8bit: true
 adapter: lora

 # Training settings
@@ -112,6 +104,8 @@ format):
 {"instruction": "Classify this text", "input": "Not good at all", "output": "negative"}
 ```

+Please consult the supported [Dataset Formats](dataset-formats/) for more details.
+
 3. Run the training:

 ```bash
--- a/docs/inference.qmd
+++ b/docs/inference.qmd
@@ -1,5 +1,5 @@
 ---
-title: "Inference and Merging"
+title: "Inference"
 format:
  html:
    toc: true
@@ -9,14 +9,10 @@ execute:
  enabled: false
 ---

-This guide covers how to use your trained models for inference, including model loading, interactive testing, merging adapters, and common troubleshooting steps.
+This guide covers how to use your trained models for inference, including model loading, interactive testing, and common troubleshooting steps.

 ## Quick Start {#sec-quickstart}

-::: {.callout-tip}
-Use the same config used for training on inference/merging.
-:::
-
 ### Basic Inference {#sec-basic}

 ::: {.panel-tabset}
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -19,16 +19,9 @@ This guide covers all the ways you can install and set up Axolotl for your envir

 ## Installation Methods {#sec-installation-methods}

-::: {.callout-important}
-Please make sure to have Pytorch installed before installing Axolotl in your local environment.
-
-Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
-:::
-
 ### PyPI Installation (Recommended) {#sec-pypi}

 ```{.bash}
-pip3 install -U packaging setuptools wheel ninja
 pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
 ```

@@ -44,7 +37,7 @@ For the latest features between releases:
 ```{.bash}
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
-pip3 install -U packaging setuptools wheel ninja
+pip3 install packaging ninja
 pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
 ```

@@ -72,8 +65,6 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
 ```
 :::

-Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.
-
 ## Cloud Environments {#sec-cloud}

 ### Cloud GPU Providers {#sec-cloud-gpu}
@@ -85,7 +76,6 @@ For providers supporting Docker:
  - [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
  - [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl)
  - [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
-  - [Novita](https://novita.ai/gpus-console?templateId=311)

 ### Google Colab {#sec-colab}

@@ -115,7 +105,7 @@ We recommend using WSL2 (Windows Subsystem for Linux) or Docker.
 2. Install PyTorch: https://pytorch.org/get-started/locally/
 3. Install Axolotl:
   ```{.bash}
-   pip3 install -U packaging setuptools wheel ninja
+   pip3 install packaging
   pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
   ```
 4. (Optional) Login to Hugging Face:
--- a/docs/lora_optims.qmd
+++ b/docs/lora_optims.qmd
@@ -17,7 +17,6 @@ We currently support several common model architectures, including (but not limi
 - `qwen2`
 - `gemma`
 - `gemma2`
- `gemma3`

 <details>

@@ -67,10 +66,6 @@ logic to be compatible with more of them.

 </details>

-::: {.callout-tip}
-Check out our [LoRA optimizations blog](https://axolotlai.substack.com/p/accelerating-lora-fine-tuning-with).
-:::
-
 ## Usage

 These optimizations can be enabled in your Axolotl config YAML file. The
--- a/docs/multi-gpu.qmd
+++ b/docs/multi-gpu.qmd
@@ -18,7 +18,6 @@ Axolotl supports several methods for multi-GPU training:

 - DeepSpeed (recommended)
 - FSDP (Fully Sharded Data Parallel)
- Sequence parallelism
 - FSDP + QLoRA

 ## DeepSpeed {#sec-deepspeed}
@@ -36,9 +35,6 @@ deepspeed: deepspeed_configs/zero1.json
 ### Usage {#sec-deepspeed-usage}

 ```{.bash}
-# Fetch deepspeed configs (if not already present)
-axolotl fetch deepspeed_configs
-
 # Passing arg via config
 axolotl train config.yml

@@ -51,20 +47,10 @@ axolotl train config.yml --deepspeed deepspeed_configs/zero1.json
 We provide default configurations for:

 - ZeRO Stage 1 (`zero1.json`)
- ZeRO Stage 1 with torch compile (`zero1_torch_compile.json`)
 - ZeRO Stage 2 (`zero2.json`)
 - ZeRO Stage 3 (`zero3.json`)
- ZeRO Stage 3 with bf16 (`zero3_bf16.json`)
- ZeRO Stage 3 with bf16 and CPU offload params(`zero3_bf16_cpuoffload_params.json`)
- ZeRO Stage 3 with bf16 and CPU offload params and optimizer (`zero3_bf16_cpuoffload_all.json`)

-::: {.callout-tip}
-
-Choose the configuration that offloads the least amount to memory while still being able to fit on VRAM for best performance.
-
-Start from Stage 1 -> Stage 2 -> Stage 3.
-
-:::
+Choose based on your memory requirements and performance needs.

 ## FSDP {#sec-fsdp}

@@ -80,28 +66,6 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
 ```

-## Sequence parallelism {#sec-sequence-parallelism}
-
-We support sequence parallelism (SP) via the
-[ring-flash-attention](https://github.com/zhuzilin/ring-flash-attention) project. This
-allows one to split up sequences across GPUs, which is useful in the event that a
-single sequence causes OOM errors during model training.
-
-First, install `ring-flash-attn`, recommended via `pip install axolotl[ring-flash-attn]`,
-or from source with `pip install .[ring-flash-attn]`.
-
-Your Axolotl YAML config should contain the following lines:
-
-```{.yaml}
-sequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU
-flash_attention: true  # Required with sequence parallelism
-
-# Optional; strides across the key dimension. Larger values use more memory but will make training faster.
-heads_k_stride: 1
-```
-
-See our [dedicated guide](sequence_parallelism.qmd) for more details.
-
 ### FSDP + QLoRA {#sec-fsdp-qlora}

 For combining FSDP with QLoRA, see our [dedicated guide](fsdp_qlora.qmd).
--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -1,180 +1,28 @@
---
-title: MultiModal / Vision Language Models (BETA)
-format:
-  html:
-    toc: true
-    toc-depth: 3
---
+# MultiModal / Vision Language Models (BETA)

-## Supported Models
+### Supported Models

- [Mllama](#sec-mllama)
- [Llama4](#sec-llama4)
- [Pixtral](#sec-pixtral)
- [Llava-1.5](#sec-llava-15)
- [Mistral-Small-3.1](#sec-mistral-small-31)
- [Gemma-3](#sec-gemma-3)
- [Qwen2-VL](#sec-qwen2-vl)
- [Qwen2.5-VL](#sec-qwen25-vl)
+- Mllama, i.e. llama with vision models

-## Usage
+### Usage

-Multimodal support is limited and doesn't have full feature parity.
-
-Here are the hyperparams you'll need to use to finetune a multimodal model.
+Currently, multimodal support is limited and doesn't have full feature parity. To finetune a multimodal Llama with LoRA,
+you'll need to use the following YAML settings in combination with the rest of the required hyperparameters.

 ```yaml
+base_model: alpindale/Llama-3.2-11B-Vision-Instruct
 processor_type: AutoProcessor
-
 skip_prepare_dataset: true
-remove_unused_columns: false  # leave columns in place as they are needed to handle image embeddings during training
-sample_packing: false  # not yet supported with multimodal

-chat_template:  # see in next section
-
-# example dataset
+chat_template: llama3_2_vision
 datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]
    field_messages: messages
+remove_unused_columns: false
+sample_packing: false

-# (optional) if doing lora, only finetune the Language model,
-# leave the vision model and vision tower frozen
-# load_in_8bit: true
-adapter: lora
+# only finetune the Language model, leave the vision model and vision tower frozen
 lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
-
-# (optional) if you want to resize images to a set size
-image_size: 512
-image_resize_algorithm: bilinear
-```
-
-Please see [examples](https://github.com/axolotl-ai/axolotl/tree/main/examples) folder for full configs.
-
-::: {.callout-warning}
-Some of our chat_templates have been extended to support broader dataset types. This should not break any existing configs.
-:::
-
-### Mllama {#sec-mllama}
-
-```yaml
-base_model: meta-llama/Llama-3.2-11B-Vision-Instruct
-
-chat_template: llama3_2_vision
-```
-
-### Llama4 {#sec-llama4}
-
-```yaml
-base_model: meta-llama/Llama-4-Scout-17B-16E-Instruct
-
-chat_template: llama4
-```
-
-### Pixtral {#sec-pixtral}
-
-```yaml
-base_model: mistralai/Pixtral-12B-2409
-
-chat_template: pixtral
-```
-
-### Llava-1.5 {#sec-llava-15}
-
-```yaml
-base_model: llava-hf/llava-1.5-7b-hf
-
-chat_template: llava
-```
-
-### Mistral-Small-3.1 {#sec-mistral-small-31}
-
-```yaml
-base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
-
-chat_template: mistral_v7_tekken
-```
-
-### Gemma-3 {#sec-gemma-3}
-
-::: {.callout-tip}
-The Gemma3-1B model is a text-only model, so please train as regular text model.
-:::
-
-For multi-modal 4B/12B/27B models, use the following config:
-
-```yaml
-base_model: google/gemma-3-4b-it
-
-chat_template: gemma3
-```
-
-### Qwen2-VL {#sec-qwen2-vl}
-
-```yaml
-base_model: Qwen/Qwen2-VL-7B-Instruct
-
-chat_template: qwen2_vl
-```
-
-### Qwen2.5-VL {#sec-qwen25-vl}
-
-```yaml
-base_model: Qwen/Qwen2.5-VL-7B-Instruct
-
-chat_template: qwen2_vl  # same as qwen2-vl
-```
-
-## Dataset Format
-
-For multi-modal datasets, we adopt an extended `chat_template` format similar to OpenAI's Message format.
-
- A message is a list of `role` and `content`.
- `role` can be `system`, `user`, `assistant`, etc.
- `content` is a list of `type` and (`text` or `image` or `path` or `url` or `base64`).
-
-::: {.callout-note}
-For backwards compatibility:
-
- If the dataset has a `images` or `image` column of `list[Image]`, it will be appended to the first `content` list as `{"type": "image", "image": ...}`. However, if the content already has a `{"type": "image"}` but no `image` key, it will be set the `image` key.
- If `content` is a string, it will be converted to a list with `type` as `text`.
-:::
-
-::: {.callout-tip}
-For image loading, you can use the following keys within `content` alongside `"type": "image"`:
-
- `"path": "/path/to/image.jpg"`
- `"url": "https://example.com/image.jpg"`
- `"base64": "..."`
- `"image": PIL.Image`
-:::
-
-Here is an example of a multi-modal dataset:
-```json
-[
-  {
-    "messages": [
-        {
-            "role": "system",
-            "content": [
-              {"type": "text", "text": "You are a helpful assistant."}
-              ]
-        },
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
-                {"type": "text", "text": "Describe this image in detail."}
-            ]
-        },
-        {
-            "role": "assistant",
-            "content": [
-              {"type": "text", "text": "The image is a bee."}
-            ]
-        }
-    ]
-  }
-]
 ```
--- a/docs/reward_modelling.qmd
+++ b/docs/reward_modelling.qmd
@@ -28,23 +28,8 @@ val_set_size: 0.1
 eval_steps: 100
 ```

-Bradley-Terry chat templates expect single-turn conversations in the following format:
-
-```json
-{
-    "system": "...", // optional
-    "input": "...",
-    "chosen": "...",
-    "rejected": "..."
-}
-```
-
 ### Process Reward Models (PRM)

-::: {.callout-tip}
-Check out our [PRM blog](https://axolotlai.substack.com/p/process-reward-models).
-:::
-
 Process reward models are trained using data which contains preference annotations for each step in a series of interactions. Typically, PRMs are trained to provide reward signals over each step of a reasoning trace and are used for downstream reinforcement learning.
 ```yaml
 base_model: Qwen/Qwen2.5-3B
@@ -60,5 +45,3 @@ datasets:
 val_set_size: 0.1
 eval_steps: 100
 ```
-
-Please see [stepwise_supervised](dataset-formats/stepwise_supervised.qmd) for more details on the dataset format.
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -3,7 +3,6 @@ title: "RLHF (Beta)"
 description: "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human feedback."
 back-to-top-navigation: true
 toc: true
-toc-expand: 2
 toc-depth: 4
 ---

@@ -298,7 +297,7 @@ The input format is a simple JSON input with customizable fields based on the ab

 ### IPO

-As IPO is just DPO with a different loss function, all supported dataset formats for [DPO](#dpo) are also supported for IPO.
+As IPO is just DPO with a different loss function, all supported options for DPO work here.

 ```yaml
 rl: ipo
@@ -344,9 +343,8 @@ ORPO supports the following types with the following dataset format:

 ```yaml
 rl: kto
-rl_beta: 0.1  # default
-kto_desirable_weight: 1.0  # default
-kto_undesirable_weight: 1.0  # default
+rl_beta: 0.5
+kto_desirable_weight: 0.2

 remove_unused_columns: false

@@ -498,52 +496,9 @@ The input format is a simple JSON input with customizable fields based on the ab

 ### GRPO

-::: {.callout-tip}
-Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
-:::
-
-If you have multiple GPUs available, we reccomend using `vLLM` with the `GRPOTrainer` to significantly speedup trajectory generation during training.
-First, launch a `vLLM` server using `trl vllm-serve` - you may use a config file or CLI overrides to configure your vLLM server. In this example, we're
-using 4 GPUs - 2 for training, and 2 for vLLM:
-
-::: {.callout-important}
-Make sure you've installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. `pip install axolotl[vllm]`.
-:::
-
-```yaml
-base_model: Qwen/Qwen2.5-1.5B-Instruct
-
-vllm:
-    host: 0.0.0.0
-    port: 8000
-    tensor_parallel_size: 2
-    gpu_memory_utilization: 0.85
-    dtype: auto
-    # max_model_len: # you may find it useful to set the vLLM model context length if you know this beforehand
-
-rl: grpo
-trl:
-    use_vllm: true
-    vllm_server_host: 0.0.0.0
-    vllm_server_port: 8000
-    vllm_server_timeout: 300
-```
-
-```bash
-CUDA_VISIBLE_DEVICES=2,3 axolotl vllm-serve grpo.yaml
-```
-
-Your `vLLM` instance will now attempt to spin up, and it's time to kick off training utilizing our remaining two GPUs. In another terminal, execute:
-
-```bash
-CUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2
-```
-
-#### Reward functions
-
 GRPO uses custom reward functions and transformations. Please have them ready locally.

-For example, to load OpenAI's GSM8K and use a random reward for completions:
+For example, to load OpenAI's GSM8K and use a random reward for completions:

 ```python
 # rewards.py
@@ -569,9 +524,10 @@ trl:
    beta: 0.001
    max_completion_length: 256
    use_vllm: True
+    vllm_device: auto
+    vllm_gpu_memory_utilization: 0.15
    num_generations: 4
    reward_funcs: ["rewards.rand_reward_func"]    # format: '{file_name}.{fn_name}'
-    reward_weights: [1.0]
 datasets:
  - path: openai/gsm8k
    name: main
@@ -580,21 +536,6 @@ datasets:

 To see other examples of custom reward functions, please see [TRL GRPO Docs](https://github.com/huggingface/trl/blob/main/docs/source/grpo_trainer.md#using-a-custom-reward-function).

-To see description of the configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/config/models/input/v0_4_1/trl.py).
-
-### SimPO
-
-SimPO uses [CPOTrainer](https://huggingface.co/docs/trl/main/en/cpo_trainer) but with alternative loss function.
-
-```yaml
-rl: simpo
-rl_beta: 0.1  # default in CPOTrainer
-cpo_alpha: 1.0  # default in CPOTrainer
-simpo_gamma: 0.5  # default in CPOTrainer
-```
-
-This method uses the same dataset format as [DPO](#dpo).
-
 ### Using local dataset files

 ```yaml
--- a/docs/sequence_parallelism.qmd
+++ b/docs/sequence_parallelism.qmd
@@ -1,100 +0,0 @@
---
-title: Sequence Parallelism
-description: Train with long sequences split across multiple GPUs.
---
-
-# Sequence Parallelism
-
-Sequence parallelism is a technique that splits sequences across multiple GPUs,
-allowing you to train with very long sequences that wouldn't fit on a single GPU. Each
-GPU processes a different portion of the sequence, and the results are aggregated
-through a ring communication pattern.
-
-## When to Use Sequence Parallelism
-
-Use sequence parallelism when:
-
- You need to train with sequence lengths that don't fit into a single GPU's memory
- You have multiple GPUs available
- You're experiencing OOM (Out Of Memory) errors with long sequences
-
-## Configuration
-
-To enable sequence parallelism, add the following to your configuration file:
-
-```yaml
-# Set to a divisor (> 1) of the number of GPUs available
-sequence_parallel_degree: 4  # Split sequences across 4 GPUs
-# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
-heads_k_stride: 1
-# Optional; one of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to
-# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
-ring_attn_func:
-```
-
-The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:
-
- With 8 GPUs, valid values would be 2, 4, or 8
- With 4 GPUs, valid values would be 2 or 4
-
-## Implementation Details
-
-When sequence parallelism is enabled:
-
-1. Each sequence is divided into equal chunks across the GPUs in a sequence parallel group
-2. The data collator handles the chunking of input_ids, attention_mask, labels, and position_ids
-3. Position IDs are adjusted to maintain proper relative positions, especially for packed sequences
-4. The trainer uses special ring communication patterns for attention operations
-
-## Requirements
-
-To use sequence parallelism, you need:
-
- Multiple GPUs (at least 2)
- The `ring-flash-attn` package. Install with:
-  - `pip install axolotl[ring-flash-attn]` (preferred)
-  - `pip install ring-flash-attn>=0.1.4`
-
-## Limitations
-
- Flash attention must be enabled for this to work (`flash_attention: true` in config YAML)
- May have a small performance overhead due to communication between GPUs
-
-## Example
-
-```yaml
-base_model: meta-llama/Llama-3-8B-Instruct
-sequence_len: 8192
-
-...
-
-sequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU
-flash_attention: true  # Required with sequence parallelism
-# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
-heads_k_stride: 1
-
-...
-```
-
-This will train the Llama 3 8B model with 8K context length, with each sequence split
-into 2 subsequences of length 4096 across 2 GPUs.
-
-## Sample Packing with Sequence Parallelism
-
-Sequence parallelism is compatible with Axolotl's sample packing functionality. When using both features together:
-
-1. Samples are first packed together
-2. The packed sequences are then divided across GPUs in the sequence parallel group
-3. Position IDs are automatically adjusted to maintain proper relative positions
-
-## Effect on Batch Size
-
-When using sequence parallelism, your effective global batch size is **divided** by the `sequence_parallel_degree`. This happens because:
-
- Each group of `sequence_parallel_degree` GPUs works on the same batch (just different parts of each sequence)
- The number of batches processed per step decreases
-
-For example:
- With 8 GPUs and no sequence parallelism: 8 different batches processed per step
- With 8 GPUs and `sequence_parallel_degree=4`: Only 2 different batches processed per step (each split across 4 GPUs)
- If your per-GPU `micro_batch_size` is 2, the global batch size decreases from 16 to 4
--- a/examples/cerebras/btlm-ft.yml
+++ b/examples/cerebras/btlm-ft.yml
@@ -8,6 +8,10 @@ tokenizer_type: GPT2Tokenizer
 trust_remote_code: true
 tokenizer_use_fast: true
 tokenizer_legacy: true
+
+load_in_8bit: false
+load_in_4bit: false
+strict: false
 push_dataset_to_hub:
 hf_use_auth_token: true
 datasets:
@@ -30,6 +34,7 @@ lora_alpha:
 lora_dropout:
 lora_target_modules:
 lora_target_linear:
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -53,12 +58,16 @@ learning_rate: 0.000085
 train_on_inputs: true
 group_by_length: false
 bf16: auto
+fp16:
 tf32: true

 gradient_checkpointing: false
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1

+xformers_attention:
 flash_attention: true
 sdp_attention:
 flash_optimum:
@@ -71,6 +80,8 @@ evals_per_epoch: 4
 saves_per_epoch: 1
 save_total_limit:

+debug:
+deepspeed:
 weight_decay: 0.1
 special_tokens:
  pad_token: "<|endoftext|>"
--- a/examples/cerebras/qlora.yml
+++ b/examples/cerebras/qlora.yml
@@ -4,6 +4,7 @@ base_model: cerebras/Cerebras-GPT-1.3B

 load_in_8bit: false
 load_in_4bit: true
+strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
@@ -21,6 +22,7 @@ lora_target_modules:
  - c_attn
  - c_proj
 lora_target_linear:
+lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
@@ -34,10 +36,15 @@ optimizer: paged_adamw_8bit
 torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0002
+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: true
 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
 xformers_attention: true
 flash_attention:
@@ -46,6 +53,10 @@ gptq_model_v1:
 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.1
+fsdp:
+fsdp_config:
 special_tokens:
  pad_token: "<|endoftext|>"
--- a/examples/code-llama/13b/lora.yml
+++ b/examples/code-llama/13b/lora.yml
@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer

 load_in_8bit: true
 load_in_4bit: false
+strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
@@ -25,6 +26,7 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -39,18 +41,29 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true
+s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
--- a/examples/code-llama/13b/qlora.yml
+++ b/examples/code-llama/13b/qlora.yml
@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer

 load_in_8bit: false
 load_in_4bit: true
+strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
@@ -25,7 +26,9 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
+lora_target_modules:
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -40,18 +43,28 @@ optimizer: paged_adamw_32bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
--- a/examples/code-llama/34b/lora.yml
+++ b/examples/code-llama/34b/lora.yml
@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer

 load_in_8bit: true
 load_in_4bit: false
+strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
@@ -25,6 +26,7 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -39,18 +41,29 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true
+s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
--- a/examples/code-llama/34b/qlora.yml
+++ b/examples/code-llama/34b/qlora.yml
@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer

 load_in_8bit: false
 load_in_4bit: true
+strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
@@ -25,7 +26,9 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
+lora_target_modules:
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -40,18 +43,28 @@ optimizer: paged_adamw_32bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
--- a/examples/code-llama/7b/lora.yml
+++ b/examples/code-llama/7b/lora.yml
@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer

 load_in_8bit: true
 load_in_4bit: false
+strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
@@ -25,6 +26,7 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -39,18 +41,29 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true
+s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
--- a/examples/code-llama/7b/qlora.yml
+++ b/examples/code-llama/7b/qlora.yml
@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer

 load_in_8bit: false
 load_in_4bit: true
+strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
@@ -25,7 +26,9 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
+lora_target_modules:
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -40,18 +43,28 @@ optimizer: paged_adamw_32bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
--- a/examples/cohere/command-r-7b-qlora.yml
+++ b/examples/cohere/command-r-7b-qlora.yml
@@ -1,58 +0,0 @@
-base_model: CohereForAI/c4ai-command-r7b-12-2024
-model_type: AutoModelForCausalLM
-tokenizer_type: AutoTokenizer
-
-load_in_8bit: false
-load_in_4bit: true
-
-# huggingface repo
-chat_template: cohere
-datasets:
-  - path: cgato/SlimOrcaDedupCleaned
-    type: chat_template
-    field_messages: conversations
-    message_property_mappings:
-      role: from
-      content: value
-
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-adapter: qlora
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-
-sequence_len: 2048
-sample_packing: true
-eval_sample_packing: false
-pad_to_sequence_len: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-
-gradient_accumulation_steps: 4
-micro_batch_size: 1
-num_epochs: 4
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: true
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch:
-saves_per_epoch: 1
-weight_decay: 0.0
-special_tokens:
--- a/examples/dbrx/16bit-lora.yaml
+++ b/examples/dbrx/16bit-lora.yaml
@@ -4,6 +4,10 @@ base_model: LnL-AI/dbrx-base-converted-v2

 trust_remote_code: true

+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
 datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
@@ -44,20 +48,26 @@ optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: false  # don't use with fsdp_activation_checkpointing
 gradient_checkpointing_kwargs:
  use_reentrant: false
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
-
+debug:
 weight_decay: 0.0
 fsdp:
  - full_shard
--- a/examples/dbrx/8bit-lora.yaml
+++ b/examples/dbrx/8bit-lora.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true

 load_in_8bit: true
 load_in_4bit: false
+strict: false

 datasets:
  - path: tatsu-lab/alpaca
@@ -47,20 +48,26 @@ optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: false  # don't use with fsdp_activation_checkpointing
 gradient_checkpointing_kwargs:
  use_reentrant: false
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
-
+debug:
 weight_decay: 0.0
 fsdp:
  - full_shard
--- a/examples/dbrx/fft-ds-zero3.yaml
+++ b/examples/dbrx/fft-ds-zero3.yaml
@@ -4,6 +4,10 @@ base_model: LnL-AI/dbrx-base-converted-v2

 trust_remote_code: true

+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
 datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
@@ -31,19 +35,25 @@ optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
-
+debug:
 weight_decay: 0.0
 deepspeed: deepspeed_configs/zero3_bf16.json
--- a/examples/deepcoder/deepcoder-14B-preview-lora.yml
+++ b/examples/deepcoder/deepcoder-14B-preview-lora.yml
@@ -1,58 +0,0 @@
-base_model: agentica-org/DeepCoder-14B-Preview
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: true
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: fozziethebeat/alpaca_messages_2k_test
-    type: chat_template
-    field_messages: messages
-    message_property_mappings:
-      role: role
-      content: content
-
-dataset_prepared_path:
-val_set_size: 0.05
-output_dir: ./outputs/lora-out
-
-sequence_len: 4096
-sample_packing: true
-eval_sample_packing: false
-pad_to_sequence_len: true
-
-adapter: lora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 2
-micro_batch_size: 2
-num_epochs: 4
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: true
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_steps: 10
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
-special_tokens:
--- a/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
+++ b/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
@@ -1,58 +0,0 @@
-base_model: deepcogito/cogito-v1-preview-llama-3B
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: true
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: fozziethebeat/alpaca_messages_2k_test
-    type: chat_template
-    field_messages: messages
-    message_property_mappings:
-      role: role
-      content: content
-
-dataset_prepared_path:
-val_set_size: 0.05
-output_dir: ./outputs/lora-out
-
-sequence_len: 4096
-sample_packing: true
-eval_sample_packing: false
-pad_to_sequence_len: true
-
-adapter: lora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 2
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: true
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_steps: 10
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
-special_tokens:
--- a/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
+++ b/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
@@ -1,58 +0,0 @@
-base_model: deepcogito/cogito-v1-preview-qwen-14B
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: true
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: fozziethebeat/alpaca_messages_2k_test
-    type: chat_template
-    field_messages: messages
-    message_property_mappings:
-      role: role
-      content: content
-
-dataset_prepared_path:
-val_set_size: 0.05
-output_dir: ./outputs/lora-out
-
-sequence_len: 4096
-sample_packing: true
-eval_sample_packing: false
-pad_to_sequence_len: true
-
-adapter: lora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 2
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: true
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_steps: 10
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
-special_tokens:
--- a/examples/deepseek-v2/fft-fsdp-16b.yaml
+++ b/examples/deepseek-v2/fft-fsdp-16b.yaml
@@ -3,6 +3,10 @@ base_model: deepseek-ai/DeepSeek-V2-Lite
 # hub_model_id: username/custom_model_name
 trust_remote_code: true

+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
 datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
@@ -27,19 +31,27 @@ optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 2e-5

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
+early_stopping_patience:
 resume_from_checkpoint:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 2
+eval_table_size:
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
 special_tokens:
 fsdp:
--- a/examples/deepseek-v2/qlora-fsdp-2_5.yaml
+++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true

 load_in_8bit: false
 load_in_4bit: true
+strict: false


 plugins:
@@ -51,19 +52,27 @@ optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 2e-5

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
+early_stopping_patience:
 resume_from_checkpoint:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 2
+eval_table_size:
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
 special_tokens:
 fsdp:
--- a/examples/falcon/config-7b-lora.yml
+++ b/examples/falcon/config-7b-lora.yml
@@ -11,6 +11,7 @@ trust_remote_code: true
 load_in_8bit: true
 load_in_4bit: false
 gptq: false
+strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
@@ -24,7 +25,9 @@ max_packed_sequence_len:
 lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.0
+lora_target_modules:
 lora_target_linear: true
+lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
@@ -38,10 +41,15 @@ optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.00003
+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: true
 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
 xformers_attention: true
 flash_attention:
@@ -50,7 +58,11 @@ gptq_model_v1:
 warmup_steps: 40
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
  pad_token: "<|endoftext|>"
  bos_token: "<|endoftext|>"
--- a/examples/falcon/config-7b-qlora.yml
+++ b/examples/falcon/config-7b-qlora.yml
@@ -15,6 +15,7 @@ load_in_8bit: false
 # enable 4bit for QLoRA
 load_in_4bit: true
 gptq: false
+strict: false
 push_dataset_to_hub:
 datasets:
  - path: QingyiSi/Alpaca-CoT
@@ -37,7 +38,9 @@ lora_alpha: 16
 # 0.05 for 33B and 65B models
 lora_dropout: 0.05
 # add LoRA modules on all linear layers of the base model
+lora_target_modules:
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -64,7 +67,10 @@ lr_scheduler: cosine
 # - 2e-4 for 7b & 13b
 # - 1e-4 for 33b & 64b
 learning_rate: 0.0002
+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: true
 gradient_checkpointing: true
 # stop training after this many evaluation losses have increased in a row
@@ -72,6 +78,7 @@ gradient_checkpointing: true
 early_stopping_patience: 3
 resume_from_checkpoint:
 auto_resume_from_checkpoints: true
+local_rank:
 logging_steps: 1
 xformers_attention: true
 flash_attention:
@@ -80,7 +87,11 @@ gptq_model_v1:
 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.000001
+fsdp:
+fsdp_config:
 special_tokens:
  pad_token: "<|endoftext|>"
  bos_token: "<|endoftext|>"
--- a/examples/falcon/config-7b.yml
+++ b/examples/falcon/config-7b.yml
@@ -7,7 +7,11 @@ tokenizer_type: AutoTokenizer

 # required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
 trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: false
 gptq: false
+strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
@@ -21,7 +25,9 @@ max_packed_sequence_len:
 lora_r: 64
 lora_alpha: 32
 lora_dropout: 0.0
+lora_target_modules:
 lora_target_linear: true
+lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
@@ -35,10 +41,15 @@ optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.00003
+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: true
 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
 xformers_attention: true
 flash_attention:
@@ -47,7 +58,11 @@ gptq_model_v1:
 warmup_steps: 40
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
  pad_token: "<|endoftext|>"
  bos_token: "<|endoftext|>"
--- a/examples/gemma/qlora.yml
+++ b/examples/gemma/qlora.yml
@@ -8,6 +8,7 @@ tokenizer_type: AutoTokenizer

 load_in_8bit: false
 load_in_4bit: true
+strict: false

 # huggingface repo
 datasets:
@@ -41,16 +42,28 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
--- a/examples/gemma2/qlora.yml
+++ b/examples/gemma2/qlora.yml
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer

 load_in_8bit: false
 load_in_4bit: true
+strict: false

 # huggingface repo
 chat_template: gemma
@@ -47,16 +48,28 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: true

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
+eval_table_size:
+eval_max_new_tokens: 128
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
--- a/examples/gemma2/reward-model.yaml
+++ b/examples/gemma2/reward-model.yaml
@@ -6,6 +6,10 @@ tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name

+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
 reward_model: true
 chat_template: gemma
 datasets:
@@ -34,6 +38,8 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: true
 fp16:
 tf32: true
@@ -41,12 +47,21 @@ tf32: true
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
+eval_table_size:
+eval_max_new_tokens: 128
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
--- a/examples/gemma3/gemma-3-1b-qlora.yml
+++ b/examples/gemma3/gemma-3-1b-qlora.yml
@@ -1,66 +0,0 @@
-base_model: google/gemma-3-1b-it
-# optionally might have model_type or tokenizer_type
-model_type: AutoModelForCausalLM
-tokenizer_type: AutoTokenizer
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-# gemma3 doesn't seem to play nice with ddp
-ddp_find_unused_parameters: true
-
-load_in_8bit: false
-load_in_4bit: true
-
-# huggingface repo
-chat_template: gemma3
-datasets:
-  - path: cgato/SlimOrcaDedupCleaned
-    type: chat_template
-    field_messages: conversations
-    message_property_mappings:
-      role: from
-      content: value
-
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-adapter: qlora
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-
-sequence_len: 2048
-sample_packing: true
-eval_sample_packing: false
-pad_to_sequence_len: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-
-gradient_accumulation_steps: 4
-micro_batch_size: 1
-num_epochs: 4
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: true
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch:
-saves_per_epoch: 1
-weight_decay: 0.0
-special_tokens:
--- a/examples/gemma3/gemma-3-4b-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-qlora.yml
@@ -1,60 +0,0 @@
-base_model: google/gemma-3-4b-it
-
-load_in_4bit: true
-
-# gemma3 doesn't seem to play nice with ddp
-ddp_find_unused_parameters: true
-
-chat_template: gemma3
-datasets:
-  - path: cgato/SlimOrcaDedupCleaned
-    type: chat_template
-    field_messages: conversations
-    message_property_mappings:
-      role: from
-      content: value
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-output_dir: ./outputs/out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 2048
-sample_packing: true
-pad_to_sequence_len: true
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: true
-fp16:
-tf32: true
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-logging_steps: 1
-flash_attention: true
-eager_attention:
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
--- a/examples/gemma3/gemma-3-4b-vision-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-vision-qlora.yml
@@ -1,62 +0,0 @@
-base_model: google/gemma-3-4b-it
-processor_type: AutoProcessor
-
-load_in_4bit: true
-
-# these 3 lines are needed for now to handle vision chat templates w images
-skip_prepare_dataset: true
-remove_unused_columns: false
-sample_packing: false
-
-# gemma3 doesn't seem to play nice with ddp
-ddp_find_unused_parameters: true
-
-chat_template: gemma3
-datasets:
-  - path: HuggingFaceH4/llava-instruct-mix-vsft
-    type: chat_template
-    split: train[:1%]
-    field_messages: messages
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-output_dir: ./outputs/out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 2048
-pad_to_sequence_len: false
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: true
-fp16:
-tf32: true
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-logging_steps: 1
-flash_attention: true
-eager_attention:
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
--- a/examples/glm4/qlora-32b.yaml
+++ b/examples/glm4/qlora-32b.yaml
@@ -1,62 +0,0 @@
-base_model: THUDM/GLM-4-32B-0414
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_4bit: true
-
-datasets:
-  - path: teknium/GPT4-LLM-Cleaned
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0
-output_dir: ./outputs/qlora-out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 2048
-sample_packing: true
-eval_sample_packing: true
-pad_to_sequence_len: true
-
-lora_r: 16
-lora_alpha: 32
-lora_dropout: 0.05
-lora_target_modules:
-  - gate_proj
-  - down_proj
-  - up_proj
-  - q_proj
-  - v_proj
-  - k_proj
-  - o_proj
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 2
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-loss_watchdog_threshold: 5.0
-loss_watchdog_patience: 3
-
-warmup_steps: 10
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
-special_tokens:
--- a/examples/gptj/qlora.yml
+++ b/examples/gptj/qlora.yml
@@ -4,6 +4,7 @@ base_model: EleutherAI/gpt-j-6b

 load_in_8bit: false
 load_in_4bit: true
+strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
@@ -17,7 +18,9 @@ max_packed_sequence_len:
 lora_r: 8
 lora_alpha: 32
 lora_dropout: 0.05
+lora_target_modules:
 lora_target_linear: true
+lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
@@ -31,10 +34,15 @@ optimizer: paged_adamw_8bit
 torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0001
+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: true
 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
 xformers_attention: true
 flash_attention:
@@ -43,6 +51,10 @@ gptq_model_v1:
 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.1
+fsdp:
+fsdp_config:
 special_tokens:
  pad_token: "<|endoftext|>"
--- a/examples/jamba/qlora.yaml
+++ b/examples/jamba/qlora.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true

 load_in_8bit: false
 load_in_4bit: true
+strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
@@ -39,18 +40,26 @@ optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.00001

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
 special_tokens:
--- a/examples/jamba/qlora_deepspeed.yaml
+++ b/examples/jamba/qlora_deepspeed.yaml
@@ -5,6 +5,7 @@ trust_remote_code: true

 load_in_8bit: false
 load_in_4bit: true
+strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
@@ -38,20 +39,26 @@ optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.00001

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
-
+debug:
 deepspeed: deepspeed_configs/zero2.json
 weight_decay: 0.0
 special_tokens:
--- a/examples/jamba/qlora_fsdp_large.yaml
+++ b/examples/jamba/qlora_fsdp_large.yaml
@@ -5,6 +5,7 @@ tokenizer_type: AutoTokenizer
 # hub_model_id: username/custom_model_name

 load_in_4bit: true
+strict: false
 use_tensorboard: true
 chat_template: jamba
 datasets:
@@ -38,6 +39,8 @@ optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 0.00001

+train_on_inputs: false
+group_by_length: false
 bf16: true
 tf32: true

--- a/examples/jeopardy-bot/config.yml
+++ b/examples/jeopardy-bot/config.yml
@@ -33,9 +33,13 @@ optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.00003
+train_on_inputs: false
+group_by_length: false
 bf16: auto
 tf32: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 5
 xformers_attention: true
 flash_attention:
@@ -44,7 +48,11 @@ gptq_model_v1:
 warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.1
+fsdp:
+fsdp_config:
 tokens:
  bos_token: "<s>"
  eos_token: "</s>"
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -5,6 +5,10 @@ tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name

+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
 datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
@@ -22,6 +26,7 @@ lora_r:
 lora_alpha:
 lora_dropout:
 lora_target_linear:
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -36,12 +41,18 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
@@ -50,8 +61,11 @@ flash_attn_fuse_mlp: true

 warmup_steps: 100
 evals_per_epoch: 4
+eval_table_size:
 saves_per_epoch: 1
-
+debug:
 deepspeed: #deepspeed_configs/zero2.json # multi-gpu only
 weight_decay: 0.1
+fsdp:
+fsdp_config:
 special_tokens:
--- a/examples/llama-2/gptq-lora.yml
+++ b/examples/llama-2/gptq-lora.yml
@@ -10,6 +10,9 @@ gptq_disable_exllama: true

 tokenizer_use_fast: true
 tokenizer_legacy: true
+load_in_8bit: false
+load_in_4bit: false
+strict: false
 push_dataset_to_hub:
 hf_use_auth_token: true
 datasets:
@@ -30,6 +33,7 @@ lora_target_modules:
  - q_proj
  - v_proj
 lora_target_linear:
+lora_fan_in_fan_out:
 wandb_project:
 wandb_watch:
 wandb_name:
@@ -46,19 +50,26 @@ torchdistx_path:
 lr_scheduler: cosine
 lr_quadratic_warmup: true
 learning_rate: 0.000017
+train_on_inputs: false
+group_by_length: false
 bf16: false
 fp16: false
 float16: true
 tf32: true
 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention:
 sdp_attention:
 flash_optimum:
 warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.1
 special_tokens:
  bos_token: "<s>"
--- a/examples/llama-2/lisa.yml
+++ b/examples/llama-2/lisa.yml
@@ -5,6 +5,10 @@ tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name

+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
@@ -22,6 +26,7 @@ lora_r:
 lora_alpha:
 lora_dropout:
 lora_target_linear:
+lora_fan_in_fan_out:

 lisa_n_layers: 4
 lisa_step_interval: 20
@@ -40,12 +45,18 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 5e-5 # recommendation from lisa paper for 7b

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
@@ -54,8 +65,13 @@ flash_attn_fuse_mlp: true

 warmup_steps: 100
 evals_per_epoch: 4
+eval_table_size:
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.1
+fsdp:
+fsdp_config:
 special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
--- a/examples/llama-2/loftq.yml
+++ b/examples/llama-2/loftq.yml
@@ -5,6 +5,10 @@ tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name

+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
 datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
@@ -22,6 +26,7 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
+lora_fan_in_fan_out:
 peft:
  loftq_config:
    loftq_bits: 4
@@ -39,16 +44,29 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true
+s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer

 load_in_8bit: true
 load_in_4bit: false
+strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
@@ -25,6 +26,7 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -39,16 +41,29 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true
+s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
--- a/examples/llama-2/qlora-fsdp.yml
+++ b/examples/llama-2/qlora-fsdp.yml
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer

 load_in_8bit: false
 load_in_4bit: true
+strict: false

 datasets:
  - path: yahma/alpaca-cleaned
@@ -25,7 +26,9 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
+lora_target_modules:
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -40,19 +43,28 @@ optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 0.00001

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
+eval_table_size:
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
 fsdp:
  - full_shard
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer

 load_in_8bit: false
 load_in_4bit: true
+strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
@@ -25,7 +26,9 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
+lora_target_modules:
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -40,16 +43,27 @@ optimizer: paged_adamw_32bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
+eval_table_size:
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
--- a/examples/llama-2/relora.yml
+++ b/examples/llama-2/relora.yml
@@ -5,6 +5,7 @@ tokenizer_type: LlamaTokenizer

 load_in_8bit: false
 load_in_4bit: true
+strict: false

 datasets:
  - path: teknium/GPT4-LLM-Cleaned
@@ -23,7 +24,9 @@ pad_to_sequence_len: true
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
+lora_target_modules:
 lora_target_linear: true
+lora_fan_in_fan_out:

 relora_steps: 150
 relora_warmup_steps: 10
@@ -42,18 +45,28 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
--- a/examples/llama-3-vision/lora-11b.yaml
+++ b/examples/llama-3-vision/lora-11b.yaml
@@ -4,6 +4,7 @@ processor_type: AutoProcessor
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name

+strict: false

 # these 3 lines are needed for now to handle vision chat templates w images
 skip_prepare_dataset: true
@@ -44,11 +45,14 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: true
 fp16:
 tf32: true

 gradient_checkpointing: true
+local_rank:
 logging_steps: 1
 flash_attention: true
 eager_attention:
@@ -56,4 +60,8 @@ eager_attention:
 warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
--- a/examples/llama-3/fft-8b-liger-fsdp.yaml
+++ b/examples/llama-3/fft-8b-liger-fsdp.yaml
@@ -9,6 +9,7 @@ liger_rms_norm: true
 liger_glu_activation: true
 liger_fused_linear_cross_entropy: true

+strict: false

 chat_template: llama3
 datasets:
@@ -41,19 +42,27 @@ optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 2e-5

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
+early_stopping_patience:
 resume_from_checkpoint:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 2
+eval_table_size:
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
 fsdp:
  - full_shard
--- a/examples/llama-3/fft-8b.yaml
+++ b/examples/llama-3/fft-8b.yaml
@@ -2,6 +2,10 @@ base_model: NousResearch/Meta-Llama-3.1-8B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name

+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
 datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
@@ -26,19 +30,29 @@ optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 2e-5

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
+early_stopping_patience:
 resume_from_checkpoint:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 2
+eval_table_size:
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
  pad_token: <|end_of_text|>
--- a/examples/llama-3/instruct-dpo-lora-8b.yml
+++ b/examples/llama-3/instruct-dpo-lora-8b.yml
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer

 load_in_8bit: true
 load_in_4bit: false
+strict: false

 chat_template: llama3
 rl: dpo
@@ -41,6 +42,7 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -55,15 +57,28 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true
+s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
--- a/examples/llama-3/instruct-lora-8b.yml
+++ b/examples/llama-3/instruct-lora-8b.yml
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer

 load_in_8bit: true
 load_in_4bit: false
+strict: false

 chat_template: llama3
 datasets:
@@ -36,6 +37,7 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -50,17 +52,30 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true
+s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
   pad_token: <|end_of_text|>
--- a/examples/llama-3/lora-1b-deduplicate-dpo.yml
+++ b/examples/llama-3/lora-1b-deduplicate-dpo.yml
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer

 load_in_8bit: true
 load_in_4bit: false
+strict: false

 chat_template: llama3
 rl: dpo
@@ -57,6 +58,7 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -71,15 +73,28 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true
+s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
--- a/examples/llama-3/lora-1b-deduplicate-sft.yml
+++ b/examples/llama-3/lora-1b-deduplicate-sft.yml
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer

 load_in_8bit: true
 load_in_4bit: false
+strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
@@ -18,6 +19,7 @@ val_set_size: 0.0
 output_dir: ./outputs/lora-out

 dataset_exact_deduplication: true
+test_value: true

 sequence_len: 4096
 sample_packing: true
@@ -30,6 +32,7 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
+lora_fan_in_fan_out:
 lora_modules_to_save:
  - embed_tokens
  - lm_head
@@ -47,17 +50,30 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true
+s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
   pad_token: <|end_of_text|>
--- a/examples/llama-3/lora-1b-kernels.yml
+++ b/examples/llama-3/lora-1b-kernels.yml
@@ -2,6 +2,10 @@ base_model: NousResearch/Llama-3.2-1B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name

+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
@@ -20,6 +24,7 @@ lora_r: 16
 lora_alpha: 32
 # Currently, we don't support dropout with our custom Triton kernels
 # lora_dropout: 0.05
+lora_fan_in_fan_out:
 lora_target_modules:
  - gate_proj
  - down_proj
@@ -48,12 +53,18 @@ optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 loss_watchdog_threshold: 5.0
@@ -62,6 +73,10 @@ loss_watchdog_patience: 3
 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
  pad_token: "<|end_of_text|>"
--- a/examples/llama-3/lora-1b-ray.yml
+++ b/examples/llama-3/lora-1b-ray.yml
@@ -2,6 +2,10 @@ base_model: NousResearch/Llama-3.2-1B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name

+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
@@ -20,6 +24,7 @@ pad_to_sequence_len: true
 lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.05
+lora_fan_in_fan_out:
 lora_target_modules:
  - gate_proj
  - down_proj
@@ -42,12 +47,18 @@ optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 loss_watchdog_threshold: 5.0
@@ -56,9 +67,11 @@ loss_watchdog_patience: 3
 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-
+debug:
 deepspeed: deepspeed_configs/zero3.json
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
  pad_token: "<|end_of_text|>"

--- a/examples/llama-3/lora-1b-sample-packing-sequentially.yml
+++ b/examples/llama-3/lora-1b-sample-packing-sequentially.yml
@@ -1,65 +0,0 @@
-base_model: meta-llama/Llama-3.2-1B
-# optionally might have model_type or tokenizer_type
-model_type: LlamaForCausalLM
-tokenizer_type: AutoTokenizer
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: true
-load_in_4bit: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path:
-val_set_size: 0.0
-output_dir: ./outputs/lora-out
-
-test_value: true
-
-sequence_len: 4096
-sample_packing: true
-sample_packing_sequentially: true
-curriculum_sampling: true
-eval_sample_packing: false
-pad_to_sequence_len: true
-
-adapter: lora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_modules_to_save:
-  - embed_tokens
-  - lm_head
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 4
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_steps: 10
-evals_per_epoch: 4
-saves_per_epoch: 1
-weight_decay: 0.0
-special_tokens:
-  pad_token: <|end_of_text|>
--- a/examples/llama-3/lora-1b.yml
+++ b/examples/llama-3/lora-1b.yml
@@ -2,6 +2,10 @@ base_model: NousResearch/Llama-3.2-1B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name

+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
@@ -20,6 +24,7 @@ pad_to_sequence_len: true
 lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.05
+lora_fan_in_fan_out:
 lora_target_modules:
  - gate_proj
  - down_proj
@@ -42,12 +47,18 @@ optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 loss_watchdog_threshold: 5.0
@@ -56,6 +67,10 @@ loss_watchdog_patience: 3
 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
  pad_token: "<|end_of_text|>"
--- a/examples/llama-3/lora-8b.yml
+++ b/examples/llama-3/lora-8b.yml
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer

 load_in_8bit: true
 load_in_4bit: false
+strict: false

 datasets:
  - path: mhenrichsen/alpaca_2k_test
@@ -26,6 +27,7 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
+lora_fan_in_fan_out:
 lora_modules_to_save:
  - embed_tokens
  - lm_head
@@ -43,17 +45,30 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true
+s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
   pad_token: <|end_of_text|>
--- a/examples/llama-3/qlora-1b-kto.yaml
+++ b/examples/llama-3/qlora-1b-kto.yaml
@@ -4,6 +4,7 @@ base_model: meta-llama/Llama-3.2-1B

 load_in_8bit: false
 load_in_4bit: true
+strict: false

 rl: kto
 rl_beta: 0.5
@@ -31,6 +32,7 @@ lora_r: 32
 lora_alpha: 64
 lora_dropout: 0.05
 lora_target_linear: true
+lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -45,19 +47,31 @@ optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: true

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
-  use_reentrant: false
+  use_reentrant: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 warmup_steps: 20
 evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
  pad_token: "<|end_of_text|>"
--- a/examples/llama-3/qlora-1b.yml
+++ b/examples/llama-3/qlora-1b.yml
@@ -4,6 +4,7 @@ base_model: NousResearch/Llama-3.2-1B

 load_in_8bit: false
 load_in_4bit: true
+strict: false

 datasets:
  - path: teknium/GPT4-LLM-Cleaned
@@ -23,6 +24,7 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
+lora_fan_in_fan_out:
 lora_target_modules:
  - gate_proj
  - down_proj
@@ -45,12 +47,18 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

+train_on_inputs: false
+group_by_length: false
 bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
+early_stopping_patience:
 resume_from_checkpoint:
+local_rank:
 logging_steps: 1
+xformers_attention:
 flash_attention: true

 loss_watchdog_threshold: 5.0
@@ -58,7 +66,13 @@ loss_watchdog_patience: 3

 warmup_steps: 10
 evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
 saves_per_epoch: 1
+debug:
+deepspeed:
 weight_decay: 0.0
+fsdp:
+fsdp_config:
 special_tokens:
  pad_token: "<|end_of_text|>"
--- a/Show More
+++ b/Show More