Merge branch 'main' into map-dataset-fetcher-fix

handle possibly empty batch
2025-06-26 11:20:05 -04:00 · 2025-06-26 10:59:27 -04:00
353 changed files with 1774 additions and 16758 deletions
--- a/.bandit
+++ b/.bandit
@@ -1,3 +1,3 @@
 [bandit]
 exclude = tests
-skips = B101,B615
+skips = B101
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -5,13 +5,11 @@ on:
    branches:
      - "main"
    paths:
-      - 'docker/Dockerfile-base'
-      - 'docker/Dockerfile-uv-base'
+      - 'Dockerfile-base'
      - '.github/workflows/base.yml'
  pull_request:
    paths:
-      - 'docker/Dockerfile-base'
-      - 'docker/Dockerfile-uv-base'
+      - 'Dockerfile-base'
      - '.github/workflows/base.yml'
  workflow_dispatch:

@@ -29,11 +27,11 @@ jobs:
            cuda_version: 12.4.1
            cudnn_version: ""
            python_version: "3.11"
-            pytorch: 2.6.0
+            pytorch: 2.5.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-base"
-          - cuda: "126"
-            cuda_version: 12.6.3
+          - cuda: "124"
+            cuda_version: 12.4.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.6.0
@@ -43,7 +41,7 @@ jobs:
            cuda_version: 12.6.3
            cudnn_version: ""
            python_version: "3.11"
-            pytorch: 2.7.0
+            pytorch: 2.6.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-base"
          - cuda: "126"
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -15,16 +15,17 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 126
-            cuda_version: 12.6.3
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.5.1
+            axolotl_extras:
+          - cuda: 124
+            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.6.0
-            axolotl_extras:
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.0
            axolotl_extras: vllm
+            is_latest: true
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
@@ -82,17 +83,17 @@ jobs:
    strategy:
      matrix:
        include:
-          - cuda: 126
-            cuda_version: 12.6.3
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.5.1
+            axolotl_extras:
+          - cuda: 124
+            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.6.0
            axolotl_extras:
            is_latest: true
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.0
-            axolotl_extras:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
@@ -145,8 +146,8 @@ jobs:
    strategy:
      matrix:
        include:
-          - cuda: 126
-            cuda_version: 12.6.3
+          - cuda: 124
+            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.6.0
            axolotl_extras:
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -26,18 +26,18 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 126
-            cuda_version: 12.6.3
+          - cuda: 124
+            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.6.0
-            axolotl_extras:
+            axolotl_extras: vllm
            num_gpus: 2
            nightly_build: "true"
-          - cuda: 126
-            cuda_version: 12.6.3
+          - cuda: 124
+            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.7.0
-            axolotl_extras: vllm
+            pytorch: 2.5.1
+            axolotl_extras:
            num_gpus: 2
            nightly_build: "true"
          - cuda: 126
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -12,16 +12,16 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 126
-            cuda_version: 12.6.3
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.5.1
+            axolotl_extras:
+          - cuda: 124
+            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.6.0
            axolotl_extras:
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
@@ -65,16 +65,16 @@ jobs:
    strategy:
      matrix:
        include:
-          - cuda: 126
-            cuda_version: 12.6.3
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.5.1
+            axolotl_extras:
+          - cuda: 124
+            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.6.0
            axolotl_extras:
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.7.1
-            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
--- a/.github/workflows/preview-docs.yml
+++ b/.github/workflows/preview-docs.yml
@@ -28,8 +28,6 @@ jobs:
    steps:
      - name: Check out repository
        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event.pull_request.head.sha }}

      - name: Set up Quarto
        uses: quarto-dev/quarto-actions/setup@v2
@@ -52,11 +50,10 @@ jobs:

      - name: Netlify Publish
        uses: nwtgck/actions-netlify@v3.0
-        id: netlify
        with:
          publish-dir: './_site'
-          enable-pull-request-comment: false
-          enable-github-deployment: false
+          enable-pull-request-comment: true
+          enable-github-deployment: true
          github-token: ${{ secrets.GITHUB_TOKEN }}
          deploy-message: "Deployed On Netlify"
          github-deployment-environment: 'preview'
@@ -64,13 +61,3 @@ jobs:
        env:
          NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
          NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}
-
-      - name: Update PR with preview link
-        if: ${{ steps.netlify.outcome == 'success' }}
-        uses: marocchino/sticky-pull-request-comment@v2
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          message: |
-            📖 **Documentation Preview**: ${{ steps.netlify.outputs.deploy-url }}
-
-            Deployed on Netlify from commit ${{ github.event.pull_request.head.sha }}
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -18,26 +18,116 @@ jobs:
        env:
          SKIP: no-commit-to-branch

+  preload-cache:
+    name: Preload HF cache
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python_version: ["3.11"]
+        pytorch_version: ["2.6.0"]
+    timeout-minutes: 20
+
+    env:
+      AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
+
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-v2
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python_version }}
+          cache: 'pip' # caching pip dependencies
+
+      - name: upgrade pip
+        run: |
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
+
+      - name: Install PyTorch
+        run: |
+          pip3 install torch==${{ matrix.pytorch_version }}
+
+      - name: Install dependencies
+        run: |
+          pip3 show torch
+          pip3 install --no-build-isolation -U -e .
+          python scripts/unsloth_install.py | sh
+          python scripts/cutcrossentropy_install.py | sh
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+
+      - name: Make sure PyTorch version wasn't clobbered
+        run: |
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
+
+      - name: Ensure axolotl CLI was installed
+        run: |
+          axolotl --help
+
+      - name: Pre-Download dataset fixture
+        run: |
+          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
+
+      - name: Run tests
+        run: |
+          pytest -v tests/conftest.py
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          files: ./coverage.xml
+          flags: unittests,pytorch-${{ matrix.pytorch_version }}
+          fail_ci_if_error: false
+
+      - name: cleanup pip cache
+        run: |
+          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
+
+      - name: Save HF cache
+        id: hf-cache
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
+
  pytest:
    name: PyTest
    runs-on: ubuntu-latest
+    needs: [preload-cache]
    strategy:
      fail-fast: false
      max-parallel: 2
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.6.0", "2.7.0"]
+        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
    timeout-minutes: 20

    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

-      - name: Restore Cache from S3
-        id: hf-cache-restore-s3
-        run: |
-          mkdir -p /home/runner/.cache/huggingface/hub
-          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/  --use-compress-program unzstd
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-v2

      - name: Setup Python
        uses: actions/setup-python@v5
@@ -78,11 +168,15 @@ jobs:
        run: |
          axolotl --help

+      - name: Pre-Download dataset fixture
+        run: |
+          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
+
      - name: Run tests
        run: |
-          pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
-          pytest -v --durations=10 tests/patched/
-          pytest -v --durations=10 tests/cli/
+          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
+          pytest -v tests/patched/
+          pytest -v tests/cli/

      - name: cleanup pip cache
        run: |
@@ -92,15 +186,22 @@ jobs:
    if: github.repository_owner == 'axolotl-ai-cloud'
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
-    timeout-minutes: 120
+    timeout-minutes: 60
    needs: [pre-commit, pytest]

    strategy:
      fail-fast: false
      matrix:
        include:
-          - cuda: 126
-            cuda_version: 12.6.3
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.5.1
+            num_gpus: 1
+            axolotl_extras:
+            nightly_build: "true"
+          - cuda: 124
+            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.6.0
            num_gpus: 1
@@ -116,7 +217,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==0.71.8 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -52,7 +52,7 @@ jobs:
      fail-fast: false
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.6.0", "2.7.0", "2.7.1"]
+        pytorch_version: ["2.5.1", "2.6.0", "2.7.1"]
    timeout-minutes: 20

    steps:
@@ -102,9 +102,9 @@ jobs:

      - name: Run tests
        run: |
-          pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ --cov=axolotl --cov-report=xml
-          pytest -v --durations=10 tests/patched/ --cov=axolotl --cov-append --cov-report=xml
-          pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml
+          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ --cov=axolotl --cov-report=xml
+          pytest -v tests/patched/ --cov=axolotl --cov-append --cov-report=xml
+          pytest -v tests/cli/ --cov=axolotl --cov-append --cov-report=xml

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v5
@@ -125,7 +125,7 @@ jobs:
      fail-fast: false
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.6.0", "2.7.0", "2.7.1"]
+        pytorch_version: ["2.5.1", "2.6.0", "2.7.1"]
    timeout-minutes: 20

    steps:
@@ -175,9 +175,9 @@ jobs:

      - name: Run tests
        run: |
-          pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
-          pytest -v --durations=10 tests/patched/
-          pytest -v --durations=10 tests/cli/
+          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
+          pytest -v tests/patched/
+          pytest -v tests/cli/

      - name: cleanup pip cache
        run: |
@@ -195,12 +195,12 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 126
-            cuda_version: 12.6.3
+          - cuda: 124
+            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.6.0
            num_gpus: 1
-            axolotl_extras:
+            axolotl_extras: vllm
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
@@ -247,10 +247,22 @@ jobs:
      fail-fast: false
      matrix:
        include:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.6.0
+            num_gpus: 1
+            axolotl_extras: llmcompressor
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.5.1
+            num_gpus: 1
+            axolotl_extras:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.6.0
+            pytorch: 2.7.1
            num_gpus: 1
            axolotl_extras:
          - cuda: 128
@@ -299,7 +311,7 @@ jobs:
            python_version: "3.11"
            pytorch: 2.6.0
            num_gpus: 1
-            axolotl_extras:
+            axolotl_extras: vllm
    steps:
      - name: Checkout
        uses: actions/checkout@v4
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -36,7 +36,7 @@ repos:
            'pydantic>=2.5.3',
        ]
 -   repo: https://github.com/PyCQA/bandit
-    rev: 1.8.6
+    rev: 1.8.5
    hooks:
    -   id: bandit
        args: [
--- a/.runpod/src/config/config.yaml
+++ b/.runpod/src/config/config.yaml
@@ -97,7 +97,7 @@
 #       # 'no_input_format' cannot include {input}
 #       no_input_format: "{instruction} "

-#       # For `completion` datasets only, uses the provided field instead of `text` column
+#       # For `completion` datasets only, uses the provided field instead of `text` column
 #       field:

 # # Axolotl attempts to save the dataset as an arrow after packing the data together so
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,5 +2,4 @@ include requirements.txt
 include README.md
 include LICENSE
 include src/setuptools_axolotl_dynamic_dependencies.py
-include src/axolotl/utils/chat_templates/templates/*.jinja
 recursive-include axolotl *.py
--- a/README.md
+++ b/README.md
@@ -55,12 +55,10 @@ Features:

 - NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
 - Python 3.11
- PyTorch ≥2.6.0
+- PyTorch ≥2.5.1

 ### Installation

-#### Using pip
-
 ```bash
 pip3 install -U packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
@@ -70,13 +68,6 @@ axolotl fetch examples
 axolotl fetch deepspeed_configs  # OPTIONAL
 ```

-#### Using Docker
-
-Installing with Docker can be less error prone than installing in your own environment.
-```bash
-docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
-```
-
 Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).

 ### Your First Fine-tune
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -276,7 +276,6 @@ website:
            - docs/torchao.qmd
            - docs/custom_integrations.qmd
            - docs/sequence_parallelism.qmd
-            - docs/gradient_checkpointing.qmd

        - section: "Troubleshooting"
          contents:
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -9,7 +9,6 @@ ENV GITHUB_REF="{{ GITHUB_REF }}"
 ENV GITHUB_SHA="{{ GITHUB_SHA }}"
 ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
 ENV HF_HOME="{{ HF_HOME }}"
-ENV AXOLOTL_DATASET_PROCESSES="8"

 RUN apt-get update && \
    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -24,9 +24,9 @@ df_template = template_env.get_template("Dockerfile.jinja")
 df_args = {
    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"),
-    "CUDA": os.environ.get("CUDA", "126"),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.5.1"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu124-2.5.1"),
+    "CUDA": os.environ.get("CUDA", "124"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
--- a/cicd/single_gpu.py
+++ b/cicd/single_gpu.py
@@ -24,16 +24,14 @@ df_template = template_env.get_template(dockerfile)
 df_args = {
    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"),
-    "CUDA": os.environ.get("CUDA", "126"),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.5.1"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu124-2.5.1"),
+    "CUDA": os.environ.get("CUDA", "124"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
    "HF_HOME": "/workspace/data/huggingface-cache/hub",
-    "PYTHONUNBUFFERED": os.environ.get("PYTHONUNBUFFERED", "1"),
-    "DEEPSPEED_LOG_LEVEL": os.environ.get("DEEPSPEED_LOG_LEVEL", "WARNING"),
 }

 dockerfile_contents = df_template.render(**df_args)
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -38,6 +38,6 @@ RUN git lfs install --skip-repo && \
    # The base image ships with `pydantic==1.8.2` which is not working
    pip3 install -U --no-cache-dir pydantic==1.10.10

-RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \
-        FLASH_ATTENTION_FORCE_BUILD="TRUE" pip3 install --no-build-isolation flash-attn==2.8.0.post2; \
+RUN if [ "$PYTORCH_VERSION" = "2.7.1" ] ; then \
+        pip3 install flash-attn==2.7.4.post1; \
    fi
--- a/docker/Dockerfile-uv-base
+++ b/docker/Dockerfile-uv-base
@@ -34,3 +34,7 @@ RUN uv pip install packaging setuptools wheel psutil \
    && uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \
    && uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \
    && uv pip install awscli pydantic
+
+RUN if [ "$PYTORCH_VERSION" = "2.7.1" ] ; then \
+        uv pip install --no-build-isolation flash-attn==2.7.4.post1; \
+    fi
--- a/docs/custom_integrations.qmd
+++ b/docs/custom_integrations.qmd
@@ -7,7 +7,6 @@ toc-depth: 3
 ```{python}
 #| echo: false

-import os
 import re

 def process_readme(integration_name):
@@ -54,24 +53,6 @@ sections = [
    ("LLMCompressor", "llm_compressor")
 ]

-for folder_name in os.listdir("../src/axolotl/integrations/"):
-    if folder_name in [path for name, path in sections]:
-        # skip if already in sections
-        continue
-    if os.path.exists(f"../src/axolotl/integrations/{folder_name}/README.md"):
-        # grab the first heading in README.md as the section name
-        with open(f"../src/axolotl/integrations/{folder_name}/README.md", "r") as f:
-            txt = f.read()
-            matches = re.search(r'^# (.*)\n?', txt, flags=re.MULTILINE)
-            if matches:
-                name = matches.group(1)
-            else:
-                continue
-            sections.append((name, folder_name))
-
-# sort sections by name
-sections = sorted(sections, key=lambda x: x[0])
-
 for section_name, folder_name in sections:
    print(print_section(section_name, folder_name))
 ```
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -187,7 +187,6 @@ Instead of passing `tools` via the system prompt, an alternative method would be
            "role": "assistant", // call the function via assistant
            "tool_calls": [
                {
-                    "id": "...",  // required only for mistral
                    "type": "function",
                    "function": {
                        "name": "...",
@@ -200,7 +199,6 @@ Instead of passing `tools` via the system prompt, an alternative method would be
        },
        {
            "role": "tool",
-            "tool_call_id": "...",  // required only for mistral
            "name": "...",
            "content": "..."
        },
--- a/docs/docker.qmd
+++ b/docs/docker.qmd
@@ -9,7 +9,7 @@ format:
 This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).

 ::: {.callout-important}
-For Blackwell GPUs, please use the tags with PyTorch 2.7.1 and CUDA 12.8.
+For Blackwell GPUs, please use the tags with PyTorch 2.7.1 and CUDA 12.8.
 :::

 ## Base
@@ -34,9 +34,8 @@ Tags examples:

 - `main-base-py3.11-cu128-2.7.1`
 - `main-base-py3.11-cu126-2.7.1`
- `main-base-py3.11-cu126-2.7.0`
- `main-base-py3.11-cu126-2.6.0`
 - `main-base-py3.11-cu124-2.6.0`
+- `main-base-py3.11-cu124-2.5.1`

 ## Main

@@ -74,15 +73,13 @@ There may be some extra tags appended to the image, like `-vllm` which installs

 Tags examples:

- `main-py3.11-cu128-2.7.1`
- `main-py3.11-cu126-2.7.1`
 - `main-py3.11-cu126-2.7.0`
- `main-py3.11-cu126-2.6.0`
 - `main-py3.11-cu124-2.6.0`
+- `main-py3.11-cu124-2.5.1`
 - `main-latest`
 - `main-20250303-py3.11-cu124-2.6.0`
- `main-20250303-py3.11-cu126-2.6.0`
- `0.10.1`
+- `main-20250303-py3.11-cu124-2.5.1`
+- `0.9.2`

 ## Cloud

--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -51,18 +51,6 @@ description: Frequently asked questions
 >   pad_token: "..."
 > ```

-**Q: `IterableDataset error` or `KeyError: 'input_ids'` when using `preprocess` CLI**
-
-> A: This is because you may be using `preprocess` CLI with `pretraining_dataset:` or `skip_prepare_dataset: true` respectively. Please use `axolotl train` CLI directly instead as these datasets are prepared on demand.
-
-**Q: vLLM is not working with Axolotl**
-
-> A: We currently recommend torch 2.6.0 for use with `vllm`. Please ensure you use the right version. For Docker, please use the `main-py3.11-cu124-2.6.0` tag.
-
-**Q: FA2 2.8.0 `undefined symbol` runtime error on CUDA 12.4**
-
-> A: There seems to be a wheel issue with FA2 2.8.0 on CUDA 12.4. Try CUDA 12.6 instead or downgrade to FA2 2.7.4. Please refer to the upstream issue: https://github.com/Dao-AILab/flash-attention/issues/1717.
-
 ### Chat templates

 **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
--- a/docs/fsdp_qlora.qmd
+++ b/docs/fsdp_qlora.qmd
@@ -20,7 +20,7 @@ To enable `QLoRA` with `FSDP`, you need to perform the following steps:
 > See the [example config](#example-config) file in addition to reading these instructions.

 1. Set `adapter: qlora` in your axolotl config file.
-2. Enable FSDP in your axolotl config, as [described here](multi-gpu.qmd#sec-fsdp).
+2. Enable FSDP in your axolotl config, as [described here](https://github.com/axolotl-ai-cloud/axolotl?tab=readme-ov-file#fsdp).
 3. Use one of the supported model types: `llama`, `mistral` or `mixtral`.

 ## Example Config
--- a/docs/gradient_checkpointing.qmd
+++ b/docs/gradient_checkpointing.qmd
@@ -1,29 +0,0 @@
---
-title: Gradient Checkpointing and Activation Offloading
---
-
-Gradient checkpointing and activation offloading are techniques used to optimize the performance of deep learning
-models by reducing the memory footprint and improving computational efficiency.
-
-### Enabling Gradient Checkpointing
-
-```yaml
-gradient_checkpointing: true
-```
-
-### Enabling Activation Offloading
-
-```yaml
-gradient_checkpointing: true  # required for activation offloading
-activation_offloading: true
-```
-
-Activation offloading variants:
-
-The default `activation_offloading: true` offloads activations to CPU and uses CUDA streams
-to overlap the communications and computations when offloading.
-
-The `activation_offloading: legacy` naively offloads activations to CPU and without additional optimizations.
-
-For resource constrained environments with limited CPU memory, `activation_offloading: disk` offloads
-activations to disk instead of CPU RAM so that much larger context lengths can be trained with minimal memory.
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -15,7 +15,7 @@ This guide covers all the ways you can install and set up Axolotl for your envir

 - NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
 - Python ≥3.11
- PyTorch ≥2.6.0
+- PyTorch ≥2.5.1

 ## Installation Methods {#sec-installation-methods}

--- a/docs/multi-gpu.qmd
+++ b/docs/multi-gpu.qmd
@@ -23,6 +23,8 @@ Axolotl supports several methods for multi-GPU training:

 ## DeepSpeed {#sec-deepspeed}

+DeepSpeed is the recommended approach for multi-GPU training due to its stability and performance. It provides various optimization levels through ZeRO stages.
+
 ### Configuration {#sec-deepspeed-config}

 Add to your YAML config:
@@ -30,6 +32,7 @@ Add to your YAML config:
 ```{.yaml}
 deepspeed: deepspeed_configs/zero1.json
 ```
+
 ### Usage {#sec-deepspeed-usage}

 ```{.bash}
@@ -63,75 +66,9 @@ Start from Stage 1 -> Stage 2 -> Stage 3.

 :::

-::: {.callout-tip}
+## FSDP {#sec-fsdp}

-Using ZeRO Stage 3 with Single-GPU training
-
-ZeRO Stage 3 can be used for training on a single GPU by manually setting the environment variables:
-`WORLD_SIZE=1 LOCAL_RANK=0 MASTER_ADDR=0.0.0.0 MASTER_PORT=29500`
-
-:::
-
-## Fully Sharded Data Parallel (FSDP) {#sec-fsdp}
-
-::: {.callout-note}
-
-FSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl.
-
-:::
-
-### Migrating from FSDP1 to FSDP2 {#sec-migrate-fsdp1-fsdp2}
-
-To migrate your config from FSDP1 to FSDP2, you must use the `fsdp_version` top-level config field to specify the FSDP version, and
-also follow the config field mapping below to update field names.
-
-#### Config mapping
-
-FSDP1 | FSDP2
-------- | --------
-fsdp_sharding_strategy | reshard_after_forward
-fsdp_backward_prefetch_policy | **REMOVED**
-fsdp_backward_prefetch | **REMOVED**
-fsdp_forward_prefetch | **REMOVED**
-fsdp_sync_module_states | **REMOVED**
-fsdp_cpu_ram_efficient_loading | cpu_ram_efficient_loading
-fsdp_state_dict_type | state_dict_type
-fsdp_use_orig_params | **REMOVED**
-
-
-For example, if you were using the following FSDP1 config:
-
-```{.yaml}
-fsdp_version: 1
-fsdp_config:
-  fsdp_offload_params: false
-  fsdp_cpu_ram_efficient_loading: true
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer
-  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_sharding_strategy: FULL_SHARD
-```
-
-You can migrate to the following FSDP2 config:
-
-```{.yaml}
-fsdp_version: 2
-fsdp_config:
-  offload_params: false
-  cpu_ram_efficient_loading: true
-  auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  transformer_layer_cls_to_wrap: Qwen3DecoderLayer
-  state_dict_type: FULL_STATE_DICT
-  reshard_after_forward: true
-```
-
-### FSDP1 (deprecated) {#sec-fsdp-config}
-
-::: {.callout-note}
-
-Using `fsdp` to configure FSDP is deprecated and will be removed in an upcoming release of Axolotl. Please use `fsdp_config` as above instead.
-
-:::
+### Basic FSDP Configuration {#sec-fsdp-config}

 ```{.yaml}
 fsdp:
@@ -143,7 +80,6 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
 ```

-
 ## Sequence parallelism {#sec-sequence-parallelism}

 We support sequence parallelism (SP) via the
--- a/docs/multi-node.qmd
+++ b/docs/multi-node.qmd
@@ -40,13 +40,13 @@ use_cpu: false

 Configure your model to use FSDP in the Axolotl yaml. For example:
 ```yaml
-fsdp_version: 2
+fsdp:
+  - full_shard
+  - auto_wrap
 fsdp_config:
-  offload_params: true
-  state_dict_type: FULL_STATE_DICT
-  auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  transformer_layer_cls_to_wrap: LlamaDecoderLayer
-  reshard_after_forward: true
+  fsdp_offload_params: true
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
 ```

 All you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine.
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -17,6 +17,7 @@ feedback. Various methods include, but not limited to:
 - [Kahneman-Tversky Optimization (KTO)](#kto)
 - [Odds Ratio Preference Optimization (ORPO)](#orpo)
 - [Group Relative Policy Optimization (GRPO)](#grpo)
+- Proximal Policy Optimization (PPO) (not yet supported in Axolotl; if you're interested in contributing, please reach out!)


 ## RLHF using Axolotl
@@ -274,14 +275,15 @@ rl: dpo
 datasets:
  - path: ...
    split: train
-    type:
-      field_prompt: "prompt"
-      field_system: "system"
-      field_chosen: "chosen"
-      field_rejected: "rejected"
-      prompt_format: "{prompt}"
-      chosen_format: "{chosen}"
-      rejected_format: "{rejected}"
+    type: user_defined.default
+
+    field_prompt: "prompt"
+    field_system: "system"
+    field_chosen: "chosen"
+    field_rejected: "rejected"
+    prompt_format: "{prompt}"
+    chosen_format: "{chosen}"
+    rejected_format: "{rejected}"
 ```

 The input format is a simple JSON input with customizable fields based on the above config.
@@ -474,13 +476,14 @@ rl: kto
 datasets:
  - path: ...
    split: train
-    type:
-      field_prompt: "prompt"
-      field_system: "system"
-      field_completion: "completion"
-      field_label: "label"
-      prompt_format: "{prompt}"
-      completion_format: "{completion}"
+    type: user_defined.default
+
+    field_prompt: "prompt"
+    field_system: "system"
+    field_completion: "completion"
+    field_label: "label"
+    prompt_format: "{prompt}"
+    completion_format: "{completion}"
 ```

 The input format is a simple JSON input with customizable fields based on the above config.
--- a/examples/archived/README.md
+++ b/examples/archived/README.md
@@ -1,5 +0,0 @@
-# Archived Examples
-
-This directory contains examples that are no longer maintained and may no longer be functional.
-
-We keep them around for archival purposes in case they are useful to others.
--- a/examples/archived/cerebras/btlm-ft.yml
+++ b/examples/archived/cerebras/btlm-ft.yml
--- a/examples/archived/cerebras/qlora.yml
+++ b/examples/archived/cerebras/qlora.yml
--- a/examples/cloud/modal.yaml
+++ b/examples/cloud/modal.yaml
@@ -26,5 +26,3 @@ timeout: 86400
 # Preprocess specific configurations
 memory_preprocess: 32
 timeout_preprocess: 14400
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/archived/code-llama/13b/lora.yml
+++ b/examples/archived/code-llama/13b/lora.yml
--- a/examples/archived/code-llama/13b/qlora.yml
+++ b/examples/archived/code-llama/13b/qlora.yml
--- a/examples/archived/code-llama/34b/lora.yml
+++ b/examples/archived/code-llama/34b/lora.yml
--- a/examples/archived/code-llama/34b/qlora.yml
+++ b/examples/archived/code-llama/34b/qlora.yml
--- a/examples/archived/code-llama/7b/lora.yml
+++ b/examples/archived/code-llama/7b/lora.yml
--- a/examples/archived/code-llama/7b/qlora.yml
+++ b/examples/archived/code-llama/7b/qlora.yml
--- a/examples/archived/code-llama/README.md
+++ b/examples/archived/code-llama/README.md
--- a/examples/cohere/command-r-7b-qlora.yml
+++ b/examples/cohere/command-r-7b-qlora.yml
@@ -35,6 +35,7 @@ wandb_watch:
 wandb_name:
 wandb_log_model:

+
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 num_epochs: 4
@@ -55,5 +56,3 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
--- a/examples/archived/dbrx/16bit-lora.yaml
+++ b/examples/archived/dbrx/16bit-lora.yaml
--- a/examples/archived/dbrx/8bit-lora.yaml
+++ b/examples/archived/dbrx/8bit-lora.yaml
--- a/examples/archived/dbrx/README.md
+++ b/examples/archived/dbrx/README.md
--- a/examples/archived/dbrx/fft-ds-zero3.yaml
+++ b/examples/archived/dbrx/fft-ds-zero3.yaml
--- a/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml
+++ b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml
--- a/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
+++ b/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
@@ -56,5 +56,3 @@ evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
+++ b/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
@@ -56,5 +56,3 @@ evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/deepseek-v2/fft-fsdp-16b.yaml
+++ b/examples/deepseek-v2/fft-fsdp-16b.yaml
@@ -55,5 +55,3 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/deepseek-v2/qlora-fsdp-2_5.yaml
+++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
@@ -79,5 +79,3 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/devstral/README.md
+++ b/examples/devstral/README.md
@@ -1,70 +0,0 @@
-# Finetune Devstral with Axolotl
-
-Devstral Small is a 24B parameter open-source model from MistralAI, available on HuggingFace as [Devstral-Small-2505](https://huggingface.co/mistralai/Devstral-Small-2505) and [Devstral-Small-2507](https://huggingface.co/mistralai/Devstral-Small-2507). `Devstral-Small-2507` is the latest version of the model and has [function calling](https://mistralai.github.io/mistral-common/usage/tools/) support.
-
-This guide shows how to fine-tune it with Axolotl with multi-turn conversations with proper masking.
-
-The model was fine-tuned on top of [Mistral-Small-3.1](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503) without the vision layer and has a context length of up to 128k tokens.
-
-Thanks to the team at MistralAI for giving us early access to prepare for this release.
-
-## Getting started
-
-1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). Devstral support is currently only available on nightly, so you need to either install from main or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
-
-    Here is an example of how to install from main for pip:
-
-```bash
-# Ensure you have PyTorch installed (PyTorch 2.6.0+)
-git clone https://github.com/axolotl-ai-cloud/axolotl.git
-cd axolotl
-
-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn]'
-```
-
-2. Run the finetuning example:
-
-```bash
-axolotl train examples/devstral/devstral-small-qlora.yml
-```
-
-This config uses about 21GB VRAM.
-
-Let us know how it goes. Happy finetuning! 🚀
-
-### TIPS
-
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
- Learn how to use function calling with Axolotl at [docs](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#using-tool-use).
-
-## Optimization Guides
-
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
- [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy)
- [Liger Kernel](https://docs.axolotl.ai/docs/custom_integrations.html#liger-kernels)
-
-## Limitations
-
-We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only.
-
-In addition, we do not support overriding tokens yet.
-
-## Related Resources
-
- [MistralAI Devstral Blog](https://mistral.ai/news/devstral)
- [MistralAI Devstral 1.1 Blog](https://mistral.ai/news/devstral-2507)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
-
-
-## Future Work
-
- Add parity to Preference Tuning, RL, Multi-modal, etc.
- Add parity to other tokenizer configs like overriding tokens.
--- a/examples/devstral/devstral-small-qlora.yml
+++ b/examples/devstral/devstral-small-qlora.yml
@@ -1,66 +0,0 @@
-base_model: mistralai/Devstral-Small-2507
-
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-# Enable to use mistral-common tokenizer
-tokenizer_use_mistral_common: true
-
-load_in_8bit: false
-load_in_4bit: true
-
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
-datasets:
-  - path: fozziethebeat/alpaca_messages_2k_test
-    type: chat_template
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.1
-output_dir: ./outputs/qlora-out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 2048
-sample_packing: true
-pad_to_sequence_len: true
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0
-lora_target_linear: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_torch
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-loss_watchdog_threshold: 5.0
-loss_watchdog_patience: 3
-
-warmup_ratio: 0.05
-evals_per_epoch: 4
-saves_per_epoch: 1
-
-weight_decay: 0.0
-special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml
+++ b/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml
@@ -69,5 +69,3 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/falcon-h1/falcon-h1-1b-qlora.yaml
+++ b/examples/falcon-h1/falcon-h1-1b-qlora.yaml
@@ -46,6 +46,7 @@ wandb_watch:
 wandb_name:
 wandb_log_model:

+
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 num_epochs: 4
@@ -68,5 +69,3 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/falcon-h1/falcon-h1-34b-qlora.yaml
+++ b/examples/falcon-h1/falcon-h1-34b-qlora.yaml
@@ -69,5 +69,3 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/falcon-h1/falcon-h1-3b-qlora.yaml
+++ b/examples/falcon-h1/falcon-h1-3b-qlora.yaml
@@ -69,5 +69,3 @@ evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/falcon-h1/falcon-h1-500m-qlora.yaml
+++ b/examples/falcon-h1/falcon-h1-500m-qlora.yaml
@@ -69,5 +69,3 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/falcon-h1/falcon-h1-7b-qlora.yaml
+++ b/examples/falcon-h1/falcon-h1-7b-qlora.yaml
@@ -69,5 +69,3 @@ evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/archived/falcon/config-7b-lora.yml
+++ b/examples/archived/falcon/config-7b-lora.yml
--- a/examples/archived/falcon/config-7b-qlora.yml
+++ b/examples/archived/falcon/config-7b-qlora.yml
--- a/examples/archived/falcon/config-7b.yml
+++ b/examples/archived/falcon/config-7b.yml
--- a/examples/archived/gemma/qlora.yml
+++ b/examples/archived/gemma/qlora.yml
--- a/examples/gemma2/qlora.yml
+++ b/examples/gemma2/qlora.yml
@@ -60,5 +60,3 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/gemma2/reward-model.yaml
+++ b/examples/gemma2/reward-model.yaml
@@ -50,5 +50,3 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/gemma3/gemma-3-1b-qlora.yml
+++ b/examples/gemma3/gemma-3-1b-qlora.yml
@@ -66,5 +66,3 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/gemma3/gemma-3-4b-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-qlora.yml
@@ -60,5 +60,3 @@ warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/gemma3/gemma-3-4b-vision-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-vision-qlora.yml
@@ -62,5 +62,3 @@ warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/glm4/qlora-32b.yaml
+++ b/examples/glm4/qlora-32b.yaml
@@ -60,5 +60,3 @@ evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/archived/gptj/qlora.yml
+++ b/examples/archived/gptj/qlora.yml
--- a/examples/jamba/qlora.yaml
+++ b/examples/jamba/qlora.yaml
@@ -54,5 +54,3 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/jamba/qlora_deepspeed.yaml
+++ b/examples/jamba/qlora_deepspeed.yaml
@@ -55,5 +55,3 @@ saves_per_epoch: 1
 deepspeed: deepspeed_configs/zero2.json
 weight_decay: 0.0
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/jamba/qlora_fsdp_large.yaml
+++ b/examples/jamba/qlora_fsdp_large.yaml
@@ -64,5 +64,3 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: JambaAttentionDecoderLayer,JambaMambaDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/archived/jeopardy-bot/config.yml
+++ b/examples/archived/jeopardy-bot/config.yml
--- a/examples/lfm2/README.md
+++ b/examples/lfm2/README.md
@@ -1,7 +0,0 @@
-# Liquid Foundation Models 2
-
-LFM2 support in transformers exists in the main branch, but is not yet included in the transformers release.
-
-```bash
-pip install --upgrade --no-deps --force-reinstall git+https://github.com/huggingface/transformers.git
-```
--- a/examples/lfm2/lfm2-350m-fft.yaml
+++ b/examples/lfm2/lfm2-350m-fft.yaml
@@ -1,50 +0,0 @@
-base_model: LiquidAI/LFM2-350M
-
-chunked_cross_entropy: true
-
-chat_template: tokenizer_default
-eot_tokens:
-  - "<|im_end|>"
-datasets:
-  - path: mlabonne/FineTome-100k
-    type: chat_template
-    split: train[:20%]
-    field_messages: conversations
-    message_field_role: from
-    message_field_content: value
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.05
-output_dir: ./outputs/out
-
-sequence_len: 4096
-sample_packing: true
-pad_to_sequence_len: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 2
-micro_batch_size: 4
-num_epochs: 1
-optimizer: adamw_torch_fused
-lr_scheduler: cosine
-learning_rate: 5e-5
-
-bf16: true
-tf32: true
-
-gradient_checkpointing: false
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 2
-saves_per_epoch: 1
-
-weight_decay: 0.0
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -55,5 +55,3 @@ saves_per_epoch: 1
 deepspeed: #deepspeed_configs/zero2.json # multi-gpu only
 weight_decay: 0.1
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-2/gptq-lora.yml
+++ b/examples/llama-2/gptq-lora.yml
@@ -64,5 +64,3 @@ special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-2/lisa.yml
+++ b/examples/llama-2/lisa.yml
@@ -60,5 +60,3 @@ special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-2/loftq.yml
+++ b/examples/llama-2/loftq.yml
@@ -52,5 +52,3 @@ evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -52,5 +52,3 @@ evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-2/qlora-fsdp.yml
+++ b/examples/llama-2/qlora-fsdp.yml
@@ -67,5 +67,3 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -53,5 +53,3 @@ evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-2/relora.yml
+++ b/examples/llama-2/relora.yml
@@ -58,5 +58,3 @@ special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3-vision/lora-11b.yaml
+++ b/examples/llama-3-vision/lora-11b.yaml
@@ -57,5 +57,3 @@ warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/3b-qat-fsdp2.yaml
+++ b/examples/llama-3/3b-qat-fsdp2.yaml
@@ -77,5 +77,3 @@ fsdp_config:

 special_tokens:
  pad_token: <|end_of_text|>
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/fft-8b-liger-fsdp.yaml
+++ b/examples/llama-3/fft-8b-liger-fsdp.yaml
@@ -72,5 +72,3 @@ fsdp_config:
 special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot_id|>
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/fft-8b.yaml
+++ b/examples/llama-3/fft-8b.yaml
@@ -42,5 +42,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
  pad_token: <|end_of_text|>
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/instruct-dpo-lora-8b.yml
+++ b/examples/llama-3/instruct-dpo-lora-8b.yml
@@ -71,5 +71,3 @@ warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/instruct-lora-8b.yml
+++ b/examples/llama-3/instruct-lora-8b.yml
@@ -64,5 +64,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
   pad_token: <|end_of_text|>
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/lora-1b-deduplicate-dpo.yml
+++ b/examples/llama-3/lora-1b-deduplicate-dpo.yml
@@ -83,5 +83,3 @@ warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/lora-1b-deduplicate-sft.yml
+++ b/examples/llama-3/lora-1b-deduplicate-sft.yml
@@ -61,5 +61,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
   pad_token: <|end_of_text|>
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/lora-1b-kernels.yml
+++ b/examples/llama-3/lora-1b-kernels.yml
@@ -65,5 +65,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
  pad_token: "<|end_of_text|>"
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/lora-1b-ray.yml
+++ b/examples/llama-3/lora-1b-ray.yml
@@ -64,5 +64,3 @@ special_tokens:

 use_ray: true
 ray_num_workers: 4
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/lora-1b-sample-packing-sequentially.yml
+++ b/examples/llama-3/lora-1b-sample-packing-sequentially.yml
@@ -63,5 +63,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
  pad_token: <|end_of_text|>
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/lora-1b.yml
+++ b/examples/llama-3/lora-1b.yml
@@ -60,5 +60,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
  pad_token: "<|end_of_text|>"
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/lora-8b.yml
+++ b/examples/llama-3/lora-8b.yml
@@ -57,5 +57,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
   pad_token: <|end_of_text|>
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/qlora-1b-kto.yaml
+++ b/examples/llama-3/qlora-1b-kto.yaml
@@ -61,5 +61,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
  pad_token: "<|end_of_text|>"
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/qlora-1b.yml
+++ b/examples/llama-3/qlora-1b.yml
@@ -62,5 +62,3 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
  pad_token: "<|end_of_text|>"
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/qlora-fsdp-405b.yaml
+++ b/examples/llama-3/qlora-fsdp-405b.yaml
@@ -60,5 +60,3 @@ fsdp_config:
  fsdp_sharding_strategy: FULL_SHARD
 special_tokens:
  pad_token: <|finetune_right_pad_id|>
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/qlora-fsdp-70b.yaml
+++ b/examples/llama-3/qlora-fsdp-70b.yaml
@@ -69,5 +69,3 @@ fsdp_config:
  fsdp_sharding_strategy: FULL_SHARD
 special_tokens:
  pad_token: <|end_of_text|>
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Dan Saunders	c4f4f81bed	Merge branch 'main' into map-dataset-fetcher-fix	2025-06-26 11:20:05 -04:00
Dan Saunders	4ebd4aae3d	handle possibly empty batch	2025-06-26 10:59:27 -04:00