fix llama modeling

fix residuals and add llama support
add custom modeling for gemma3 using liger fused add rms
2025-07-30 11:37:58 -04:00 · 2025-07-30 10:22:38 -04:00 · 2025-07-30 08:21:03 -04:00 · 2025-07-30 08:05:25 -04:00 · 2025-07-30 06:44:06 -04:00 · 2025-07-30 06:38:13 -04:00
358 changed files with 6681 additions and 1760 deletions
--- a/.axolotl-complete.bash
+++ b/.axolotl-complete.bash
@@ -0,0 +1,52 @@
+#!/bin/bash
+#
+# Bash completion for the `axolotl` CLI.
+# Source this file from an interactive bash session (e.g. from ~/.bashrc).
+
+#######################################
+# Completion handler registered for `axolotl`.
+# Completes the subcommand name in the first position; directly after a
+# config-taking subcommand it offers directories and *.yaml / *.yml files.
+# Globals:   COMP_WORDS, COMP_CWORD (read); COMPREPLY (written)
+# Returns:   0 always
+#######################################
+_axolotl_completions() {
+    local cur prev item
+    COMPREPLY=()
+    cur="${COMP_WORDS[COMP_CWORD]}"
+    prev="${COMP_WORDS[COMP_CWORD-1]}"
+
+    # If we're completing the first argument (the command)
+    if [[ $COMP_CWORD -eq 1 ]]; then
+        mapfile -t COMPREPLY < <(compgen -W "delinearize-llama4 fetch lm-eval merge-sharded-fsdp-weights quantize vllm-serve evaluate inference merge-lora preprocess train" -- "$cur")
+        return 0
+    fi
+
+    # Commands that should complete with directories and YAML files
+    local -a yaml_commands=("merge-sharded-fsdp-weights" "quantize" "vllm-serve" "evaluate" "inference" "merge-lora" "preprocess" "train")
+
+    # Check if previous word is in our list. $prev is quoted inside the regex so
+    # it is matched literally instead of being interpreted as a pattern.
+    if [[ " ${yaml_commands[*]} " =~ (^|[[:space:]])"$prev"($|[[:space:]]) ]]; then
+        # Use filename completion which handles directories properly
+        compopt -o filenames
+        mapfile -t COMPREPLY < <(compgen -f -- "$cur")
+
+        # Filter to only include directories and YAML files
+        local -a filtered=()
+        for item in "${COMPREPLY[@]}"; do
+            if [[ -d "$item" ]] || [[ "$item" == *.yaml ]] || [[ "$item" == *.yml ]]; then
+                filtered+=("$item")
+            fi
+        done
+        COMPREPLY=("${filtered[@]}")
+
+        return 0
+    fi
+
+    # Default: no completion
+    return 0
+}
+
+# Register the handler; no `-o nospace` here, `compopt -o filenames` above handles it
+complete -F _axolotl_completions axolotl
--- a/.coderabbit.yaml
+++ b/.coderabbit.yaml
@@ -0,0 +1,16 @@
+# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
+language: "en-US"
+early_access: false
+reviews:
+  profile: "chill"
+  request_changes_workflow: false
+  high_level_summary: true
+  review_status: true
+  collapse_walkthrough: true
+  poem: false
+  sequence_diagrams: false
+  auto_review:
+    enabled: true
+    drafts: false
+chat:
+  auto_reply: true
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -17,7 +17,7 @@ on:

 jobs:
  build-base:
-    if: github.repository_owner == 'axolotl-ai-cloud'
+    if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
    timeout-minutes: 480
    # this job needs to be run on self-hosted GPU runners...
    runs-on: ubuntu-latest-m
@@ -108,7 +108,7 @@ jobs:
            PYTORCH_VERSION=${{ matrix.pytorch }}
            TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
  build-base-uv:
-    if: github.repository_owner == 'axolotl-ai-cloud'
+    if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
    timeout-minutes: 480
    runs-on: ubuntu-latest-m
    strategy:
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -3,6 +3,7 @@ on:
  # check on PRs, and manual triggers
  merge_group:
  pull_request:
+      types: [opened, synchronize, reopened, ready_for_review]
      paths:
       - '**.py'
       - 'requirements.txt'
@@ -16,6 +17,7 @@ jobs:
  pre-commit:
    name: pre-commit
    runs-on: ubuntu-latest
+    if: ${{ !github.event.pull_request.draft }}
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -87,7 +87,6 @@ jobs:
            python_version: "3.11"
            pytorch: 2.6.0
            axolotl_extras:
-            is_latest: true
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
@@ -98,6 +97,7 @@ jobs:
            python_version: "3.11"
            pytorch: 2.7.1
            axolotl_extras:
+            is_latest: true
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -21,7 +21,7 @@ concurrency:

 jobs:
  test-axolotl-multigpu:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
    strategy:
      fail-fast: false
      matrix:
@@ -36,10 +36,17 @@ jobs:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.7.0
            axolotl_extras:
            num_gpus: 2
            nightly_build: "true"
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.1
+            axolotl_extras: vllm
+            num_gpus: 2
+            nightly_build: "true"
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
    steps:
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -12,11 +12,16 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.6.0
            axolotl_extras:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.1
+            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
@@ -60,15 +65,15 @@ jobs:
    strategy:
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.6.0
            axolotl_extras:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.6.0
+            pytorch: 2.7.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
--- a/.github/workflows/preview-docs.yml
+++ b/.github/workflows/preview-docs.yml
@@ -2,7 +2,7 @@ name: Preview
 on:
  workflow_dispatch:
  pull_request:
-    types: [opened, synchronize, reopened]
+    types: [opened, synchronize, reopened, ready_for_review]

    # Run the workflow only when one of these files changes
    paths:
@@ -25,9 +25,12 @@ permissions:
 jobs:
  preview:
    runs-on: ubuntu-latest
+    if: ${{ !github.event.pull_request.draft }}
    steps:
      - name: Check out repository
        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}

      - name: Set up Quarto
        uses: quarto-dev/quarto-actions/setup@v2
@@ -50,10 +53,12 @@ jobs:

      - name: Netlify Publish
        uses: nwtgck/actions-netlify@v3.0
+        if: ${{ secrets.NETLIFY_AUTH_TOKEN != '' }}
+        id: netlify
        with:
          publish-dir: './_site'
-          enable-pull-request-comment: true
-          enable-github-deployment: true
+          enable-pull-request-comment: false
+          enable-github-deployment: false
          github-token: ${{ secrets.GITHUB_TOKEN }}
          deploy-message: "Deployed On Netlify"
          github-deployment-environment: 'preview'
@@ -61,3 +66,13 @@ jobs:
        env:
          NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
          NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}
+
+      - name: Update PR with preview link
+        if: ${{ steps.netlify.outcome == 'success' && secrets.NETLIFY_AUTH_TOKEN != '' }}
+        uses: marocchino/sticky-pull-request-comment@v2
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          message: |
+            📖 **Documentation Preview**: ${{ steps.netlify.outputs.deploy-url }}
+
+            Deployed on Netlify from commit ${{ github.event.pull_request.head.sha }}
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -52,7 +52,7 @@ jobs:

      - name: Install PyTorch
        run: |
-          pip3 install torch==${{ matrix.pytorch_version }}
+          pip3 install torch==${{ matrix.pytorch_version }} torchvision

      - name: Update requirements.txt
        run: |
@@ -92,7 +92,7 @@ jobs:
    if: github.repository_owner == 'axolotl-ai-cloud'
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
-    timeout-minutes: 60
+    timeout-minutes: 120
    needs: [pre-commit, pytest]

    strategy:
@@ -106,6 +106,13 @@ jobs:
            num_gpus: 1
            axolotl_extras:
            nightly_build: "true"
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.1
+            num_gpus: 1
+            axolotl_extras:
+            nightly_build: "true"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -116,7 +123,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==0.71.8 jinja2
+          pip install modal==1.0.2 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
@@ -130,3 +137,45 @@ jobs:
      - name: Run tests job on Modal
        run: |
          modal run cicd.e2e_tests
+  docker-e2e-multigpu-tests:
+    if: github.repository_owner == 'axolotl-ai-cloud'
+    # this job needs to be run on self-hosted GPU runners...
+    runs-on: [self-hosted, modal]
+    timeout-minutes: 120
+    needs: [pre-commit, pytest, docker-e2e-tests]
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.1
+            num_gpus: 2
+            axolotl_extras:
+            nightly_build: "true"
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install Modal
+        run: |
+          python -m pip install --upgrade pip
+          pip install modal==1.0.2 jinja2
+      - name: Update env vars
+        run: |
+          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
+          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
+          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
+          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
+          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
+          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
+          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
+      - name: Run tests job on Modal
+        run: |
+          modal run cicd.multigpu
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -13,6 +13,7 @@ on:
      - 'cicd/cicd.sh'
      - 'cicd/Dockerfile.jinja'
  pull_request:
+      types: [opened, synchronize, reopened, ready_for_review]
      paths:
       - '**.py'
       - 'requirements.txt'
@@ -34,6 +35,7 @@ jobs:
  pre-commit:
    name: pre-commit
    runs-on: ubuntu-latest
+    if: ${{ !github.event.pull_request.draft }}
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
@@ -47,6 +49,7 @@ jobs:
  pytest:
    name: PyTest
    runs-on: ubuntu-latest
+    if: ${{ !github.event.pull_request.draft }}
 #    needs: [preload-cache]
    strategy:
      fail-fast: false
@@ -78,7 +81,7 @@ jobs:

      - name: Install PyTorch
        run: |
-          pip3 install torch==${{ matrix.pytorch_version }}
+          pip3 install torch==${{ matrix.pytorch_version }} torchvision

      - name: Install dependencies
        run: |
@@ -121,6 +124,7 @@ jobs:
  pytest-sdist:
    name: PyTest from Source Dist
    runs-on: ubuntu-latest
+    if: ${{ !github.event.pull_request.draft }}
    strategy:
      fail-fast: false
      matrix:
@@ -151,7 +155,7 @@ jobs:

      - name: Install PyTorch
        run: |
-          pip3 install torch==${{ matrix.pytorch_version }}
+          pip3 install torch==${{ matrix.pytorch_version }} torchvision

      - name: Install dependencies
        run: |
@@ -185,7 +189,7 @@ jobs:

  docker-e2e-tests-1st:
    # Run this job first as a gate for running the remainder of the test matrix
-    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && !github.event.pull_request.draft }}
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
@@ -235,7 +239,7 @@ jobs:
          modal run cicd.e2e_tests

  docker-e2e-tests:
-    if: github.repository_owner == 'axolotl-ai-cloud'
+    if: ${{ github.repository_owner == 'axolotl-ai-cloud' && !github.event.pull_request.draft }}
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
@@ -289,6 +293,7 @@ jobs:
    runs-on: [self-hosted, modal]
    timeout-minutes: 90
    needs: [docker-e2e-tests]
+    if: ${{ !github.event.pull_request.draft }}

    strategy:
      fail-fast: false
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -27,7 +27,7 @@ repos:
    hooks:
    - id: pylint
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.16.1
+    rev: v1.17.0
    hooks:
    - id: mypy
      additional_dependencies:
--- a/.runpod/README.md
+++ b/.runpod/README.md
@@ -119,14 +119,15 @@ datasets:

 ## Dataset Processing

-| Option                        | Default                    | Description                       |
-| ----------------------------- | -------------------------- | --------------------------------- |
-| `dataset_prepared_path`       | `"data/last_run_prepared"` | Path for prepared dataset         |
-| `push_dataset_to_hub`         | `""`                       | Push dataset to HF hub            |
-| `dataset_processes`           | `4`                        | Number of preprocessing processes |
-| `dataset_keep_in_memory`      | `false`                    | Keep dataset in memory            |
-| `shuffle_merged_datasets`     | `true`                     | Shuffle merged datasets           |
-| `dataset_exact_deduplication` | `true`                     | Deduplicate datasets              |
+| Option                            | Default                    | Description                         |
+| --------------------------------- | -------------------------- | ----------------------------------- |
+| `dataset_prepared_path`           | `"data/last_run_prepared"` | Path for prepared dataset           |
+| `push_dataset_to_hub`             | `""`                       | Push dataset to HF hub              |
+| `dataset_processes`               | `4`                        | Number of preprocessing processes   |
+| `dataset_keep_in_memory`          | `false`                    | Keep dataset in memory              |
+| `shuffle_merged_datasets`         | `true`                     | Shuffle merged datasets             |
+| `shuffle_before_merging_datasets` | `false`                    | Shuffle each dataset before merging |
+| `dataset_exact_deduplication`     | `true`                     | Deduplicate datasets                |

 ## LoRA Configuration

--- a/.runpod/src/config/config.yaml
+++ b/.runpod/src/config/config.yaml
@@ -97,7 +97,7 @@
 #       # 'no_input_format' cannot include {input}
 #       no_input_format: "{instruction} "

-#       # For `completion` datsets only, uses the provided field instead of `text` column
+#       # For `completion` datasets only, uses the provided field instead of `text` column
 #       field:

 # # Axolotl attempts to save the dataset as an arrow after packing the data together so
--- a/README.md
+++ b/README.md
@@ -25,6 +25,8 @@

 ## 🎉 Latest Updates

+- 2025/07: Voxtral with mistral-common tokenizer support has been integrated in Axolotl. Read the [docs](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/voxtral)!
+- 2025/07: TiledMLP support for single-GPU to multi-GPU training with DDP, DeepSpeed and FSDP support has been added to support Arctic Long Sequence Training (ALST). See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst) for using ALST with Axolotl!
 - 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral) to start training your own Magistral models with Axolotl!
 - 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
 - 2025/04: Llama 4 support has been added in Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-4) to start training your own Llama 4 models with Axolotl's linearized version!
@@ -79,6 +81,20 @@ docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest

 Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).

+#### Cloud Providers
+
+<details>
+
+- [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
+- [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=github&utm_medium=developer_community&utm_campaign=template_launch_axolotl&utm_content=readme)
+- [PRIME Intellect](https://app.primeintellect.ai/dashboard/create-cluster?image=axolotl&location=Cheapest&security=Cheapest&show_spot=true)
+- [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl)
+- [Novita](https://novita.ai/gpus-console?templateId=311)
+- [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl)
+- [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
+
+</details>
+
 ### Your First Fine-tune

 ```bash
@@ -120,12 +136,6 @@ Contributions are welcome! Please see our [Contributing Guide](https://github.co

 ## ❤️ Sponsors

-Thank you to our sponsors who help make Axolotl possible:
-
- [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl) - Modal lets you run
-jobs in the cloud, by just writing a few lines of Python. Customers use Modal to deploy Gen AI models at large scale,
-fine-tune large language models, run protein folding simulations, and much more.
-
 Interested in sponsoring? Contact us at [wing@axolotl.ai](mailto:wing@axolotl.ai)

 ## 📜 License
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -268,6 +268,8 @@ website:
            - docs/batch_vs_grad.qmd
            - docs/dataset_preprocessing.qmd
            - docs/multipack.qmd
+            - docs/mixed_precision.qmd
+            - docs/gradient_accumulation.qmd

        - section: "Advanced Features"
          contents:
@@ -276,6 +278,7 @@ website:
            - docs/torchao.qmd
            - docs/custom_integrations.qmd
            - docs/sequence_parallelism.qmd
+            - docs/gradient_checkpointing.qmd

        - section: "Troubleshooting"
          contents:
--- a/cicd/Dockerfile-uv.jinja
+++ b/cicd/Dockerfile-uv.jinja
@@ -11,7 +11,7 @@ ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
 ENV HF_HOME="{{ HF_HOME }}"

 RUN apt-get update && \
-    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
+    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm

 WORKDIR /workspace

--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -12,7 +12,7 @@ ENV HF_HOME="{{ HF_HOME }}"
 ENV AXOLOTL_DATASET_PROCESSES="8"

 RUN apt-get update && \
-    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
+    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm

 WORKDIR /workspace

--- a/cicd/multigpu.sh
+++ b/cicd/multigpu.sh
@@ -19,5 +19,7 @@ pytest -v  --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \
  --cov-append \
  --cov-report=xml:multigpu-coverage.xml

-# Upload coverage to Codecov
-codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true
+# Upload coverage to Codecov if CODECOV_TOKEN is available
+if [ -n "$CODECOV_TOKEN" ]; then
+  codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true
+fi
--- a/codecov.yml
+++ b/codecov.yml
@@ -22,6 +22,7 @@ coverage:
        only_pulls: true
        flags: null
        paths: null
+        informational: true
    patch:
      default:
        # basic
--- a/deepspeed_configs/zero3.json
+++ b/deepspeed_configs/zero3.json
@@ -7,9 +7,9 @@
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
-    "stage3_max_live_parameters": 0,
-    "stage3_max_reuse_distance": 0,
-    "stage3_gather_16bit_weights_on_model_save": true
+    "max_live_parameters": 0,
+    "max_reuse_distance": 0,
+    "gather_16bit_weights_on_model_save": true
  },
  "bf16": {
    "enabled": "auto"
--- a/deepspeed_configs/zero3_bf16.json
+++ b/deepspeed_configs/zero3_bf16.json
@@ -7,9 +7,9 @@
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
-    "stage3_max_live_parameters": 0,
-    "stage3_max_reuse_distance": 0,
-    "stage3_gather_16bit_weights_on_model_save": true
+    "max_live_parameters": 0,
+    "max_reuse_distance": 0,
+    "gather_16bit_weights_on_model_save": true
  },
  "bf16": {
    "enabled": true
--- a/deepspeed_configs/zero3_bf16_cpuoffload_all.json
+++ b/deepspeed_configs/zero3_bf16_cpuoffload_all.json
@@ -17,9 +17,9 @@
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
-    "stage3_max_live_parameters": 0,
-    "stage3_max_reuse_distance": 0,
-    "stage3_gather_16bit_weights_on_model_save": true
+    "max_live_parameters": 0,
+    "max_reuse_distance": 0,
+    "gather_16bit_weights_on_model_save": true
  },
  "bf16": {
    "enabled": true
--- a/deepspeed_configs/zero3_bf16_cpuoffload_params.json
+++ b/deepspeed_configs/zero3_bf16_cpuoffload_params.json
@@ -13,9 +13,9 @@
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
-    "stage3_max_live_parameters": 0,
-    "stage3_max_reuse_distance": 0,
-    "stage3_gather_16bit_weights_on_model_save": true
+    "max_live_parameters": 0,
+    "max_reuse_distance": 0,
+    "gather_16bit_weights_on_model_save": true
  },
  "bf16": {
    "enabled": true
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -10,7 +10,9 @@ ARG PYTORCH_VERSION="2.1.2"
 ENV PYTORCH_VERSION=$PYTORCH_VERSION

 RUN apt-get update && \
-    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs
+    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs && \
+    rm -rf /var/cache/apt/archives && \
+    rm -rf /var/lib/apt/lists/*

 WORKDIR /workspace

@@ -23,17 +25,17 @@ RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
-    fi
+    fi && \
+    python scripts/unsloth_install.py | sh && \
+    python scripts/cutcrossentropy_install.py | sh && \
+    pip install pytest && \
+    pip cache purge

-RUN python scripts/unsloth_install.py | sh
-RUN python scripts/cutcrossentropy_install.py | sh
-
-# So we can test the Docker image
-RUN pip install pytest
-
-# fix so that git fetch/pull from remote works
+# fix so that git fetch/pull from remote works with shallow clone
 RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
-    git config --get remote.origin.fetch
+    git config --get remote.origin.fetch && \
+    git config --global credential.helper store

-# helper for huggingface-login cli
-RUN git config --global credential.helper store
+COPY .axolotl-complete.bash /root/.axolotl-complete.bash
+RUN chmod +x /root/.axolotl-complete.bash && \
+    echo 'source /root/.axolotl-complete.bash' >> ~/.bashrc
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -16,12 +16,16 @@ ENV PYTHON_VERSION=$PYTHON_VERSION
 ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST

 RUN apt-get update \
-    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
+    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config \
+    && rm -rf /var/cache/apt/archives \
+    && rm -rf /var/lib/apt/lists/* \
    && wget \
    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
    && mkdir /root/.conda \
    && bash Miniconda3-latest-Linux-x86_64.sh -b \
    && rm -f Miniconda3-latest-Linux-x86_64.sh \
+    && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \
+    && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \
    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"

 ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
@@ -31,12 +35,14 @@ WORKDIR /workspace
 RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
-    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
+    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \
+    python3 -m pip cache purge

 RUN git lfs install --skip-repo && \
    pip3 install awscli && \
    # The base image ships with `pydantic==1.8.2` which is not working
-    pip3 install -U --no-cache-dir pydantic==1.10.10
+    pip3 install -U --no-cache-dir pydantic==1.10.10 && \
+    pip3 cache purge

 RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \
        FLASH_ATTENTION_FORCE_BUILD="TRUE" pip3 install --no-build-isolation flash-attn==2.8.0.post2; \
--- a/docker/Dockerfile-base-nightly
+++ b/docker/Dockerfile-base-nightly
@@ -22,18 +22,22 @@ RUN apt-get update \
    && mkdir /root/.conda \
    && bash Miniconda3-latest-Linux-x86_64.sh -b \
    && rm -f Miniconda3-latest-Linux-x86_64.sh \
+    && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \
+    && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \
    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"

 ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

 WORKDIR /workspace

-RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
+RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
    python3 -m pip install --no-cache-dir -U torch --extra-index-url https://download.pytorch.org/whl/nightly/cu$CUDA && \
    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
-    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
+    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \
+    python3 -m pip cache purge

 RUN git lfs install --skip-repo && \
    pip3 install awscli && \
    # The base image ships with `pydantic==1.8.2` which is not working
-    pip3 install -U --no-cache-dir pydantic==1.10.10
+    pip3 install -U --no-cache-dir pydantic==1.10.10 && \
+    pip3 cache purge
--- a/docker/Dockerfile-cloud
+++ b/docker/Dockerfile-cloud
@@ -14,7 +14,10 @@ COPY scripts/motd /etc/motd

 RUN pip install jupyterlab notebook ipywidgets && \
    jupyter lab clean
-RUN apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \
+RUN apt update && \
+    apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm && \
+    rm -rf /var/cache/apt/archives && \
+    rm -rf /var/lib/apt/lists/* && \
    mkdir -p ~/.ssh && \
    chmod 700 ~/.ssh && \
    printf "\n[[ -z \"\$TMUX\"  ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
--- a/docker/Dockerfile-cloud-no-tmux
+++ b/docker/Dockerfile-cloud-no-tmux
@@ -9,13 +9,15 @@ ENV HF_HUB_ENABLE_HF_TRANSFER="1"
 EXPOSE 8888
 EXPOSE 22

-COPY scripts/cloud-entrypoint-term.sh /root/cloud-entrypoint.sh
+COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
 COPY scripts/motd /etc/motd

 RUN pip install jupyterlab notebook ipywidgets && \
    jupyter lab clean
-RUN apt install --yes --no-install-recommends openssh-server tmux sudo && \
-    pip3 install -U --no-cache-dir grpcio ray[default]==2.9.3 && \
+RUN apt update && \
+    apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm && \
+    rm -rf /var/cache/apt/archives && \
+    rm -rf /var/lib/apt/lists/* && \
    mkdir -p ~/.ssh && \
    chmod 700 ~/.ssh && \
    printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -187,6 +187,7 @@ Instead of passing `tools` via the system prompt, an alternative method would be
            "role": "assistant", // call the function via assistant
            "tool_calls": [
                {
+                    "id": "...",  // required only for mistral
                    "type": "function",
                    "function": {
                        "name": "...",
@@ -199,6 +200,7 @@ Instead of passing `tools` via the system prompt, an alternative method would be
        },
        {
            "role": "tool",
+            "tool_call_id": "...",  // required only for mistral
            "name": "...",
            "content": "..."
        },
--- a/docs/docker.qmd
+++ b/docs/docker.qmd
@@ -34,6 +34,7 @@ Tags examples:

 - `main-base-py3.11-cu128-2.7.1`
 - `main-base-py3.11-cu126-2.7.1`
+- `main-base-py3.11-cu126-2.7.0`
 - `main-base-py3.11-cu126-2.6.0`
 - `main-base-py3.11-cu124-2.6.0`

@@ -75,6 +76,7 @@ Tags examples:

 - `main-py3.11-cu128-2.7.1`
 - `main-py3.11-cu126-2.7.1`
+- `main-py3.11-cu126-2.7.0`
 - `main-py3.11-cu126-2.6.0`
 - `main-py3.11-cu124-2.6.0`
 - `main-latest`
--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -136,3 +136,7 @@ description: Frequently asked questions
 >   dynamic: false
 >   mode: max-autotune-no-cudagraphs
 > ```
+
+**Q: `ValueError("Backward pass should have cleared tracker of all tensors")`**
+
+> A: This may happen due to edge cases in using the modern OffloadActivations context manager for CUDA streams. If you encounter this error, you may have success using the naive implementation with `activation_offloading: legacy` in your YAML.
--- a/docs/gradient_checkpointing.qmd
+++ b/docs/gradient_checkpointing.qmd
@@ -0,0 +1,29 @@
+---
+title: Gradient Checkpointing and Activation Offloading
+---
+
+Gradient checkpointing and activation offloading are techniques that reduce the memory footprint of training deep
+learning models, at the cost of some additional computation or data transfer.
+
+### Enabling Gradient Checkpointing
+
+```yaml
+gradient_checkpointing: true
+```
+
+### Enabling Activation Offloading
+
+```yaml
+gradient_checkpointing: true  # required for activation offloading
+activation_offloading: true
+```
+
+Activation offloading variants:
+
+The default `activation_offloading: true` offloads activations to CPU and uses CUDA streams
+to overlap the communications and computations when offloading.
+
+Setting `activation_offloading: legacy` naively offloads activations to CPU without additional optimizations.
+
+For resource constrained environments with limited CPU memory, `activation_offloading: disk` offloads
+activations to disk instead of CPU RAM so that much larger context lengths can be trained with minimal memory.
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -124,10 +124,13 @@ For providers supporting Docker:

 - Use `axolotlai/axolotl-cloud:main-latest`
 - Available on:
-  - [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
-  - [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl)
-  - [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
-  - [Novita](https://novita.ai/gpus-console?templateId=311)
+    - [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
+    - [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=axolotl&utm_medium=partner&utm_campaign=template_launch_july2025&utm_content=docs_link)
+    - [PRIME Intellect](https://app.primeintellect.ai/dashboard/create-cluster?image=axolotl&location=Cheapest&security=Cheapest&show_spot=true)
+    - [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl)
+    - [Novita](https://novita.ai/gpus-console?templateId=311)
+    - [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl)
+    - [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)

 ### Google Colab {#sec-colab}

--- a/docs/mixed_precision.qmd
+++ b/docs/mixed_precision.qmd
@@ -0,0 +1,149 @@
+---
+title: "Mixed Precision Training"
+format:
+  html:
+    toc: true
+    toc-depth: 3
+    number-sections: true
+    code-tools: true
+execute:
+  enabled: false
+---
+
+Mixed precision training uses lower precision data types to reduce memory usage and increase training speed while maintaining model quality. Axolotl supports several mixed precision formats:
+
+- **FP16** - Half precision 16-bit (Pascal generation+)
+- **BF16** - Brain Float 16-bit (Ampere generation+)
+- **FP8** - 8-bit floating point (Hopper generation+)
+
+## FP16 Mixed Precision {#sec-fp16}
+
+### Overview {#sec-fp16-overview}
+
+FP16 is the traditional half-precision format. It is supported on older GPUs, but it can be less numerically stable than BF16.
+
+### Configuration {#sec-fp16-config}
+
+```{.yaml}
+fp16: true
+```
+
+### FP16 Considerations {#sec-fp16-considerations}
+
+- May require gradient scaling to prevent underflow
+- Less numerically stable than BF16
+- Can cause training instability with some model architectures
+- Consider using BF16 if your hardware supports it
+
+## BF16 Mixed Precision {#sec-bf16}
+
+### Overview {#sec-bf16-overview}
+
+BF16 (Brain Float 16) offers better numerical stability than FP16 and is the recommended mixed precision format for modern GPUs. It provides the same dynamic range as FP32 while using half the memory.
+
+### Configuration {#sec-bf16-config}
+
+```{.yaml}
+# Automatic BF16 detection (recommended)
+bf16: auto
+
+# Or explicitly enable
+bf16: true
+
+# For evaluation with BF16
+bf16: full  # Equivalent to bf16_full_eval in the HF trainer
+```
+
+## FP8 Mixed Precision {#sec-fp8}
+
+::: {.callout-note}
+FP8 support is experimental and requires compatible hardware (H100, H200) and recent PyTorch versions with TorchAO.
+:::
+
+### What is FP8? {#sec-fp8-overview}
+
+FP8 (8-bit floating point) can provide significant time savings compared to FP16/BF16 while maintaining training stability. Axolotl's implementation uses PyTorch's TorchAO library with "tensorwise" scaling strategy.
+
+### Requirements {#sec-fp8-software}
+
+- Hopper+ GPUs (H100/H200)
+- PyTorch 2.7+ (+ compatible TorchAO version)
+- CUDA 12.4+
+
+### Configuration {#sec-fp8-config}
+
+Add to your YAML config:
+
+```{.yaml}
+# Enable FP8 mixed precision
+fp8: true
+
+# Optional: Enable FP8 for FSDP all-gather operations
+fp8_enable_fsdp_float8_all_gather: true
+
+# Enable torch.compile (almost always necessary for FP8 speedups)
+torch_compile: true
+```
+
+::: {.callout-important}
+**torch.compile is critical for FP8 performance**
+
+FP8 training requires `torch_compile: true` to see meaningful speedups. Without compilation, FP8 may actually be slower and use more memory than FP16/BF16.
+:::
+
+### Advanced FP8 Configs {#sec-fp8-advanced}
+
+For [FSDP](multi-gpu.qmd#sec-fsdp) (Fully Sharded Data Parallel) training:
+
+```{.yaml}
+fp8: true
+fp8_enable_fsdp_float8_all_gather: true
+
+torch_compile: true
+
+# FSDP configuration
+fsdp_version: 2
+fsdp_config:
+  offload_params: false
+  cpu_ram_efficient_loading: true
+  auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  transformer_layer_cls_to_wrap: LlamaDecoderLayer
+  state_dict_type: FULL_STATE_DICT
+  reshard_after_forward: true
+```
+
+## Best Practices {#sec-best-practices}
+
+### Choosing Precision Format {#sec-choosing-format}
+
+- **Start with automatic detection**: `bf16: auto`
+- **For Hopper+ (H100/H200)**: Try FP8 + torch.compile for maximum speed
+- **For Ampere/Ada (A100/RTX 30/40)**: Use BF16
+- **For older Pascal/Turing GPUs**: Use FP16 with caution
+- **For very old or unsupported GPUs**: Use FP32
+
+### Validation and Testing {#sec-validation}
+
+Always validate your mixed precision setup:
+
+- **Start with a small dataset** to verify stability
+- **Monitor loss curves** for irregularities
+- **Compare with FP32 baseline** when possible
+- **Test evaluation metrics** match expectations
+
+### FP8 Particulars {#sec-fp8-details}
+
+- Use cases
+  - Single GPU training
+  - Multi GPU training with FSDP2 or DeepSpeed
+- Speedups
+  - Please refer to the [TorchAO FP8 training benchmarks](https://github.com/pytorch/ao/tree/main/torchao/float8#rowwise-scaling) for expected matmul speedups for different (M, K, N) settings
+  - Concrete numbers for LLaMA 3 8B training can be found [here](https://github.com/pytorch/ao/tree/main/torchao/float8#training-benchmarks)
+- Known issues:
+  - FP8 + DDP + `torch.compile` (causes [error](https://gist.github.com/djsaunde/0c1664c32e44a64d31b5e01b4aafe5c4))
+  - FP8 + FSDP2 + `torch.compile` + FSDP2 activation checkpointing tends to be _slower_ than the BF16 equivalent training
+  - Flash Attention 2 does not play nicely with `torch.compile`
+
+See `examples/llama-3/3b-fp8-fsdp2.yaml` for an optimized example config. Enabling FP8 mixed precision + FP8 all-gather training results in ~10% faster iterations per second vs. BF16 for a relatively small (3B param) model.
+
+For more information on multi-GPU training, see our [Multi-GPU guide](multi-gpu.qmd).
--- a/docs/multi-gpu.qmd
+++ b/docs/multi-gpu.qmd
@@ -23,8 +23,6 @@ Axolotl supports several methods for multi-GPU training:

 ## DeepSpeed {#sec-deepspeed}

-DeepSpeed is the recommended approach for multi-GPU training due to its stability and performance. It provides various optimization levels through ZeRO stages.
-
 ### Configuration {#sec-deepspeed-config}

 Add to your YAML config:
@@ -32,7 +30,6 @@ Add to your YAML config:
 ```{.yaml}
 deepspeed: deepspeed_configs/zero1.json
 ```
-
 ### Usage {#sec-deepspeed-usage}

 ```{.bash}
@@ -66,9 +63,75 @@ Start from Stage 1 -> Stage 2 -> Stage 3.

 :::

-## FSDP {#sec-fsdp}
+::: {.callout-tip}

-### Basic FSDP Configuration {#sec-fsdp-config}
+Using ZeRO Stage 3 with Single-GPU training
+
+ZeRO Stage 3 can be used for training on a single GPU by manually setting the environment variables:
+`WORLD_SIZE=1 LOCAL_RANK=0 MASTER_ADDR=0.0.0.0 MASTER_PORT=29500`
+
+:::
+
+## Fully Sharded Data Parallel (FSDP) {#sec-fsdp}
+
+::: {.callout-note}
+
+FSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl.
+
+:::
+
+### Migrating from FSDP1 to FSDP2 {#sec-migrate-fsdp1-fsdp2}
+
+To migrate your config from FSDP1 to FSDP2, you must use the `fsdp_version` top-level config field to specify the FSDP version, and
+also follow the config field mapping below to update field names.
+
+#### Config mapping
+
+FSDP1 | FSDP2
+-------- | --------
+fsdp_sharding_strategy | reshard_after_forward
+fsdp_backward_prefetch_policy | **REMOVED**
+fsdp_backward_prefetch | **REMOVED**
+fsdp_forward_prefetch | **REMOVED**
+fsdp_sync_module_states | **REMOVED**
+fsdp_cpu_ram_efficient_loading | cpu_ram_efficient_loading
+fsdp_state_dict_type | state_dict_type
+fsdp_use_orig_params | **REMOVED**
+
+For more details, please see the migration guide in the [torchtitan repo](https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md). In Axolotl,
+if you were using the following FSDP1 config:
+
+```{.yaml}
+fsdp_version: 1
+fsdp_config:
+  fsdp_offload_params: false
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_sharding_strategy: FULL_SHARD
+```
+
+You can migrate to the following FSDP2 config:
+
+```{.yaml}
+fsdp_version: 2
+fsdp_config:
+  offload_params: false
+  cpu_ram_efficient_loading: true
+  auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  transformer_layer_cls_to_wrap: Qwen3DecoderLayer
+  state_dict_type: FULL_STATE_DICT
+  reshard_after_forward: true
+```
+
+### FSDP1 (deprecated) {#sec-fsdp-config}
+
+::: {.callout-note}
+
+Using `fsdp` to configure FSDP is deprecated and will be removed in an upcoming release of Axolotl. Please use `fsdp_config` as above instead.
+
+:::

 ```{.yaml}
 fsdp:
@@ -80,6 +143,7 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
 ```

+
 ## Sequence parallelism {#sec-sequence-parallelism}

 We support sequence parallelism (SP) via the
--- a/docs/multi-node.qmd
+++ b/docs/multi-node.qmd
@@ -40,13 +40,13 @@ use_cpu: false

 Configure your model to use FSDP in the Axolotl yaml. For example:
 ```yaml
-fsdp:
-  - full_shard
-  - auto_wrap
+fsdp_version: 2
 fsdp_config:
-  fsdp_offload_params: true
-  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
+  offload_params: true
+  state_dict_type: FULL_STATE_DICT
+  auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  transformer_layer_cls_to_wrap: LlamaDecoderLayer
+  reshard_after_forward: true
 ```

 All you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine.
--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -14,6 +14,7 @@ format:
 - [Llava-1.5](#sec-llava-15)
 - [Mistral-Small-3.1](#sec-mistral-small-31)
 - [Gemma-3](#sec-gemma-3)
+- [Gemma-3n](#sec-gemma-3n)
 - [Qwen2-VL](#sec-qwen2-vl)
 - [Qwen2.5-VL](#sec-qwen25-vl)

@@ -110,6 +111,22 @@ base_model: google/gemma-3-4b-it
 chat_template: gemma3
 ```

+### Gemma-3n {#sec-gemma-3n}
+
+::: {.callout-warning}
+The model's initial loss and grad norm will be very high. We suspect this to be due to the Conv in the vision layers.
+:::
+
+::: {.callout-tip}
+Please make sure to install `timm` via `pip3 install timm==1.0.17`
+:::
+
+```yaml
+base_model: google/gemma-3n-E2B-it
+
+chat_template: gemma3n
+```
+
 ### Qwen2-VL {#sec-qwen2-vl}

 ```yaml
@@ -132,7 +149,9 @@ For multi-modal datasets, we adopt an extended `chat_template` format similar to

 - A message is a list of `role` and `content`.
 - `role` can be `system`, `user`, `assistant`, etc.
- `content` is a list of `type` and (`text` or `image` or `path` or `url` or `base64`).
+- `content` is a list of `type` and (`text`, `image`, `path`, `url`, `base64`, or `audio`).
+
+### Image

 ::: {.callout-note}
 For backwards compatibility:
@@ -141,15 +160,29 @@ For backwards compatibility:
 - If `content` is a string, it will be converted to a list with `type` as `text`.
 :::

-::: {.callout-tip}
 For image loading, you can use the following keys within `content` alongside `"type": "image"`:

 - `"path": "/path/to/image.jpg"`
 - `"url": "https://example.com/image.jpg"`
 - `"base64": "..."`
 - `"image": PIL.Image`
+
+### Audio
+
+For audio loading, you can use the following keys within `content` alongside `"type": "audio"`:
+
+- `"path": "/path/to/audio.mp3"`
+- `"url": "https://example.com/audio.mp3"`
+- `"audio": np.ndarray`
+
+::: {.callout-tip}
+
+You may need to install `librosa` via `pip3 install librosa==0.11.0`.
+
 :::

+### Example
+
 Here is an example of a multi-modal dataset:
 ```json
 [
@@ -178,3 +211,9 @@ Here is an example of a multi-modal dataset:
  }
 ]
 ```
+
+## FAQ
+
+1. `PIL.UnidentifiedImageError: cannot identify image file ...`
+
+`PIL` could not retrieve the file at `url` using `requests`. Please check the URL for typos. Another possible cause is that the request is being blocked by the server.
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -17,7 +17,6 @@ feedback. Various methods include, but not limited to:
 - [Kahneman-Tversky Optimization (KTO)](#kto)
 - [Odds Ratio Preference Optimization (ORPO)](#orpo)
 - [Group Relative Policy Optimization (GRPO)](#grpo)
- Proximal Policy Optimization (PPO) (not yet supported in axolotl, if you're interested in contributing, please reach out!)


 ## RLHF using Axolotl
@@ -275,15 +274,14 @@ rl: dpo
 datasets:
  - path: ...
    split: train
-    type: user_defined.default
-
-    field_prompt: "prompt"
-    field_system: "system"
-    field_chosen: "chosen"
-    field_rejected: "rejected"
-    prompt_format: "{prompt}"
-    chosen_format: "{chosen}"
-    rejected_format: "{rejected}"
+    type:
+      field_prompt: "prompt"
+      field_system: "system"
+      field_chosen: "chosen"
+      field_rejected: "rejected"
+      prompt_format: "{prompt}"
+      chosen_format: "{chosen}"
+      rejected_format: "{rejected}"
 ```

 The input format is a simple JSON input with customizable fields based on the above config.
@@ -476,14 +474,13 @@ rl: kto
 datasets:
  - path: ...
    split: train
-    type: user_defined.default
-
-    field_prompt: "prompt"
-    field_system: "system"
-    field_completion: "completion"
-    field_label: "label"
-    prompt_format: "{prompt}"
-    completion_format: "{completion}"
+    type:
+      field_prompt: "prompt"
+      field_system: "system"
+      field_completion: "completion"
+      field_label: "label"
+      prompt_format: "{prompt}"
+      completion_format: "{completion}"
 ```

 The input format is a simple JSON input with customizable fields based on the above config.
--- a/examples/alst/README.md
+++ b/examples/alst/README.md
@@ -0,0 +1,9 @@
+# Arctic Long Sequence Training (ALST)
+
+Arctic Long Sequence Training (ALST) is a technique for training long context models using a variety of optimization
+techniques. It is a combination of:
+- TiledMLP: Leverage tiling over the sequence dimension on MLP layers to reduce memory usage
+- Tiled Loss: Using optimized loss functions like Liger-Kernel or Cut Cross Entropy to reduce memory usage
+- Activation Offloading: Offload activations to CPU RAM to reduce memory usage
+
+For more information, you can check out the ALST paper [here](https://www.arxiv.org/abs/2506.13996).
--- a/examples/alst/llama3-8b-deepspeed-alst.yaml
+++ b/examples/alst/llama3-8b-deepspeed-alst.yaml
@@ -0,0 +1,53 @@
+base_model: meta-llama/Llama-3.1-8B
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+datasets:
+  - path: togethercomputer/Long-Data-Collections
+    type: completion
+    field: text
+    data_files:
+      - pretrain/rp_sub.jsonl.zst
+  - path: princeton-nlp/TextbookChapters
+    type: completion
+    field: chapter
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.0
+output_dir: ./outputs/out
+
+sequence_len: 500_000
+min_sample_len: 200_000
+sample_packing: true
+
+tiled_mlp: true
+sequence_parallel_degree: 8
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+
+gradient_accumulation_steps: 1
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_torch_8bit
+lr_scheduler: cosine
+learning_rate: 2e-5
+
+bf16: auto
+tf32: true
+
+gradient_checkpointing: true
+activation_offloading: legacy
+
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_steps: 100
+saves_per_epoch: 1
+evals_per_epoch: 2
+weight_decay: 0.0
+special_tokens:
+  pad_token: <|end_of_text|>
+
+deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_all.json
+
+# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/alst/llama3-8b-fsdp2-alst.yaml
+++ b/examples/alst/llama3-8b-fsdp2-alst.yaml
@@ -0,0 +1,59 @@
+base_model: meta-llama/Llama-3.1-8B
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+datasets:
+  - path: togethercomputer/Long-Data-Collections
+    type: completion
+    field: text
+    data_files:
+      - pretrain/rp_sub.jsonl.zst
+  - path: princeton-nlp/TextbookChapters
+    type: completion
+    field: chapter
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.0
+output_dir: ./outputs/out
+
+sequence_len: 500_000
+min_sample_len: 200_000
+sample_packing: true
+
+tiled_mlp: true
+context_parallel_size: 8
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+
+gradient_accumulation_steps: 1
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_torch_8bit
+lr_scheduler: cosine
+learning_rate: 2e-5
+
+bf16: auto
+tf32: true
+
+gradient_checkpointing: true
+activation_offloading: legacy
+
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_steps: 100
+saves_per_epoch: 1
+evals_per_epoch: 2
+weight_decay: 0.0
+special_tokens:
+  pad_token: <|end_of_text|>
+
+fsdp_version: 2
+fsdp_config:
+  offload_params: false  # offloading is currently not compatible with SP + torchao optimizer
+  state_dict_type: SHARDED_STATE_DICT
+  auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  transformer_layer_cls_to_wrap: LlamaDecoderLayer
+  reshard_after_forward: true
+
+# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/archived/README.md
+++ b/examples/archived/README.md
@@ -0,0 +1,5 @@
+# Archived Examples
+
+This directory contains examples that are no longer maintained and may no longer be functional.
+
+We keep them around for archival purposes in case they are useful to others.
--- a/examples/archived/cerebras/btlm-ft.yml
+++ b/examples/archived/cerebras/btlm-ft.yml
@@ -66,7 +66,7 @@ flash_optimum:
 gptq_groupsize:
 gptq_model_v1:

-warmup_steps: 32
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 save_total_limit:
--- a/examples/archived/cerebras/qlora.yml
+++ b/examples/archived/cerebras/qlora.yml
@@ -43,7 +43,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
--- a/examples/archived/code-llama/13b/lora.yml
+++ b/examples/archived/code-llama/13b/lora.yml
@@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out

 sequence_len: 4096
 sample_packing: true
-pad_to_sequence_len: true
+

 adapter: lora
 lora_model_dir:
@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
--- a/examples/archived/code-llama/13b/qlora.yml
+++ b/examples/archived/code-llama/13b/qlora.yml
@@ -20,7 +20,7 @@ lora_model_dir:

 sequence_len: 4096
 sample_packing: true
-pad_to_sequence_len: true
+

 lora_r: 32
 lora_alpha: 16
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
--- a/examples/archived/code-llama/34b/lora.yml
+++ b/examples/archived/code-llama/34b/lora.yml
@@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out

 sequence_len: 4096
 sample_packing: true
-pad_to_sequence_len: true
+

 adapter: lora
 lora_model_dir:
@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
--- a/examples/archived/code-llama/34b/qlora.yml
+++ b/examples/archived/code-llama/34b/qlora.yml
@@ -20,7 +20,7 @@ lora_model_dir:

 sequence_len: 4096
 sample_packing: true
-pad_to_sequence_len: true
+

 lora_r: 32
 lora_alpha: 16
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
--- a/examples/archived/code-llama/7b/lora.yml
+++ b/examples/archived/code-llama/7b/lora.yml
@@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out

 sequence_len: 4096
 sample_packing: true
-pad_to_sequence_len: true
+

 adapter: lora
 lora_model_dir:
@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
--- a/examples/archived/code-llama/7b/qlora.yml
+++ b/examples/archived/code-llama/7b/qlora.yml
@@ -20,7 +20,7 @@ lora_model_dir:

 sequence_len: 4096
 sample_packing: true
-pad_to_sequence_len: true
+

 lora_r: 32
 lora_alpha: 16
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
--- a/examples/archived/code-llama/README.md
+++ b/examples/archived/code-llama/README.md
--- a/examples/archived/dbrx/16bit-lora.yaml
+++ b/examples/archived/dbrx/16bit-lora.yaml
@@ -54,7 +54,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch:
 saves_per_epoch: 1

--- a/examples/archived/dbrx/8bit-lora.yaml
+++ b/examples/archived/dbrx/8bit-lora.yaml
@@ -57,7 +57,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch:
 saves_per_epoch: 1

--- a/examples/archived/dbrx/README.md
+++ b/examples/archived/dbrx/README.md
--- a/examples/archived/dbrx/fft-ds-zero3.yaml
+++ b/examples/archived/dbrx/fft-ds-zero3.yaml
@@ -41,7 +41,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch:
 saves_per_epoch: 1

--- a/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml
+++ b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml
@@ -21,7 +21,7 @@ output_dir: ./outputs/lora-out
 sequence_len: 4096
 sample_packing: true
 eval_sample_packing: false
-pad_to_sequence_len: true
+

 adapter: lora
 lora_model_dir:
@@ -51,7 +51,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
--- a/examples/archived/falcon/config-7b-lora.yml
+++ b/examples/archived/falcon/config-7b-lora.yml
@@ -47,7 +47,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 40
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
--- a/examples/archived/falcon/config-7b-qlora.yml
+++ b/examples/archived/falcon/config-7b-qlora.yml
@@ -77,7 +77,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.000001
--- a/examples/archived/falcon/config-7b.yml
+++ b/examples/archived/falcon/config-7b.yml
@@ -44,7 +44,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 40
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
--- a/examples/archived/gemma/qlora.yml
+++ b/examples/archived/gemma/qlora.yml
@@ -25,7 +25,7 @@ lora_target_linear: true
 sequence_len: 4096
 sample_packing: true
 eval_sample_packing: false
-pad_to_sequence_len: true
+

 wandb_project:
 wandb_entity:
--- a/examples/archived/gptj/qlora.yml
+++ b/examples/archived/gptj/qlora.yml
@@ -40,7 +40,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
--- a/examples/archived/jeopardy-bot/config.yml
+++ b/examples/archived/jeopardy-bot/config.yml
@@ -41,7 +41,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
--- a/examples/archived/mpt-7b/README.md
+++ b/examples/archived/mpt-7b/README.md
--- a/examples/archived/mpt-7b/config.yml
+++ b/examples/archived/mpt-7b/config.yml
@@ -42,7 +42,7 @@ logging_steps: 5
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0001
--- a/examples/archived/openllama-3b/README.md
+++ b/examples/archived/openllama-3b/README.md
--- a/examples/archived/openllama-3b/config.yml
+++ b/examples/archived/openllama-3b/config.yml
@@ -42,7 +42,7 @@ logging_steps: 1
 flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
--- a/examples/archived/openllama-3b/lora.yml
+++ b/examples/archived/openllama-3b/lora.yml
@@ -50,7 +50,7 @@ logging_steps: 1
 flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
--- a/examples/archived/openllama-3b/qlora.yml
+++ b/examples/archived/openllama-3b/qlora.yml
@@ -43,7 +43,7 @@ logging_steps: 1
 flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
--- a/examples/archived/pythia-12b/README.md
+++ b/examples/archived/pythia-12b/README.md
--- a/examples/archived/pythia-12b/config.yml
+++ b/examples/archived/pythia-12b/config.yml
--- a/examples/archived/pythia/lora.yml
+++ b/examples/archived/pythia/lora.yml
--- a/examples/archived/qwen/README.md
+++ b/examples/archived/qwen/README.md
--- a/examples/archived/qwen/lora.yml
+++ b/examples/archived/qwen/lora.yml
@@ -49,7 +49,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention:

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
--- a/examples/archived/qwen/qlora.yml
+++ b/examples/archived/qwen/qlora.yml
@@ -49,7 +49,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention:

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
--- a/examples/archived/qwen/qwen2-moe-lora.yaml
+++ b/examples/archived/qwen/qwen2-moe-lora.yaml
@@ -45,7 +45,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
--- a/examples/archived/qwen/qwen2-moe-qlora.yaml
+++ b/examples/archived/qwen/qwen2-moe-qlora.yaml
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
--- a/examples/archived/redpajama/README.md
+++ b/examples/archived/redpajama/README.md
--- a/examples/archived/redpajama/config-3b.yml
+++ b/examples/archived/redpajama/config-3b.yml
@@ -43,7 +43,7 @@ logging_steps: 5
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0001
--- a/examples/archived/replit-3b/config-lora.yml
+++ b/examples/archived/replit-3b/config-lora.yml
@@ -41,7 +41,7 @@ logging_steps: 1
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0
--- a/examples/archived/stablelm-2/1.6b/fft.yml
+++ b/examples/archived/stablelm-2/1.6b/fft.yml
@@ -16,7 +16,7 @@ output_dir: ./outputs/out

 sequence_len: 4096
 sample_packing: true
-pad_to_sequence_len: true
+

 adapter:
 lora_model_dir:
@@ -50,7 +50,7 @@ flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true

-warmup_steps: 100
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1

--- a/examples/archived/stablelm-2/1.6b/lora.yml
+++ b/examples/archived/stablelm-2/1.6b/lora.yml
@@ -19,7 +19,7 @@ output_dir: ./outputs/lora-out

 sequence_len: 4096
 sample_packing: true
-pad_to_sequence_len: true
+

 adapter: lora
 lora_model_dir:
@@ -51,7 +51,7 @@ flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
--- a/examples/archived/stablelm-2/README.md
+++ b/examples/archived/stablelm-2/README.md
--- a/examples/archived/starcoder2/qlora.yml
+++ b/examples/archived/starcoder2/qlora.yml
@@ -19,7 +19,7 @@ lora_model_dir:

 sequence_len: 8192
 sample_packing: true
-pad_to_sequence_len: true
+

 lora_r: 32
 lora_alpha: 16
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 20
+warmup_ratio: 0.1
 evals_per_epoch: 4
 eval_steps:
 saves_per_epoch: 4
--- a/examples/archived/tiny-llama/README.md
+++ b/examples/archived/tiny-llama/README.md
--- a/examples/archived/tiny-llama/lora-mps.yml
+++ b/examples/archived/tiny-llama/lora-mps.yml
@@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out

 sequence_len: 4096
 sample_packing: true
-pad_to_sequence_len: true
+
 eval_sample_packing: false

 adapter: lora
@@ -49,7 +49,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: false

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 0
 saves_per_epoch: 1
 weight_decay: 0.0
--- a/examples/archived/tiny-llama/lora.yml
+++ b/examples/archived/tiny-llama/lora.yml
@@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out
 sequence_len: 4096
 sample_packing: true
 eval_sample_packing: false
-pad_to_sequence_len: true
+

 adapter: lora
 lora_model_dir:
@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
--- a/examples/archived/tiny-llama/pretrain.yml
+++ b/examples/archived/tiny-llama/pretrain.yml
@@ -38,7 +38,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
--- a/examples/archived/tiny-llama/qlora.yml
+++ b/examples/archived/tiny-llama/qlora.yml
@@ -21,7 +21,7 @@ lora_model_dir:
 sequence_len: 4096
 sample_packing: true
 eval_sample_packing: false
-pad_to_sequence_len: true
+

 lora_r: 32
 lora_alpha: 16
@@ -49,7 +49,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
--- a/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml
+++ b/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml
@@ -75,7 +75,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
--- a/examples/archived/yi-34B-chat/README.md
+++ b/examples/archived/yi-34B-chat/README.md
--- a/examples/archived/yi-34B-chat/qlora.yml
+++ b/examples/archived/yi-34B-chat/qlora.yml
@@ -20,7 +20,7 @@ special_tokens:
 datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
-warmup_steps: 10
+warmup_ratio: 0.1

 # Iterations
 num_epochs: 1
--- a/examples/cohere/command-r-7b-qlora.yml
+++ b/examples/cohere/command-r-7b-qlora.yml
@@ -27,7 +27,7 @@ lora_target_linear: true
 sequence_len: 2048
 sample_packing: true
 eval_sample_packing: false
-pad_to_sequence_len: true
+

 wandb_project:
 wandb_entity:
@@ -35,7 +35,6 @@ wandb_watch:
 wandb_name:
 wandb_log_model:

-
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 num_epochs: 4
@@ -56,3 +55,5 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
+
+# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -40,7 +40,7 @@
        "%%capture\n",
        "# This step can take ~5-10 minutes to install dependencies\n",
        "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
-        "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@78b2a45713a54c9bedf8b33f5e31cf07a1a57154\""
+        "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@010c3ac3f1e725098961832830303eeb4142dd88\""
      ]
    },
    {
--- a/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
+++ b/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
@@ -21,7 +21,7 @@ output_dir: ./outputs/lora-out
 sequence_len: 4096
 sample_packing: true
 eval_sample_packing: false
-pad_to_sequence_len: true
+

 adapter: lora
 lora_model_dir:
@@ -51,8 +51,10 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
+
+# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
+++ b/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
@@ -21,7 +21,7 @@ output_dir: ./outputs/lora-out
 sequence_len: 4096
 sample_packing: true
 eval_sample_packing: false
-pad_to_sequence_len: true
+

 adapter: lora
 lora_model_dir:
@@ -51,8 +51,10 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 10
+warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
+
+# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/deepseek-v2/fft-fsdp-16b.yaml
+++ b/examples/deepseek-v2/fft-fsdp-16b.yaml
@@ -12,7 +12,7 @@ output_dir: ./outputs/out

 sequence_len: 2048
 sample_packing: true
-pad_to_sequence_len: true
+

 wandb_project:
 wandb_entity:
@@ -37,7 +37,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 100
+warmup_ratio: 0.1
 evals_per_epoch: 2
 saves_per_epoch: 1
 weight_decay: 0.0
@@ -55,3 +55,5 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
+
+# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/deepseek-v2/qlora-fsdp-2_5.yaml
+++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
@@ -30,7 +30,7 @@ output_dir: ./outputs/out

 sequence_len: 4096
 sample_packing: true
-pad_to_sequence_len: true
+

 wandb_project:
 wandb_entity:
@@ -61,7 +61,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true

-warmup_steps: 100
+warmup_ratio: 0.1
 evals_per_epoch: 2
 saves_per_epoch: 1
 weight_decay: 0.0
@@ -79,3 +79,5 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
+
+# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/devstral/README.md
+++ b/examples/devstral/README.md
@@ -1,8 +1,12 @@
 # Finetune Devstral with Axolotl

-Devstral Small is a 24B parameter opensource model from MistralAI found on HuggingFace [Devstral-Small-2505](https://huggingface.co/mistralai/Devstral-Small-2505). This guide shows how to fine-tune it with Axolotl with multi-turn conversations with proper masking.
+Devstral Small is a 24B parameter open-source model from MistralAI found on HuggingFace [Devstral-Small-2505](https://huggingface.co/mistralai/Devstral-Small-2505) and [Devstral-Small-2507](https://huggingface.co/mistralai/Devstral-Small-2507). `Devstral-Small-2507` is the latest version of the model and has [function calling](https://mistralai.github.io/mistral-common/usage/tools/) support.

-The model was fine-tuned ontop of [Mistral-Small-3.1](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503) without the vision layer and has a context of upto 128k tokens.
+This guide shows how to fine-tune it with Axolotl with multi-turn conversations with proper masking.
+
+The model was fine-tuned on top of [Mistral-Small-3.1](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503) without the vision layer and has a context of up to 128k tokens.
+
+Thanks to the team at MistralAI for giving us early access to prepare for this release.

 ## Getting started

@@ -17,11 +21,6 @@ cd axolotl

 pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation -e '.[flash-attn]'
-
-# Install the latest mistral-common from source
-pip3 uninstall mistral-common
-pip3 install git+https://github.com/mistralai/mistral-common.git@039465d
-
 ```

 2. Run the finetuning example:
@@ -39,6 +38,7 @@ Let us know how it goes. Happy finetuning! 🚀
 - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
 - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
 - The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
+- Learn how to use function calling with Axolotl at [docs](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#using-tool-use).

 ## Optimization Guides

@@ -57,6 +57,7 @@ In addition, we do not support overriding tokens yet.
 ## Related Resources

 - [MistralAI Devstral Blog](https://mistral.ai/news/devstral)
+- [MistralAI Devstral 1.1 Blog](https://mistral.ai/news/devstral-2507)
 - [Axolotl Docs](https://docs.axolotl.ai)
 - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
 - [Axolotl Website](https://axolotl.ai)
--- a/examples/devstral/devstral-small-qlora.yml
+++ b/examples/devstral/devstral-small-qlora.yml
@@ -1,4 +1,4 @@
-base_model: mistralai/Devstral-Small-2505
+base_model: mistralai/Devstral-Small-2507

 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
@@ -25,7 +25,7 @@ lora_model_dir:

 sequence_len: 2048
 sample_packing: true
-pad_to_sequence_len: true
+

 lora_r: 32
 lora_alpha: 16
@@ -62,3 +62,5 @@ saves_per_epoch: 1

 weight_decay: 0.0
 special_tokens:
+
+# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml
+++ b/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml
@@ -38,7 +38,7 @@ lora_target_modules:
 sequence_len: 2048
 sample_packing: false
 eval_sample_packing: false
-pad_to_sequence_len: true
+

 wandb_project:
 wandb_entity:
@@ -69,3 +69,5 @@ evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
+
+# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Wing Lian	08aa74e418	fix llama modeling	2025-07-30 11:37:58 -04:00
Wing Lian	dfa14f87ab	fix residuals and add llama support	2025-07-30 10:22:38 -04:00
Wing Lian	fbe1b504da	add custom modeling for gemma3 using liger fused add rms	2025-07-30 08:21:03 -04:00
Wing Lian	5b8370969c	actually call the register method on plugins	2025-07-30 08:05:25 -04:00
Wing Lian	22810c97b7	use warmup_ratio as a better default than warmup steps since it's data dependent (#2897 ) [skip ci] * use warmup_ratio as a better default than warmup steps since it's data dependent * replace remainder of warmup_steps	2025-07-30 06:44:06 -04:00
Vincenzo di Cicco	2eb7ff95af	Use '<\|finetune_right_pad\|>' as padding token for LLama4 (#2988 ) [skip ci]	2025-07-30 06:38:13 -04:00
NanoCode012	90e5598930	Feat: Add voxtral, magistral small 1.1, and misc gemma3n fixes (#2979 ) * fix: lock version in gemma3n docs * feat: add sample configs and docs * chore: move mistraltokenizer into mistral folder * feat: update instructions * feat: add dynamic load voxtral * fix: remove incorrect vision config, add audio * fix: support voxtral processing strategy and address none in data * feat: patch mistraltokenizer subclass upstream and add missing * feat: update cce commit to include voxtral * fix: remove old comment * fix: gemma3 patch not needed anymore * fix: voxtral modeling code * fix: remove incorrect ds path * fix: adjust apply chat template parsing * feat: enable voxtral patch * fix: patch * feat: update example datasets * fix: target layer * feat: update gemma3n docs * feat: update voxtral docs * feat: revert assistant parsing to rely on new upstream changes * chore: skip test till next PR fix * fix: override upstream decode due to missing handling * feat: update readme * fix: update * feat: add magistral small think support * feat: update mistral-common dep * fix: lint * fix: remove optional dep * chore: typing * chore: simply import * feat(doc): update differences for 2507 * fix: coderrabbit comments * feat: update clarify docs on new transformers	2025-07-30 15:57:05 +07:00
Wing Lian	1d2aa1e467	upgrade to support latest transformers release (#2984 ) * upgrade to support latest transformers release * bump mistral common too * Fix dependencies	2025-07-27 17:05:12 -04:00
NICOLAS BZRD	430be216d8	add shuffle_before_merging_datasets option to allow independent shuffling of datasets before merging (#2981 ) [skip ci]	2025-07-27 17:04:56 -04:00
Wing Lian	28804b82e4	don't create a reference model if grpo beta is 0.0 (#2983 ) [skip ci]	2025-07-27 17:04:42 -04:00
Wing Lian	add3e5076b	don't publish to netlify on contributor submissions since it requires auth tokens (#2985 ) [skip ci] * don't publish to netlify on contributor submissions since it requires auth tokens * fix no-tmux build and add contact to motd	2025-07-27 17:04:27 -04:00
NanoCode012	41434f0c28	feat(doc): add all providers to readme (#2972 ) [skip ci] * feat(doc): add vastai link * feat: add cloud providers to readme for more visibility * add prime intellect, remove Modal as sponsor --------- Co-authored-by: Wing Lian <wing@axolotl.ai>	2025-07-27 17:03:50 -04:00
Wing Lian	f7ea140838	TiledMLP support for FSDP2 (#2950 ) * make TiledMLP work with FSDP * cleanup/gc at start of train to prevent large VRAM spike * chore: lint * generic function for non-deepspeed training * unify patch to fix imports * update readme for ALST and add examples * make deepspeed attribute on params check more robust * update with new info from PR review	2025-07-25 07:15:03 -04:00
Wing Lian	460e0f9ed9	improve handling of file lock when content is empty (#2959 )	2025-07-24 16:10:38 -04:00
Wing Lian	e80faea0db	garbage collect on the end of the step if we're going to save a checkpoint (#2971 ) [skip ci]	2025-07-24 16:10:23 -04:00
Wing Lian	0ff2f172ef	Act offload lora fix (#2928 ) [skip ci] * fix activation offloading with lora * update w e2e test * add docs for error	2025-07-24 16:10:04 -04:00
salman	1407aac779	Skip CI for draft PRs (#2970 )	2025-07-24 09:11:46 +01:00
Dan Saunders	b34c3371ed	upgrade torchao (#2968 )	2025-07-23 10:27:28 -04:00
Wing Lian	5f1a4306b0	don't check dataset labels during preprocess for GRPO (#2952 ) [skip ci] * don't check dataset labels during preprocess for GRPO * use enum check per PR feedback	2025-07-22 20:40:44 -04:00
Wing Lian	93709eb5ce	handle refactor upstream for flash attention (#2966 )	2025-07-22 20:40:04 -04:00
Dan Saunders	208fb7b8e7	basic torchao fp8 mixed precision training (#2926 ) * debug * debug * debug * revert unneeded change * add accelerator config to base trainer builder * add back accumulated_cache_size_limit setting * lint * accelerator constructor patch for single-GPU torch fp8 * lint * re-using existing fp8 code * lint * remove accelerate patch now fix in latest release * fix * docs * add fp8 + fsdp2 example * remove unused config * update config * smoke tests * add validator * add 2.7.0 guard for fsdp2 * fix * add config descriptions * add FSDP doc link * nit * set force_recompute_fp8_weight_in_bwd with enable_fsdp_float8_all_gather * better cfg for smoke tests * add test for accelerate patching * update fp8 validator	2025-07-22 16:27:47 -04:00
Wing Lian	b86a1d47b0	we don't need to call check_dataset_labels when skip_prepare_dataset is set (#2962 ) * we don't need to call check_dataset_labels when skip_prepare_dataset is set * Fix actual bug and revert prior fix * warn and early return instead of raising an error * use error	2025-07-22 10:00:53 -04:00
NanoCode012	01d8175d48	fix: revert changing default optimizer to muon (#2965 ) [skip ci]	2025-07-22 10:00:30 -04:00
NanoCode012	631268a0ca	revert renaming of deepspeed stage3 args that use auto (#2964 ) [skip ci] * Revert "fix deprecate deepspeed stage3_gather_16bit_weights_on_model_save arg…" This reverts commit `e207762928`. * don't revert the values that don't use 'auto' --------- Co-authored-by: Wing Lian <wing@axolotl.ai>	2025-07-22 09:59:47 -04:00
Wing Lian	3a208cfd84	Autocomplete axolotl CLI (#2955 ) * static autocomplete script for axolotl cli * use list of commands that should autocomplete yaml files * make sure to chmod the autocomplete script as executable * shellcheck and fix autocompletion of directory/sub-dirs * more shellcheck fixes	2025-07-22 08:30:31 -04:00
github-actions[bot]	7267edc168	chore: update pre-commit hooks (#2954 ) [skip ci] Co-authored-by: djsaunde <1245942+djsaunde@users.noreply.github.com>	2025-07-22 08:30:00 -04:00
NanoCode012	dfba881e99	Feat: add gemma3n support (#2852 ) * feat: add gemma3n cce * feat: add sample config * feat: add gemma3n multimodal mode * feat: add audio example * feat: support audio and return pixel values in collator * feat: support unmask only assistant region (gemma3n for now) * feat(doc): add notes for audio loading * feat: add audio support for gemma3n * feat: update examples * feat: add gemma3n to the docs * fix: add link at top * feat(doc): clarify additional requirements * fix: mllama missing aspect ratio * fix: mllama need attention fixes for fa2 * Partially Revert "fix: mllama need attention fixes for fa2" This reverts commit `a0bfdd1777`. * fix: disable FA2 for mllama in vision mode * feat: update configs to use proper attention * fix: support other vision features * feat(doc): clarify requirements for gemma3n	2025-07-22 16:52:15 +07:00
Wing Lian	d32058e149	include torchvision in build for upstream changes requiring it now (#2953 ) [skip ci]	2025-07-22 04:19:16 -04:00
NanoCode012	bc1076d8a2	fix: suppress warning if we enabled skip prepare (#2958 )	2025-07-21 11:42:04 -04:00
Wing Lian	b7e8f66e5a	upstream fixes in cce for dora and tensor paralel support (#2960 ) [skip ci]	2025-07-21 11:41:53 -04:00
Wing Lian	e207762928	fix deprecate deepspeed stage3_gather_16bit_weights_on_model_save arg (#2956 ) [skip ci] * fix deprecate deepspeed stage3_gather_16bit_weights_on_model_save arg * replace the rest of the migrated deepspeed params	2025-07-21 11:41:31 -04:00
Wing Lian	fefb0797ee	better handling for reward function checks for GRPO (#2933 ) [skip ci] * better handling for reward function checks for GRPO * consolidate msg copy	2025-07-21 11:41:15 -04:00
Wing Lian	af8d257aa2	make pad_to_sequence_len default to the same value as sample_packing (#2941 ) [skip ci] * make pad_to_sequence_len default to the same value as sample_packing * remove duplicate validation * fix test * update description meta Co-authored-by: NanoCode012 <nano@axolotl.ai> --------- Co-authored-by: NanoCode012 <nano@axolotl.ai>	2025-07-21 11:40:56 -04:00
Wing Lian	db5f6f4693	limit num_proc when saving datasets to disk (#2948 ) [skip ci] * limit num_proc when saving datasets to disk * enforce at least 1 in case it rounds down to 0, and sane divisor is at least 8 rows per worker to save * update fixtures with dataset processes since that should never be NoneType * improve reusability for tests	2025-07-21 11:39:38 -04:00
Wing Lian	8e5f146701	Fix cloud docker image build and remove apt files for optim (#2961 ) * make sure to apt update to install sudo and tmux * remove apt archives too	2025-07-21 11:05:00 -04:00
Wing Lian	31a15a49b6	add additional packages via apt for better multi-node support (#2949 ) * cleanup in Dockerfile and add infiniband packages * fixes for ci * fix nightly too	2025-07-20 21:19:23 -04:00
NanoCode012	b986f7c7cb	fix: return proper attention for llama4 lora kernel and fsdp2 llama4 example fix (#2943 ) * fix: return proper attention for llama4 lora optim * fix: update fsdp2 llama4 config	2025-07-19 13:54:43 -04:00
salman	e5734e5cf0	adding torchtitan link (#2945 ) [skip ci]	2025-07-19 13:54:14 -04:00
Wing Lian	109d9c7442	make the initial call to tokenizer.pad not spam the console (#2946 ) [skip ci] * make the initial call to tokenizer.pad not spam the console * add guard from feedback * make another common console output less verbose * more logging fixes	2025-07-19 13:53:35 -04:00
Wing Lian	170322a1f0	make sure log level is upper (#2934 )	2025-07-17 15:32:55 -04:00
Wing Lian	5f5ae76213	add validation around cce + chunked_ce (#2932 ) [skip ci] * add validation around cce + chunked_ce * return on end of validation method	2025-07-17 15:32:38 -04:00
Wing Lian	a798975b7c	coderabbit manual settings (#2940 ) [skip ci]	2025-07-17 15:32:16 -04:00
Wing Lian	d23f972602	use state for wandb in callbacks (#2930 ) [skip ci]	2025-07-17 15:31:56 -04:00
Wing Lian	8e41317250	don't use include_tokens_per_second for GRPO (#2931 ) [skip ci] * don't use include_tokens_per_second for GRPO * use blocklist instead	2025-07-17 15:31:21 -04:00
Varun Gumma	9f2bb188a4	Improve Dataset Processing Multiprocessing, Sharding, and Qwen Tokenizer Bug Fix. (#2918 ) * Added a feature to save prepared dataset in specified shards, removed limiter on multiprocessing during tokenization, and a bug fix of qwen tokenizer * removed limiters and fixed config variable name * black lint * chore: lint * feat: update handling of dataset_processes --------- Co-authored-by: NanoCode012 <nano@axolotl.ai>	2025-07-17 09:47:58 -04:00
Wing Lian	9dde9e1b71	misc fixes 202507 (#2937 ) [skip ci] * misc fixes 202507 * manually handle attn class for llama4	2025-07-17 09:47:45 -04:00
Wing Lian	f2474ef941	bump accelerate to 1.9.0 (#2936 ) [skip ci]	2025-07-17 09:46:43 -04:00
Wing Lian	8a4bcacdb2	cu126-torch271 for cloud docker image should be tagged with main-latest (#2935 )	2025-07-17 00:01:23 -04:00
Wing Lian	d2c3d5a954	run nightly-vs-upstream-main on 2.7.1 and multi-gpu also (#2929 ) [skip ci]	2025-07-16 21:45:42 -04:00
Wing Lian	36cbe13d18	activation offloading with cuda streams doesn't work with LoRA (#2927 )	2025-07-16 11:59:20 -04:00
Wing Lian	2c408b5c5e	Apply generic fused liger ce, cce, and tiledmlp for arbitrary models (#2908 ) * Apply generic fused liger ce for unknown models * fix deepseek liger modeling * generic cce and config tiled mlp to use original mlp and auto detect compute params * fix weight and lint * update warnings * address PR feedback * use lookup for model class prefixes * revert inadvertent change to flash attn verison * remove un-needed pylint annotations * fix import	2025-07-15 22:40:41 -04:00
Wing Lian	942005f526	use modal==1.0.2 for nightlies and for cli (#2925 ) [skip ci] * use modal==1.0.2 for nightlies and for cli * use latest cce fork for upstream changes * increase timeout	2025-07-15 20:31:23 -04:00
Dan Saunders	10ba1622f7	checkpoint model on first step callback (#2906 ) * checkpoint model on first step callback * remove debug * add test cases; update existing tests not to save on first step * move test out of solo * delete * default to False * typo	2025-07-15 15:00:48 -04:00
Wing Lian	d320ef6199	fix for upstream refactor of KwargsForCausalLM (#2911 )	2025-07-15 11:28:41 -04:00
NanoCode012	354eaaf0d3	feat: add call method to mistral tokenizer wrapper (#2898 )	2025-07-14 22:33:35 -04:00
greenhestu	a061446540	Fix: Prevents merging of tool arguments during preprocessing (#2909 )	2025-07-14 22:33:10 -04:00
Wing Lian	cd079b5536	Tensor parallel w DeepSpeed AutoTP (#2574 ) * support for deepspeed autotup * bump to latest deepspeed that supports deepcompile too * add deepcompile support too * fix total steps calculation for TP * setup fixture for tp * update ds config to ensure weights are gathered for checkpoint * fix duplicate validation names * chore: lint	2025-07-14 21:33:48 -04:00
Wing Lian	5cc16040a8	move the plugin post trainer create to the setup trainer (#2907 ) * move the plugin post trainer create to the setup trainer * move post-train plugins to execute-training fn	2025-07-14 20:11:33 -04:00
Wing Lian	38359a8997	allow profiling in mid-training rather from the start (#2899 ) [skip ci] * allow profiling in mid-training rather from the start * simplify based on PR feedback * fix logic, improve saving at end, add tests	2025-07-14 20:11:11 -04:00
Wing Lian	7dc3ac6cb3	update nightlies builds (#2921 ) [skip ci]	2025-07-14 20:10:43 -04:00
Wing Lian	99187cd208	Activation Offloading w CUDA Streams (#2900 ) [skip ci] * use cuda streams for activation offloading * use torch native ops * update cfg schema for streams * fix literal constructor for set * use context for training step so it doesn't affect evals * disable streams * auto gc on eval steps * use activation_offloading config arg * add docs for gradient checkpointing * handle validation for gc/ao * use cuda streams for act offloading * add more validation for AC w/o GC * fix docs * move activation_offloading lower in definition so it doesn't break args/kwargs * fix kd due to import order	2025-07-14 20:10:20 -04:00
Wing Lian	aa684122f1	upgrade peft==0.16.0 and datasets==4.0.0 (#2917 ) [skip ci] * upgrade peft to 0.16.0 * upgrade datasets to 4.0.0 * refactor dupes from merge/rebase * fix check for fsdp1 + sharded_state_dict * use full state dict for ci	2025-07-14 20:09:26 -04:00
Wing Lian	ca4d4ef793	don't init distributed for deepspeed if preprocessing (#2920 ) * don't init distributed for deepspeed if preprocessing * add e2e test to validate preprocess cli with deepspeed * ignore duplicate code for cfg	2025-07-14 14:19:19 -04:00
Dan Saunders	37edbe4999	Remove extra torch.compile call (#2904 ) * debug * debug * debug * moving validation code to transformers * revert unneeded change * add accelerator config to base trainer builder * add back accumulated_cache_size_limit setting * lint	2025-07-14 12:32:45 -04:00
Wing Lian	e581c15d40	refactor dupes from merge/rebase (#2919 ) [skip ci]	2025-07-14 10:05:26 -04:00
Wing Lian	af92151a7b	FSDP2 fix validation and add tests (#2910 ) * fix validation and add tests * remove debugging and add more tests * remove migrate_fsdp	2025-07-14 09:25:44 -04:00
Wing Lian	80dc4c261a	fix xformers version for python 2.6 (#2916 ) [skip ci]	2025-07-14 09:24:29 -04:00
Wing Lian	7ccbbd8e77	upgrade liger to 0.6.0 (#2893 ) [skip ci]	2025-07-14 09:24:07 -04:00
Wing Lian	5081db7f8a	upgrade trl==0.19.1 (#2892 ) [skip ci] * upgrade trl==0.19.1 * add vllm for tests for grpo * fixes to work with latest trl * need data_parallel_size config too * support for vllm_mode for server / colocate * vllm settings for colocate * relax vllm version * bump min hf hub for latest vllm support * add hints on string literal for vllm mode * use latest transformers 4.53.2 * tweak acceptable loss on flaky test_ds_zero3_packed test * don't run flaky vllm/grpo tests for now	2025-07-14 09:23:42 -04:00
Wing Lian	41664c7c4c	fix ddp for incorrect steps (#2915 ) * fix ddp for incorrect steps * add test	2025-07-14 07:51:16 -04:00
Wing Lian	9a8073e73d	Liquid Foundation Model 2 support (#2905 ) * LFM2 support * docs * packing seems to work * update install to force install in case already on dev version * default to use chunked cross entropy	2025-07-12 11:41:34 -04:00
Jiawei Liu	7fb8441e0e	fix: customized dataset with simpo (#2894 ) [skip ci]	2025-07-12 11:40:30 -04:00
NanoCode012	4dc5910e1c	feat(doc): re-add docker 2.7.0 tag back (#2902 ) [skip ci]	2025-07-12 11:40:01 -04:00
Wing Lian	fb7bc9250d	move unmaintained examples to archive (#2903 ) [skip ci]	2025-07-12 11:39:51 -04:00
salman	d6e4a611e5	FSDP1 -> FSDP2 (#2760 ) * FSDP2 args migration implementation This commit implements the migration to FSDP2 arguments including: - FSDP2 support with LoRA training - DPO integration with FSDP2 - Model loading fixes and refactoring - CPU offloading and PEFT handling - Test updates and CI improvements - Bug fixes for dtype errors and various edge cases	2025-07-12 15:18:01 +01:00
Ed Sealing	eb662557a7	Register Plugins in Ray Workers (#2901 ) [skip ci] * Access plugins in ray cluster * Add comment * chore: lint --------- Co-authored-by: Ed Sealing <ed.sealing@patapsco.ai> Co-authored-by: Wing Lian <wing@axolotl.ai>	2025-07-11 16:59:59 -04:00
salman	03b2a113fe	Update doc preview workflow to use sticky comments (#2873 )	2025-07-11 14:08:35 +01:00
NanoCode012	9b95a625ab	feat: add devstral small 2507 (#2896 ) * feat: add devstral small 2507 * chore: update blog doc	2025-07-11 09:34:19 +07:00
Wing Lian	c370d0795c	[doc] Fix docs for text field mapping for completion datasets (#2890 ) * Fix docs for text field mapping for completion datasets * update another reference	2025-07-09 14:52:44 -04:00
Wing Lian	76aeb16156	tiled_mlp supports single gpu (#2891 ) * tiled_mlp supports single gpu * use checkpoint offloading for arctic training * patch torch checkpoint too * support for single gpu zero3 * add linkback to where it was copied from	2025-07-09 12:48:22 -04:00
Wing Lian	7c5ea0010f	bump dev version (#2889 ) [skip ci]	2025-07-09 09:43:42 -04:00