Revert "checkpoint model on first step callback (#2906 )"

This reverts commit 10ba1622f7.
checkpoint model on first step callback (#2906 )
2025-07-15 15:01:12 -04:00 · 2025-07-15 15:00:48 -04:00 · 2025-07-15 11:28:41 -04:00 · 2025-07-14 22:33:35 -04:00 · 2025-07-14 22:33:10 -04:00 · 2025-07-14 21:33:48 -04:00
215 changed files with 15788 additions and 1596 deletions
--- a/.bandit
+++ b/.bandit
@@ -1,3 +1,3 @@
 [bandit]
 exclude = tests
-skips = B101
+skips = B101,B615
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -5,11 +5,13 @@ on:
    branches:
      - "main"
    paths:
-      - 'Dockerfile-base'
+      - 'docker/Dockerfile-base'
+      - 'docker/Dockerfile-uv-base'
      - '.github/workflows/base.yml'
  pull_request:
    paths:
-      - 'Dockerfile-base'
+      - 'docker/Dockerfile-base'
+      - 'docker/Dockerfile-uv-base'
      - '.github/workflows/base.yml'
  workflow_dispatch:

@@ -27,11 +29,11 @@ jobs:
            cuda_version: 12.4.1
            cudnn_version: ""
            python_version: "3.11"
-            pytorch: 2.5.1
+            pytorch: 2.6.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-base"
-          - cuda: "124"
-            cuda_version: 12.4.1
+          - cuda: "126"
+            cuda_version: 12.6.3
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.6.0
@@ -41,7 +43,7 @@ jobs:
            cuda_version: 12.6.3
            cudnn_version: ""
            python_version: "3.11"
-            pytorch: 2.6.0
+            pytorch: 2.7.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-base"
          - cuda: "126"
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -15,17 +15,16 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.5.1
-            axolotl_extras:
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.6.0
+            axolotl_extras:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.0
            axolotl_extras: vllm
-            is_latest: true
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
@@ -83,17 +82,17 @@ jobs:
    strategy:
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.5.1
-            axolotl_extras:
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.6.0
            axolotl_extras:
            is_latest: true
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.0
+            axolotl_extras:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
@@ -146,8 +145,8 @@ jobs:
    strategy:
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.6.0
            axolotl_extras:
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -26,18 +26,18 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.6.0
-            axolotl_extras: vllm
+            axolotl_extras:
            num_gpus: 2
            nightly_build: "true"
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.5.1
-            axolotl_extras:
+            pytorch: 2.7.0
+            axolotl_extras: vllm
            num_gpus: 2
            nightly_build: "true"
          - cuda: 126
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -12,16 +12,16 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.5.1
-            axolotl_extras:
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.6.0
            axolotl_extras:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.1
+            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
@@ -65,16 +65,16 @@ jobs:
    strategy:
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.5.1
-            axolotl_extras:
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.6.0
            axolotl_extras:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.1
+            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
--- a/.github/workflows/preview-docs.yml
+++ b/.github/workflows/preview-docs.yml
@@ -28,6 +28,8 @@ jobs:
    steps:
      - name: Check out repository
        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}

      - name: Set up Quarto
        uses: quarto-dev/quarto-actions/setup@v2
@@ -50,10 +52,11 @@ jobs:

      - name: Netlify Publish
        uses: nwtgck/actions-netlify@v3.0
+        id: netlify
        with:
          publish-dir: './_site'
-          enable-pull-request-comment: true
-          enable-github-deployment: true
+          enable-pull-request-comment: false
+          enable-github-deployment: false
          github-token: ${{ secrets.GITHUB_TOKEN }}
          deploy-message: "Deployed On Netlify"
          github-deployment-environment: 'preview'
@@ -61,3 +64,13 @@ jobs:
        env:
          NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
          NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}
+
+      - name: Update PR with preview link
+        if: ${{ steps.netlify.outcome == 'success' }}
+        uses: marocchino/sticky-pull-request-comment@v2
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          message: |
+            📖 **Documentation Preview**: ${{ steps.netlify.outputs.deploy-url }}
+
+            Deployed on Netlify from commit ${{ github.event.pull_request.head.sha }}
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -18,116 +18,26 @@ jobs:
        env:
          SKIP: no-commit-to-branch

-  preload-cache:
-    name: Preload HF cache
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python_version: ["3.11"]
-        pytorch_version: ["2.6.0"]
-    timeout-minutes: 20
-
-    env:
-      AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
-
-    steps:
-      - name: Check out repository code
-        uses: actions/checkout@v4
-
-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python_version }}
-          cache: 'pip' # caching pip dependencies
-
-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
-
-      - name: Install PyTorch
-        run: |
-          pip3 install torch==${{ matrix.pytorch_version }}
-
-      - name: Install dependencies
-        run: |
-          pip3 show torch
-          pip3 install --no-build-isolation -U -e .
-          python scripts/unsloth_install.py | sh
-          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
-
-      - name: Make sure PyTorch version wasn't clobbered
-        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
-
-      - name: Ensure axolotl CLI was installed
-        run: |
-          axolotl --help
-
-      - name: Pre-Download dataset fixture
-        run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-
-      - name: Run tests
-        run: |
-          pytest -v tests/conftest.py
-
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
-        with:
-          token: ${{ secrets.CODECOV_TOKEN }}
-          files: ./coverage.xml
-          flags: unittests,pytorch-${{ matrix.pytorch_version }}
-          fail_ci_if_error: false
-
-      - name: cleanup pip cache
-        run: |
-          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
-
-      - name: Save HF cache
-        id: hf-cache
-        uses: actions/cache/save@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
-
  pytest:
    name: PyTest
    runs-on: ubuntu-latest
-    needs: [preload-cache]
    strategy:
      fail-fast: false
      max-parallel: 2
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
+        pytorch_version: ["2.6.0", "2.7.0"]
    timeout-minutes: 20

    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
+      - name: Restore Cache from S3
+        id: hf-cache-restore-s3
+        run: |
+          mkdir -p /home/runner/.cache/huggingface/hub
+          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/  --use-compress-program unzstd

      - name: Setup Python
        uses: actions/setup-python@v5
@@ -168,15 +78,11 @@ jobs:
        run: |
          axolotl --help

-      - name: Pre-Download dataset fixture
-        run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-
      - name: Run tests
        run: |
-          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
-          pytest -v tests/patched/
-          pytest -v tests/cli/
+          pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
+          pytest -v --durations=10 tests/patched/
+          pytest -v --durations=10 tests/cli/

      - name: cleanup pip cache
        run: |
@@ -193,15 +99,8 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.5.1
-            num_gpus: 1
-            axolotl_extras:
-            nightly_build: "true"
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.6.0
            num_gpus: 1
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -52,7 +52,7 @@ jobs:
      fail-fast: false
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.5.1", "2.6.0", "2.7.1"]
+        pytorch_version: ["2.6.0", "2.7.0", "2.7.1"]
    timeout-minutes: 20

    steps:
@@ -102,9 +102,9 @@ jobs:

      - name: Run tests
        run: |
-          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ --cov=axolotl --cov-report=xml
-          pytest -v tests/patched/ --cov=axolotl --cov-append --cov-report=xml
-          pytest -v tests/cli/ --cov=axolotl --cov-append --cov-report=xml
+          pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ --cov=axolotl --cov-report=xml
+          pytest -v --durations=10 tests/patched/ --cov=axolotl --cov-append --cov-report=xml
+          pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v5
@@ -125,7 +125,7 @@ jobs:
      fail-fast: false
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.5.1", "2.6.0", "2.7.1"]
+        pytorch_version: ["2.6.0", "2.7.0", "2.7.1"]
    timeout-minutes: 20

    steps:
@@ -175,9 +175,9 @@ jobs:

      - name: Run tests
        run: |
-          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
-          pytest -v tests/patched/
-          pytest -v tests/cli/
+          pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
+          pytest -v --durations=10 tests/patched/
+          pytest -v --durations=10 tests/cli/

      - name: cleanup pip cache
        run: |
@@ -195,12 +195,12 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.6.0
+            pytorch: 2.7.1
            num_gpus: 1
-            axolotl_extras: vllm
+            axolotl_extras:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
@@ -247,22 +247,10 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            num_gpus: 1
-            axolotl_extras: llmcompressor
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.5.1
-            num_gpus: 1
-            axolotl_extras:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.6.0
            num_gpus: 1
            axolotl_extras:
          - cuda: 128
@@ -311,7 +299,7 @@ jobs:
            python_version: "3.11"
            pytorch: 2.6.0
            num_gpus: 1
-            axolotl_extras: vllm
+            axolotl_extras:
    steps:
      - name: Checkout
        uses: actions/checkout@v4
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,7 +19,7 @@ repos:
    hooks:
      - id: isort
 -   repo: https://github.com/PyCQA/flake8
-    rev: 7.2.0
+    rev: 7.3.0
    hooks:
    - id: flake8
 -   repo: https://github.com/pylint-dev/pylint
@@ -27,7 +27,7 @@ repos:
    hooks:
    - id: pylint
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.16.0
+    rev: v1.16.1
    hooks:
    - id: mypy
      additional_dependencies:
@@ -36,7 +36,7 @@ repos:
            'pydantic>=2.5.3',
        ]
 -   repo: https://github.com/PyCQA/bandit
-    rev: 1.8.3
+    rev: 1.8.6
    hooks:
    -   id: bandit
        args: [
--- a/.runpod/src/config/config.yaml
+++ b/.runpod/src/config/config.yaml
@@ -97,7 +97,7 @@
 #       # 'no_input_format' cannot include {input}
 #       no_input_format: "{instruction} "

-#       # For `completion` datsets only, uses the provided field instead of `text` column
+#       # For `completion` datasets only, uses the provided field instead of `text` column
 #       field:

 # # Axolotl attempts to save the dataset as an arrow after packing the data together so
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,4 +2,5 @@ include requirements.txt
 include README.md
 include LICENSE
 include src/setuptools_axolotl_dynamic_dependencies.py
+include src/axolotl/utils/chat_templates/templates/*.jinja
 recursive-include axolotl *.py
--- a/README.md
+++ b/README.md
@@ -43,7 +43,7 @@ Features:
 - **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more. We are compatible with HuggingFace transformers causal language models.
 - **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO), Multimodal, and Reward Modelling (RM) / Process Reward Modelling (PRM).
 - **Easy Configuration**: Re-use a single YAML file between dataset preprocess, training, evaluation, quantization, and inference.
- **Performance Optimizations**: [Multipacking](https://docs.axolotl.ai/docs/multipack.html), [Flash Attention](https://github.com/Dao-AILab/flash-attention), [Xformers](https://github.com/facebookresearch/xformers), [Flex Attention](https://pytorch.org/blog/flexattention/), [Liger Kernel](https://github.com/linkedin/Liger-Kernel), [Cut Cross Entropy](https://github.com/apple/ml-cross-entropy/tree/main), Sequence Parallelism (SP), LoRA optimizations, Multi-GPU training (FSDP1, FSDP2, DeepSpeed), Multi-node training (Torchrun, Ray), and many more!
+- **Performance Optimizations**: [Multipacking](https://docs.axolotl.ai/docs/multipack.html), [Flash Attention](https://github.com/Dao-AILab/flash-attention), [Xformers](https://github.com/facebookresearch/xformers), [Flex Attention](https://pytorch.org/blog/flexattention/), [Liger Kernel](https://github.com/linkedin/Liger-Kernel), [Cut Cross Entropy](https://github.com/apple/ml-cross-entropy/tree/main), [Sequence Parallelism (SP)](https://docs.axolotl.ai/docs/sequence_parallelism.html), [LoRA optimizations](https://docs.axolotl.ai/docs/lora_optims.html), [Multi-GPU training (FSDP1, FSDP2, DeepSpeed)](https://docs.axolotl.ai/docs/multi-gpu.html), [Multi-node training (Torchrun, Ray)](https://docs.axolotl.ai/docs/multi-node.html), and many more!
 - **Flexible Dataset Handling**: Load from local, HuggingFace, and cloud (S3, Azure, GCP, OCI) datasets.
 - **Cloud Ready**: We ship [Docker images](https://hub.docker.com/u/axolotlai) and also [PyPI packages](https://pypi.org/project/axolotl/) for use on cloud platforms and local hardware.

@@ -55,10 +55,12 @@ Features:

 - NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
 - Python 3.11
- PyTorch ≥2.5.1
+- PyTorch ≥2.6.0

 ### Installation

+#### Using pip
+
 ```bash
 pip3 install -U packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
@@ -68,6 +70,13 @@ axolotl fetch examples
 axolotl fetch deepspeed_configs  # OPTIONAL
 ```

+#### Using Docker
+
+Installing with Docker can be less error-prone than installing in your own environment.
+```bash
+docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
+```
+
 Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).

 ### Your First Fine-tune
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -276,6 +276,7 @@ website:
            - docs/torchao.qmd
            - docs/custom_integrations.qmd
            - docs/sequence_parallelism.qmd
+            - docs/gradient_checkpointing.qmd

        - section: "Troubleshooting"
          contents:
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -9,6 +9,7 @@ ENV GITHUB_REF="{{ GITHUB_REF }}"
 ENV GITHUB_SHA="{{ GITHUB_SHA }}"
 ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
 ENV HF_HOME="{{ HF_HOME }}"
+ENV AXOLOTL_DATASET_PROCESSES="8"

 RUN apt-get update && \
    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -24,9 +24,9 @@ df_template = template_env.get_template("Dockerfile.jinja")
 df_args = {
    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.5.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu124-2.5.1"),
-    "CUDA": os.environ.get("CUDA", "124"),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"),
+    "CUDA": os.environ.get("CUDA", "126"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
--- a/cicd/single_gpu.py
+++ b/cicd/single_gpu.py
@@ -24,14 +24,16 @@ df_template = template_env.get_template(dockerfile)
 df_args = {
    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.5.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu124-2.5.1"),
-    "CUDA": os.environ.get("CUDA", "124"),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"),
+    "CUDA": os.environ.get("CUDA", "126"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
    "HF_HOME": "/workspace/data/huggingface-cache/hub",
+    "PYTHONUNBUFFERED": os.environ.get("PYTHONUNBUFFERED", "1"),
+    "DEEPSPEED_LOG_LEVEL": os.environ.get("DEEPSPEED_LOG_LEVEL", "WARNING"),
 }

 dockerfile_contents = df_template.render(**df_args)
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -38,6 +38,6 @@ RUN git lfs install --skip-repo && \
    # The base image ships with `pydantic==1.8.2` which is not working
    pip3 install -U --no-cache-dir pydantic==1.10.10

-RUN if [ "$PYTORCH_VERSION" = "2.7.1" ] ; then \
-        pip3 install flash-attn==2.7.4.post1; \
+RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \
+        FLASH_ATTENTION_FORCE_BUILD="TRUE" pip3 install --no-build-isolation flash-attn==2.8.0.post2; \
    fi
--- a/docker/Dockerfile-uv-base
+++ b/docker/Dockerfile-uv-base
@@ -34,7 +34,3 @@ RUN uv pip install packaging setuptools wheel psutil \
    && uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \
    && uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \
    && uv pip install awscli pydantic
-
-RUN if [ "$PYTORCH_VERSION" = "2.7.1" ] ; then \
-        uv pip install --no-build-isolation flash-attn==2.7.4.post1; \
-    fi
--- a/docs/custom_integrations.qmd
+++ b/docs/custom_integrations.qmd
@@ -7,6 +7,7 @@ toc-depth: 3
 ```{python}
 #| echo: false

+import os
 import re

 def process_readme(integration_name):
@@ -53,6 +54,24 @@ sections = [
    ("LLMCompressor", "llm_compressor")
 ]

+for folder_name in os.listdir("../src/axolotl/integrations/"):
+    if folder_name in [path for name, path in sections]:
+        # skip if already in sections
+        continue
+    if os.path.exists(f"../src/axolotl/integrations/{folder_name}/README.md"):
+        # grab the first heading in README.md as the section name
+        with open(f"../src/axolotl/integrations/{folder_name}/README.md", "r") as f:
+            txt = f.read()
+            matches = re.search(r'^# (.*)\n?', txt, flags=re.MULTILINE)
+            if matches:
+                name = matches.group(1)
+            else:
+                continue
+            sections.append((name, folder_name))
+
+# sort sections by name
+sections = sorted(sections, key=lambda x: x[0])
+
 for section_name, folder_name in sections:
    print(print_section(section_name, folder_name))
 ```
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -9,7 +9,7 @@ order: 3
 Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. Support using tokenizer's template, a supported template, or custom jinja2.

 ```{.json filename="data.jsonl"}
-{"conversations": [{"role": "...", "content": "..."}]}
+{"messages": [{"role": "...", "content": "..."}, {"role": "...", "content": "..."}, ...]}
 ```

 See [configs](../config-reference.qmd) for full configs and supported templates.
@@ -187,6 +187,7 @@ Instead of passing `tools` via the system prompt, an alternative method would be
            "role": "assistant", // call the function via assistant
            "tool_calls": [
                {
+                    "id": "...",  // required only for mistral
                    "type": "function",
                    "function": {
                        "name": "...",
@@ -199,6 +200,7 @@ Instead of passing `tools` via the system prompt, an alternative method would be
        },
        {
            "role": "tool",
+            "tool_call_id": "...",  // required only for mistral
            "name": "...",
            "content": "..."
        },
--- a/docs/docker.qmd
+++ b/docs/docker.qmd
@@ -9,7 +9,7 @@ format:
 This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).

 ::: {.callout-important}
-For Blackwell GPUs, please use the tags with Pytorch 2.7.1 and CUDA 12.8.
+For Blackwell GPUs, please use the tags with PyTorch 2.7.1 and CUDA 12.8.
 :::

 ## Base
@@ -34,8 +34,9 @@ Tags examples:

 - `main-base-py3.11-cu128-2.7.1`
 - `main-base-py3.11-cu126-2.7.1`
+- `main-base-py3.11-cu126-2.7.0`
+- `main-base-py3.11-cu126-2.6.0`
 - `main-base-py3.11-cu124-2.6.0`
- `main-base-py3.11-cu124-2.5.1`

 ## Main

@@ -73,13 +74,15 @@ There may be some extra tags appended to the image, like `-vllm` which installs

 Tags examples:

+- `main-py3.11-cu128-2.7.1`
+- `main-py3.11-cu126-2.7.1`
 - `main-py3.11-cu126-2.7.0`
+- `main-py3.11-cu126-2.6.0`
 - `main-py3.11-cu124-2.6.0`
- `main-py3.11-cu124-2.5.1`
 - `main-latest`
 - `main-20250303-py3.11-cu124-2.6.0`
- `main-20250303-py3.11-cu124-2.5.1`
- `0.9.2`
+- `main-20250303-py3.11-cu126-2.6.0`
+- `0.10.1`

 ## Cloud

--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -51,6 +51,18 @@ description: Frequently asked questions
 >   pad_token: "..."
 > ```

+**Q: `IterableDataset error` or `KeyError: 'input_ids'` when using `preprocess` CLI**
+
+> A: These errors occur when the `preprocess` CLI is used with `pretraining_dataset:` or `skip_prepare_dataset: true`, respectively. Please use the `axolotl train` CLI directly instead, as these datasets are prepared on demand.
+
+**Q: vLLM is not working with Axolotl**
+
+> A: We currently recommend torch 2.6.0 for use with `vllm`. Please ensure you use the right version. For Docker, please use the `main-py3.11-cu124-2.6.0` tag.
+
+**Q: FA2 2.8.0 `undefined symbol` runtime error on CUDA 12.4**
+
+> A: There seems to be a wheel issue with FA2 2.8.0 on CUDA 12.4. Try CUDA 12.6 instead or downgrade to FA2 2.7.4. Please refer to the upstream issue: https://github.com/Dao-AILab/flash-attention/issues/1717.
+
 ### Chat templates

 **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
--- a/docs/fsdp_qlora.qmd
+++ b/docs/fsdp_qlora.qmd
@@ -20,7 +20,7 @@ To enable `QLoRA` with `FSDP`, you need to perform the following steps:
 > See the [example config](#example-config) file in addition to reading these instructions.

 1. Set `adapter: qlora` in your axolotl config file.
-2. Enable FSDP in your axolotl config, as [described here](https://github.com/axolotl-ai-cloud/axolotl?tab=readme-ov-file#fsdp).
+2. Enable FSDP in your axolotl config, as [described here](multi-gpu.qmd#sec-fsdp).
 3. Use one of the supported model types: `llama`, `mistral` or `mixtral`.

 ## Example Config
--- a/docs/gradient_checkpointing.qmd
+++ b/docs/gradient_checkpointing.qmd
@@ -0,0 +1,29 @@
+---
+title: Gradient Checkpointing and Activation Offloading
+---
+
+Gradient checkpointing and activation offloading are techniques that reduce the GPU memory footprint of training deep
+learning models, at the cost of some extra computation (recomputing activations) or data transfer (offloading them).
+
+### Enabling Gradient Checkpointing
+
+```yaml
+gradient_checkpointing: true
+```
+
+### Enabling Activation Offloading
+
+```yaml
+gradient_checkpointing: true  # required for activation offloading
+activation_offloading: true
+```
+
+Activation offloading variants:
+
+The default `activation_offloading: true` offloads activations to CPU and uses CUDA streams
+to overlap the communications and computations when offloading.
+
+The `activation_offloading: legacy` option naively offloads activations to CPU without additional optimizations.
+
+For resource constrained environments with limited CPU memory, `activation_offloading: disk` offloads
+activations to disk instead of CPU RAM so that much larger context lengths can be trained with minimal memory.
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -15,7 +15,7 @@ This guide covers all the ways you can install and set up Axolotl for your envir

 - NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
 - Python ≥3.11
- PyTorch ≥2.5.1
+- PyTorch ≥2.6.0

 ## Installation Methods {#sec-installation-methods}

--- a/docs/multi-gpu.qmd
+++ b/docs/multi-gpu.qmd
@@ -23,8 +23,6 @@ Axolotl supports several methods for multi-GPU training:

 ## DeepSpeed {#sec-deepspeed}

-DeepSpeed is the recommended approach for multi-GPU training due to its stability and performance. It provides various optimization levels through ZeRO stages.
-
 ### Configuration {#sec-deepspeed-config}

 Add to your YAML config:
@@ -32,7 +30,6 @@ Add to your YAML config:
 ```{.yaml}
 deepspeed: deepspeed_configs/zero1.json
 ```
-
 ### Usage {#sec-deepspeed-usage}

 ```{.bash}
@@ -66,9 +63,75 @@ Start from Stage 1 -> Stage 2 -> Stage 3.

 :::

-## FSDP {#sec-fsdp}
+::: {.callout-tip}

-### Basic FSDP Configuration {#sec-fsdp-config}
+Using ZeRO Stage 3 with Single-GPU training
+
+ZeRO Stage 3 can be used for training on a single GPU by manually setting the environment variables:
+`WORLD_SIZE=1 LOCAL_RANK=0 MASTER_ADDR=0.0.0.0 MASTER_PORT=29500`
+
+:::
+
+## Fully Sharded Data Parallel (FSDP) {#sec-fsdp}
+
+::: {.callout-note}
+
+FSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl.
+
+:::
+
+### Migrating from FSDP1 to FSDP2 {#sec-migrate-fsdp1-fsdp2}
+
+To migrate your config from FSDP1 to FSDP2, you must use the `fsdp_version` top-level config field to specify the FSDP version, and
+also follow the config field mapping below to update field names.
+
+#### Config mapping
+
+FSDP1 | FSDP2
+-------- | --------
+fsdp_sharding_strategy | reshard_after_forward
+fsdp_backward_prefetch_policy | **REMOVED**
+fsdp_backward_prefetch | **REMOVED**
+fsdp_forward_prefetch | **REMOVED**
+fsdp_sync_module_states | **REMOVED**
+fsdp_cpu_ram_efficient_loading | cpu_ram_efficient_loading
+fsdp_state_dict_type | state_dict_type
+fsdp_use_orig_params | **REMOVED**
+
+
+For example, if you were using the following FSDP1 config:
+
+```{.yaml}
+fsdp_version: 1
+fsdp_config:
+  fsdp_offload_params: false
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_sharding_strategy: FULL_SHARD
+```
+
+You can migrate to the following FSDP2 config:
+
+```{.yaml}
+fsdp_version: 2
+fsdp_config:
+  offload_params: false
+  cpu_ram_efficient_loading: true
+  auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  transformer_layer_cls_to_wrap: Qwen3DecoderLayer
+  state_dict_type: FULL_STATE_DICT
+  reshard_after_forward: true
+```
+
+### FSDP1 (deprecated) {#sec-fsdp-config}
+
+::: {.callout-note}
+
+Using `fsdp` to configure FSDP is deprecated and will be removed in an upcoming release of Axolotl. Please use `fsdp_config` as above instead.
+
+:::

 ```{.yaml}
 fsdp:
@@ -80,6 +143,7 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
 ```

+
 ## Sequence parallelism {#sec-sequence-parallelism}

 We support sequence parallelism (SP) via the
--- a/docs/multi-node.qmd
+++ b/docs/multi-node.qmd
@@ -40,13 +40,13 @@ use_cpu: false

 Configure your model to use FSDP in the Axolotl yaml. For example:
 ```yaml
-fsdp:
-  - full_shard
-  - auto_wrap
+fsdp_version: 2
 fsdp_config:
-  fsdp_offload_params: true
-  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
+  offload_params: true
+  state_dict_type: FULL_STATE_DICT
+  auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  transformer_layer_cls_to_wrap: LlamaDecoderLayer
+  reshard_after_forward: true
 ```

 All you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine.
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -17,7 +17,6 @@ feedback. Various methods include, but not limited to:
 - [Kahneman-Tversky Optimization (KTO)](#kto)
 - [Odds Ratio Preference Optimization (ORPO)](#orpo)
 - [Group Relative Policy Optimization (GRPO)](#grpo)
- Proximal Policy Optimization (PPO) (not yet supported in axolotl, if you're interested in contributing, please reach out!)


 ## RLHF using Axolotl
@@ -275,15 +274,14 @@ rl: dpo
 datasets:
  - path: ...
    split: train
-    type: user_defined.default
-
-    field_prompt: "prompt"
-    field_system: "system"
-    field_chosen: "chosen"
-    field_rejected: "rejected"
-    prompt_format: "{prompt}"
-    chosen_format: "{chosen}"
-    rejected_format: "{rejected}"
+    type:
+      field_prompt: "prompt"
+      field_system: "system"
+      field_chosen: "chosen"
+      field_rejected: "rejected"
+      prompt_format: "{prompt}"
+      chosen_format: "{chosen}"
+      rejected_format: "{rejected}"
 ```

 The input format is a simple JSON input with customizable fields based on the above config.
@@ -476,14 +474,13 @@ rl: kto
 datasets:
  - path: ...
    split: train
-    type: user_defined.default
-
-    field_prompt: "prompt"
-    field_system: "system"
-    field_completion: "completion"
-    field_label: "label"
-    prompt_format: "{prompt}"
-    completion_format: "{completion}"
+    type:
+      field_prompt: "prompt"
+      field_system: "system"
+      field_completion: "completion"
+      field_label: "label"
+      prompt_format: "{prompt}"
+      completion_format: "{completion}"
 ```

 The input format is a simple JSON input with customizable fields based on the above config.
--- a/examples/archived/README.md
+++ b/examples/archived/README.md
@@ -0,0 +1,5 @@
+# Archived Examples
+
+This directory contains examples that are no longer maintained and may no longer be functional.
+
+We keep them around for archival purposes in case they are useful to others.
--- a/examples/archived/cerebras/btlm-ft.yml
+++ b/examples/archived/cerebras/btlm-ft.yml
--- a/examples/archived/cerebras/qlora.yml
+++ b/examples/archived/cerebras/qlora.yml
--- a/examples/archived/code-llama/13b/lora.yml
+++ b/examples/archived/code-llama/13b/lora.yml
--- a/examples/archived/code-llama/13b/qlora.yml
+++ b/examples/archived/code-llama/13b/qlora.yml
--- a/examples/archived/code-llama/34b/lora.yml
+++ b/examples/archived/code-llama/34b/lora.yml
--- a/examples/archived/code-llama/34b/qlora.yml
+++ b/examples/archived/code-llama/34b/qlora.yml
--- a/examples/archived/code-llama/7b/lora.yml
+++ b/examples/archived/code-llama/7b/lora.yml
--- a/examples/archived/code-llama/7b/qlora.yml
+++ b/examples/archived/code-llama/7b/qlora.yml
--- a/examples/archived/code-llama/README.md
+++ b/examples/archived/code-llama/README.md
--- a/examples/archived/dbrx/16bit-lora.yaml
+++ b/examples/archived/dbrx/16bit-lora.yaml
--- a/examples/archived/dbrx/8bit-lora.yaml
+++ b/examples/archived/dbrx/8bit-lora.yaml
--- a/examples/archived/dbrx/README.md
+++ b/examples/archived/dbrx/README.md
--- a/examples/archived/dbrx/fft-ds-zero3.yaml
+++ b/examples/archived/dbrx/fft-ds-zero3.yaml
--- a/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml
+++ b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml
--- a/examples/archived/falcon/config-7b-lora.yml
+++ b/examples/archived/falcon/config-7b-lora.yml
--- a/examples/archived/falcon/config-7b-qlora.yml
+++ b/examples/archived/falcon/config-7b-qlora.yml
--- a/examples/archived/falcon/config-7b.yml
+++ b/examples/archived/falcon/config-7b.yml
--- a/examples/archived/gemma/qlora.yml
+++ b/examples/archived/gemma/qlora.yml
--- a/examples/archived/gptj/qlora.yml
+++ b/examples/archived/gptj/qlora.yml
--- a/examples/archived/jeopardy-bot/config.yml
+++ b/examples/archived/jeopardy-bot/config.yml
--- a/examples/archived/mpt-7b/README.md
+++ b/examples/archived/mpt-7b/README.md
--- a/examples/archived/mpt-7b/config.yml
+++ b/examples/archived/mpt-7b/config.yml
--- a/examples/archived/openllama-3b/README.md
+++ b/examples/archived/openllama-3b/README.md
--- a/examples/archived/openllama-3b/config.yml
+++ b/examples/archived/openllama-3b/config.yml
--- a/examples/archived/openllama-3b/lora.yml
+++ b/examples/archived/openllama-3b/lora.yml
--- a/examples/archived/openllama-3b/qlora.yml
+++ b/examples/archived/openllama-3b/qlora.yml
--- a/examples/archived/pythia-12b/README.md
+++ b/examples/archived/pythia-12b/README.md
--- a/examples/archived/pythia-12b/config.yml
+++ b/examples/archived/pythia-12b/config.yml
--- a/examples/archived/pythia/lora.yml
+++ b/examples/archived/pythia/lora.yml
--- a/examples/archived/qwen/README.md
+++ b/examples/archived/qwen/README.md
--- a/examples/archived/qwen/lora.yml
+++ b/examples/archived/qwen/lora.yml
--- a/examples/archived/qwen/qlora.yml
+++ b/examples/archived/qwen/qlora.yml
--- a/examples/archived/qwen/qwen2-moe-lora.yaml
+++ b/examples/archived/qwen/qwen2-moe-lora.yaml
--- a/examples/archived/qwen/qwen2-moe-qlora.yaml
+++ b/examples/archived/qwen/qwen2-moe-qlora.yaml
--- a/examples/archived/redpajama/README.md
+++ b/examples/archived/redpajama/README.md
--- a/examples/archived/redpajama/config-3b.yml
+++ b/examples/archived/redpajama/config-3b.yml
--- a/examples/archived/replit-3b/config-lora.yml
+++ b/examples/archived/replit-3b/config-lora.yml
--- a/examples/archived/stablelm-2/1.6b/fft.yml
+++ b/examples/archived/stablelm-2/1.6b/fft.yml
--- a/examples/archived/stablelm-2/1.6b/lora.yml
+++ b/examples/archived/stablelm-2/1.6b/lora.yml
--- a/examples/archived/stablelm-2/README.md
+++ b/examples/archived/stablelm-2/README.md
--- a/examples/archived/starcoder2/qlora.yml
+++ b/examples/archived/starcoder2/qlora.yml
--- a/examples/archived/tiny-llama/README.md
+++ b/examples/archived/tiny-llama/README.md
--- a/examples/archived/tiny-llama/lora-mps.yml
+++ b/examples/archived/tiny-llama/lora-mps.yml
--- a/examples/archived/tiny-llama/lora.yml
+++ b/examples/archived/tiny-llama/lora.yml
--- a/examples/archived/tiny-llama/pretrain.yml
+++ b/examples/archived/tiny-llama/pretrain.yml
--- a/examples/archived/tiny-llama/qlora.yml
+++ b/examples/archived/tiny-llama/qlora.yml
--- a/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml
+++ b/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml
--- a/examples/archived/yi-34B-chat/README.md
+++ b/examples/archived/yi-34B-chat/README.md
--- a/examples/archived/yi-34B-chat/qlora.yml
+++ b/examples/archived/yi-34B-chat/qlora.yml
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
--- a/examples/devstral/README.md
+++ b/examples/devstral/README.md
@@ -0,0 +1,70 @@
+# Finetune Devstral with Axolotl
+
+Devstral Small is a 24B-parameter open-source model from MistralAI, available on HuggingFace as [Devstral-Small-2505](https://huggingface.co/mistralai/Devstral-Small-2505) and [Devstral-Small-2507](https://huggingface.co/mistralai/Devstral-Small-2507). `Devstral-Small-2507` is the latest version of the model and has [function calling](https://mistralai.github.io/mistral-common/usage/tools/) support.
+
+This guide shows how to fine-tune it with Axolotl on multi-turn conversations with proper masking.
+
+The model was fine-tuned on top of [Mistral-Small-3.1](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503) without the vision layer and has a context length of up to 128k tokens.
+
+Thanks to the team at MistralAI for giving us early access to prepare for this release.
+
+## Getting started
+
+1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). Devstral support is currently only available on nightly, so you need to either install from main or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
+
+    Here is an example of how to install from main for pip:
+
+```bash
+# Ensure you have Pytorch installed (Pytorch 2.6.0+)
+git clone https://github.com/axolotl-ai-cloud/axolotl.git
+cd axolotl
+
+pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
+pip3 install --no-build-isolation -e '.[flash-attn]'
+```
+
+2. Run the finetuning example:
+
+```bash
+axolotl train examples/devstral/devstral-small-qlora.yml
+```
+
+This config uses about 21GB VRAM.
+
+Let us know how it goes. Happy finetuning! 🚀
+
+### TIPS
+
+- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
+- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
+- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
+- Learn how to use function calling with Axolotl at [docs](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#using-tool-use).
+
+## Optimization Guides
+
+- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
+- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
+- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
+- [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy)
+- [Liger Kernel](https://docs.axolotl.ai/docs/custom_integrations.html#liger-kernels)
+
+## Limitations
+
+We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only.
+
+In addition, we do not support overriding tokens yet.
+
+## Related Resources
+
+- [MistralAI Devstral Blog](https://mistral.ai/news/devstral)
+- [MistralAI Devstral 1.1 Blog](https://mistral.ai/news/devstral-2507)
+- [Axolotl Docs](https://docs.axolotl.ai)
+- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
+- [Axolotl Website](https://axolotl.ai)
+- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
+
+
+## Future Work
+
+- Add parity to Preference Tuning, RL, Multi-modal, etc.
+- Add parity to other tokenizer configs like overriding tokens.
--- a/examples/devstral/devstral-small-qlora.yml
+++ b/examples/devstral/devstral-small-qlora.yml
@@ -0,0 +1,64 @@
+base_model: mistralai/Devstral-Small-2507
+
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+# Enable to use mistral-common tokenizer
+tokenizer_use_mistral_common: true
+
+# Load the base model in 4-bit; pairs with `adapter: qlora` below
+load_in_8bit: false
+load_in_4bit: true
+
+# Enable the Cut Cross Entropy integration
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+
+# Training data, processed with the `chat_template` dataset type
+datasets:
+  - path: fozziethebeat/alpaca_messages_2k_test
+    type: chat_template
+
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.1
+output_dir: ./outputs/qlora-out
+
+# Adapter type; `lora_model_dir` is left unset
+adapter: qlora
+lora_model_dir:
+
+# Sequence length and sample packing
+sequence_len: 2048
+sample_packing: true
+pad_to_sequence_len: true
+
+# LoRA hyperparameters
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0
+lora_target_linear: true
+
+# Weights & Biases settings (all left unset in this example)
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+# Batch size, epochs, optimizer and learning-rate schedule
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 1
+optimizer: adamw_torch
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+# Numeric precision
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+# Loss watchdog threshold and patience
+loss_watchdog_threshold: 5.0
+loss_watchdog_patience: 3
+
+# Warmup, evaluation and checkpoint cadence
+warmup_ratio: 0.05
+evals_per_epoch: 4
+saves_per_epoch: 1
+
+weight_decay: 0.0
+special_tokens:
--- a/examples/lfm2/README.md
+++ b/examples/lfm2/README.md
@@ -0,0 +1,7 @@
+# Liquid Foundation Models 2
+
+LFM2 support in transformers exists in the main branch, but is not yet included in the transformers release.
+
+```bash
+pip install --upgrade --no-deps --force-reinstall git+https://github.com/huggingface/transformers.git
+```
--- a/examples/lfm2/lfm2-350m-fft.yaml
+++ b/examples/lfm2/lfm2-350m-fft.yaml
@@ -0,0 +1,48 @@
+base_model: LiquidAI/LFM2-350M
+
+chunked_cross_entropy: true
+
+# Use the chat template shipped with the tokenizer
+chat_template: tokenizer_default
+eot_tokens:
+  - "<|im_end|>"
+# Training data; the field overrides map this dataset's column names
+datasets:
+  - path: mlabonne/FineTome-100k
+    type: chat_template
+    split: train[:20%]
+    field_messages: conversations
+    message_field_role: from
+    message_field_content: value
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.05
+output_dir: ./outputs/out
+
+# Sequence length and sample packing
+sequence_len: 4096
+sample_packing: true
+pad_to_sequence_len: true
+
+# Weights & Biases settings (all left unset in this example)
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+# Batch size, epochs, optimizer and learning-rate schedule
+gradient_accumulation_steps: 2
+micro_batch_size: 4
+num_epochs: 1
+optimizer: adamw_torch_fused
+lr_scheduler: cosine
+# Written in decimal form: YAML 1.1 loaders (e.g. PyYAML) resolve `5e-5`
+# (no decimal point) as a string rather than a float.
+learning_rate: 0.00005
+
+# Numeric precision
+bf16: true
+tf32: true
+
+gradient_checkpointing: false
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+# Warmup, evaluation and checkpoint cadence
+warmup_ratio: 0.1
+evals_per_epoch: 2
+saves_per_epoch: 1
+
+weight_decay: 0.0
--- a/examples/magistral/README.md
+++ b/examples/magistral/README.md
@@ -18,16 +18,10 @@ git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

 pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn,mistral]'
+pip3 install --no-build-isolation -e '.[flash-attn]'
 ```

-2. Download the example config:
-
-```bash
-axolotl fetch examples
-```
-
-3. Run the finetuning example:
+2. Run the finetuning example:

 ```bash
 axolotl train examples/magistral/magistral-small-qlora.yaml
@@ -42,7 +36,7 @@ Let us know how it goes. Happy finetuning! 🚀
 - For inference, the official MistralAI team recommends `top_p: 0.95` and `temperature: 0.7` with `max_tokens: 40960`.
 - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
 - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format is the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
+- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).

 ## Optimization Guides

@@ -54,7 +48,7 @@ Let us know how it goes. Happy finetuning! 🚀

 We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only.

-The tokenizer does not work with `dataset.map` with multiprocessing, so we had to disable it. In addition, we do not support overriding tokens yet.
+In addition, we do not support overriding tokens yet.

 ## Related Resources

--- a/requirements.txt
+++ b/requirements.txt
@@ -1,24 +1,24 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/

 # START section of dependencies that don't install on Darwin/MacOS
-bitsandbytes==0.45.4
+bitsandbytes==0.46.0
 triton>=3.0.0
 mamba-ssm==1.2.0.post1
 xformers>=0.0.23.post1
 autoawq==0.2.7.post3
-liger-kernel==0.5.10
+liger-kernel==0.6.0
 # END section

 packaging==23.2

-huggingface_hub==0.32.2
-peft==0.15.2
-transformers==4.52.4
+huggingface_hub>=0.33.0
+peft==0.16.0
+transformers==4.53.2
 tokenizers>=0.21.1
-accelerate==1.7.0
-datasets==3.6.0
+accelerate==1.8.1
+datasets==4.0.0
 deepspeed>=0.17.0
-trl==0.18.2
+trl==0.19.1
 hf_xet==1.1.2

 optimum==1.16.2
@@ -68,4 +68,4 @@ schedulefree==1.4.1
 axolotl-contribs-lgpl==0.0.6
 axolotl-contribs-mit==0.0.3

-mistral-common==1.6.0
+mistral-common==1.7.0
--- a/scripts/cutcrossentropy_install.py
+++ b/scripts/cutcrossentropy_install.py
@@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else ""

 print(
    UNINSTALL_PREFIX
-    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@78b2a45713a54c9bedf8b33f5e31cf07a1a57154"'
+    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@865b899"'
 )
--- a/setup.py
+++ b/setup.py
@@ -66,13 +66,16 @@ def parse_requirements(extras_require_map):

            if (major, minor) >= (2, 7):
                _install_requires.pop(_install_requires.index(xformers_version))
-                # _install_requires.append("xformers==0.0.29.post3")  # xformers seems to be hard pinned to 2.6.0
-                extras_require_map["vllm"] = ["vllm==0.8.5.post1"]
+                if patch == 0:
+                    _install_requires.append("xformers==0.0.30")
+                else:
+                    _install_requires.append("xformers==0.0.31.post1")
+                extras_require_map["vllm"] = ["vllm>=0.9.0"]
            elif (major, minor) >= (2, 6):
                _install_requires.pop(_install_requires.index(xformers_version))
-                _install_requires.append(
-                    "xformers==0.0.29.post2"
-                )  # vllm needs post2 w torch 2.6
+                _install_requires.append("xformers==0.0.29.post3")
+                # since we only support 2.6.0+cu126
+                _dependency_links.append("https://download.pytorch.org/whl/cu126")
                extras_require_map["vllm"] = ["vllm==0.8.5.post1"]
            elif (major, minor) >= (2, 5):
                _install_requires.pop(_install_requires.index(xformers_version))
@@ -111,14 +114,14 @@ def get_package_version():


 extras_require = {
-    "flash-attn": ["flash-attn==2.7.4.post1"],
+    "flash-attn": ["flash-attn==2.8.0.post2"],
    "ring-flash-attn": [
-        "flash-attn==2.7.4.post1",
-        "ring-flash-attn>=0.1.4",
+        "flash-attn==2.8.0.post2",
+        "ring-flash-attn>=0.1.5",
        "yunchang==0.6.0",
    ],
    "deepspeed": [
-        "deepspeed==0.17.1",
+        "deepspeed==0.17.2",
        "deepspeed-kernels",
    ],
    "mamba-ssm": [
--- a/src/axolotl/init.py
+++ b/src/axolotl/init.py
@@ -4,4 +4,4 @@ import pkgutil

 __path__ = pkgutil.extend_path(__path__, __name__)  # Make this a namespace package

-__version__ = "0.11.0.dev"
+__version__ = "0.12.0.dev"
--- a/src/axolotl/cli/preprocess.py
+++ b/src/axolotl/cli/preprocess.py
@@ -1,5 +1,6 @@
 """CLI to run preprocessing of a dataset."""

+import os
 import warnings
 from pathlib import Path
 from typing import Union
@@ -35,6 +36,12 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
    check_accelerate_default_config()
    check_user_token()

+    # Reject configs for which a separate preprocess step is not needed.
+    for key in ["skip_prepare_dataset", "pretraining_dataset"]:
+        if cfg.get(key):
+            raise ValueError(
+                f"You have set `{key}:`. `preprocess` is not needed. Run the `axolotl train` CLI directly instead."
+            )
+
    if not cfg.dataset_prepared_path:
        msg = (
            Fore.RED
@@ -89,6 +96,7 @@ def do_cli(
        kwargs: Additional keyword arguments to override config file values.
    """
    # pylint: disable=duplicate-code
+    os.environ["AXOLOTL_IS_PREPROCESS"] = "1"
    parsed_cfg = load_cfg(config, **kwargs)
    parsed_cfg.is_preprocess = True
    parser = transformers.HfArgumentParser(PreprocessCliArgs)
--- a/src/axolotl/cli/train.py
+++ b/src/axolotl/cli/train.py
@@ -109,6 +109,13 @@ def ray_train_func(kwargs: dict):
    # initialize accelerator before model instantiation
    Accelerator(gradient_accumulation_steps=cfg.gradient_accumulation_steps)

+    # Register plugins in Ray workers
+    if cfg.get("plugins"):
+        from axolotl.cli.config import plugin_set_cfg, prepare_plugins
+
+        prepare_plugins(cfg)
+        plugin_set_cfg(cfg)
+
    kwargs["cfg"] = cfg

    do_train(**kwargs)
--- a/src/axolotl/cli/vllm_serve.py
+++ b/src/axolotl/cli/vllm_serve.py
@@ -37,7 +37,6 @@ def do_vllm_serve(
    Returns:
        process_id: the process id of the started VLLM server
    """
-    patch_vllm_worker()
    cfg = load_cfg(config)
    model = cfg.base_model

@@ -47,6 +46,9 @@ def do_vllm_serve(
    tensor_parallel_size = (
        cli_args.get("tensor_parallel_size") or cfg.vllm.tensor_parallel_size
    )
+    data_parallel_size = (
+        cli_args.get("data_parallel_size") or cfg.vllm.data_parallel_size
+    )
    host = cli_args.get("host") or cfg.vllm.host
    port = cli_args.get("port") or cfg.vllm.port
    gpu_memory_utilization = (
@@ -68,6 +70,7 @@ def do_vllm_serve(
    vllm_script_args = AxolotlScriptArguments(
        model=model,
        tensor_parallel_size=tensor_parallel_size,
+        data_parallel_size=data_parallel_size,
        host=host,
        port=port,
        gpu_memory_utilization=gpu_memory_utilization,
--- a/src/axolotl/common/datasets.py
+++ b/src/axolotl/common/datasets.py
@@ -75,13 +75,17 @@ def load_datasets(

        num_examples = cli_args.debug_num_examples if cli_args else 1
        text_only = cli_args.debug_text_only if cli_args else False
-        train_samples = sample_dataset(train_dataset, num_examples)
-        check_dataset_labels(
-            train_samples,
-            tokenizer,
-            num_examples=num_examples,
-            text_only=text_only,
-        )
+        try:
+            train_samples = sample_dataset(train_dataset, num_examples)
+            check_dataset_labels(
+                train_samples,
+                tokenizer,
+                num_examples=num_examples,
+                text_only=text_only,
+            )
+        except AttributeError:
+            # can't sample iterable datasets
+            pass

        LOG.info("printing prompters...")
        for prompter in prompters:
--- a/src/axolotl/core/attention/init.py
+++ b/src/axolotl/core/attention/init.py
--- a/src/axolotl/core/attention/flex_block_mask.py
+++ b/src/axolotl/core/attention/flex_block_mask.py
@@ -0,0 +1,162 @@
+"""
+monkeypatch for flex + packing
+"""
+
+import sys
+from typing import Callable, Optional, Union
+
+import torch
+from torch.nn.attention.flex_attention import BlockMask
+from transformers import Cache, PretrainedConfig
+from transformers.masking_utils import (
+    ALL_MASK_ATTENTION_FUNCTIONS,
+    _preprocess_mask_arguments,
+    and_masks,
+    causal_mask_function,
+    or_masks,
+)
+from transformers.utils import is_torch_greater_or_equal
+
+_is_torch_greater_or_equal_than_2_6 = is_torch_greater_or_equal("2.6", accept_dev=True)
+
+
+def create_causal_mask(
+    config: PretrainedConfig,
+    input_embeds: torch.Tensor,
+    attention_mask: torch.Tensor,
+    cache_position: torch.Tensor,
+    past_key_values: Optional[Cache],
+    or_mask_function: Optional[Callable] = None,
+    and_mask_function: Optional[Callable] = None,
+) -> Optional[Union[torch.Tensor, BlockMask]]:
+    """
+    Create a standard causal mask based on the attention implementation used (stored in the config). If `past_key_values`
+    has an HybridCache structure, this function will return the mask corresponding to one of the "full_attention" layers (to align
+    to what is needed in the `modeling_xxx.py` files).
+
+    When the preprocessed `attention_mask` is not None, the causal rule is combined with a document mask: a query
+    position may only attend to key positions that carry the same value in the incoming `attention_mask`.
+
+    Args:
+        config (`PretrainedConfig`):
+            The model config.
+        input_embeds (`torch.Tensor`):
+            The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the
+            batch size, query length and dtype.
+        attention_mask (`torch.Tensor`, optional):
+            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length).
+            It can also be an already prepared 4D mask, in which case it is returned as-is.
+        cache_position (`torch.Tensor`):
+            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
+        past_key_values (`Cache`, optional):
+            The past key values, if we use a cache.
+        or_mask_function (`Callable`, optional):
+            An optional mask function to combine with the causal mask function (by doing the union of both). This is
+            useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
+        and_mask_function (`Callable`, optional):
+            An optional mask function to combine with the causal mask function (by doing the intersection of both). This is
+            useful to easily overlay another mask on top of the causal one, for example for image tokens handling.
+
+    Returns:
+        The `attention_mask` returned by `_preprocess_mask_arguments` when it signals an early exit, otherwise the
+        mask built by the mask interface registered for `config._attn_implementation`.
+
+    Raises:
+        ValueError: If `or_mask_function` or `and_mask_function` is given and torch is older than 2.6.
+    """
+    # If we have an HybridCache structure, here we want to create the mask for the full layers
+    if (
+        past_key_values
+        and hasattr(past_key_values, "is_sliding")
+        and False in past_key_values.is_sliding
+    ):
+        layer_idx = past_key_values.is_sliding.index(False)
+    else:
+        layer_idx = 0
+
+    # Keep a copy of the incoming mask before preprocessing; its values are
+    # compared per position below to build the document mask.
+    original_attention_mask = (
+        None
+        if attention_mask is None
+        else attention_mask.clone().to(cache_position.device)
+    )
+    early_exit, attention_mask, kv_length, kv_offset = _preprocess_mask_arguments(
+        config, input_embeds, attention_mask, cache_position, past_key_values, layer_idx
+    )
+    if early_exit:
+        return attention_mask
+
+    # NOTE(review): the docstring describes `cache_position` as 1-D (query_length,),
+    # but this unpack requires exactly two dimensions -- confirm the shape callers pass.
+    batch_size, total_seq_len = cache_position.shape
+    key_length = total_seq_len
+    # Right-pad the copied mask with `key_length` zeros along the last dimension.
+    # NOTE(review): `original_attention_mask` is None when no mask was passed in,
+    # and it is handed to `pad` unconditionally -- confirm a mask is always supplied.
+    document_ids = torch.nn.functional.pad(
+        original_attention_mask, value=0, pad=(0, key_length)
+    )
+
+    # `batch_size` is re-derived from `input_embeds`, replacing the value unpacked above.
+    batch_size, dtype = input_embeds.shape[0], input_embeds.dtype
+    if attention_mask is not None:
+
+        def causal_doc_mask_mod(
+            batch_idx, head_idx, q_idx, kv_idx
+        ):  # pylint: disable=unused-argument
+            """
+            Defines the logic of a block causal mask by combining both a standard causal mask
+            and a block diagonal document mask.
+            See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
+            for an illustration.
+            """
+            causal_mask_ = q_idx >= kv_idx  # not valid when decoding
+            document_mask = (
+                document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
+            )
+            final_mask = causal_mask_ & document_mask
+            return final_mask
+
+        mask_factory_function = causal_doc_mask_mod
+    else:
+        mask_factory_function = causal_mask_function
+    # Look up the mask builder registered for the configured attention implementation.
+    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[
+        config._attn_implementation  # pylint: disable=protected-access
+    ]
+
+    # Do not allow skip if we are compiling (this is to match BC)
+    allow_is_causal_skip = (
+        not past_key_values.is_compileable if past_key_values is not None else True
+    )
+
+    # Allow slight deviations from causal mask
+    if or_mask_function is not None:
+        if not _is_torch_greater_or_equal_than_2_6:
+            raise ValueError(
+                "Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6"
+            )
+        mask_factory_function = or_masks(mask_factory_function, or_mask_function)
+        allow_is_causal_skip = False
+    if and_mask_function is not None:
+        if not _is_torch_greater_or_equal_than_2_6:
+            raise ValueError(
+                "Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6"
+            )
+        mask_factory_function = and_masks(mask_factory_function, and_mask_function)
+        allow_is_causal_skip = False
+
+    # We now create the mask
+    causal_mask = mask_interface(
+        batch_size=batch_size,
+        cache_position=cache_position,
+        kv_length=kv_length,
+        kv_offset=kv_offset,
+        mask_function=mask_factory_function,
+        attention_mask=attention_mask,
+        allow_is_causal_skip=allow_is_causal_skip,  # additional kwarg for sdpa
+        dtype=dtype,  # Additional kwarg for eager
+        config=config,  # Pass the config as well, in case someone wants to easily have their own mask_interface
+    )
+    return causal_mask
+
+
+def patch_create_causal_mask(model_type):
+    """
+    Install this module's `create_causal_mask` in place of the transformers one.
+
+    `transformers.masking_utils.create_causal_mask` is always replaced. When
+    `model_type` is truthy, the attribute is also replaced on
+    `transformers.models.<model_type>.modeling_<model_type>`.
+
+    Args:
+        model_type: Model type string used to build the modeling module path.
+
+    Raises:
+        ValueError: If the modeling module cannot be imported or patched.
+    """
+    import transformers.masking_utils
+
+    transformers.masking_utils.create_causal_mask = create_causal_mask
+
+    if model_type:
+        try:
+            module_path = f"transformers.models.{model_type}.modeling_{model_type}"
+            # A non-empty `fromlist` makes `__import__` return the leaf modeling
+            # module. Without it the top-level `transformers` package is returned
+            # and the assignment below would patch the wrong object.
+            module = __import__(module_path, fromlist=["create_causal_mask"])
+            module.create_causal_mask = create_causal_mask
+            # Drop the cached entry so the next import re-executes the module;
+            # presumably so it re-binds names from the patched `masking_utils`.
+            del sys.modules[module_path]
+        except (ImportError, AttributeError) as e:
+            raise ValueError(
+                f"Could not import attention class for model_type: {model_type}. "
+                f"Error: {str(e)}"
+            ) from e
--- a/src/axolotl/core/builders/base.py
+++ b/src/axolotl/core/builders/base.py
@@ -112,13 +112,6 @@ class TrainerBuilderBase(abc.ABC):
            plugin_manager.add_callbacks_pre_trainer(cfg=self.cfg, model=self.model)
        )

-        if self.cfg.profiler_steps:
-            callbacks.append(
-                PytorchProfilerCallback(
-                    steps_to_profile=self.cfg.profiler_steps,
-                )
-            )
-
        if self.cfg.gc_steps:
            callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))

@@ -145,6 +138,14 @@ class TrainerBuilderBase(abc.ABC):

        callbacks.append(GPUStatsCallback(cfg=self.cfg))

+        if self.cfg.profiler_steps:
+            callbacks.append(
+                PytorchProfilerCallback(
+                    steps_to_profile=self.cfg.profiler_steps,
+                    profiler_steps_start=self.cfg.profiler_steps_start,
+                )
+            )
+
        return callbacks

    def get_post_trainer_create_callbacks(self, trainer):
@@ -219,7 +220,9 @@ class TrainerBuilderBase(abc.ABC):
        if self.cfg.bf16 == "full":
            training_args_kwargs["bf16_full_eval"] = True
        else:
-            training_args_kwargs["bf16"] = self.cfg.bf16 or self.cfg.bfloat16
+            bf16 = self.cfg.bf16 or self.cfg.bfloat16
+            bf16 = bf16 if bf16 is not None else False
+            training_args_kwargs["bf16"] = bf16

    def _configure_scheduler(self, training_args_kwargs: dict):
        if self.cfg.lr_scheduler in ["one_cycle", "rex"]:
@@ -416,6 +419,9 @@ class TrainerBuilderBase(abc.ABC):
            torch._dynamo.config.suppress_errors = (  # pylint: disable=protected-access
                True
            )
+            torch._dynamo.config.accumulated_cache_size_limit = (  # pylint: disable=protected-access
+                256
+            )
            training_args_kwargs["torch_compile"] = self.cfg.torch_compile
            if self.cfg.torch_compile_backend:
                training_args_kwargs["torch_compile_backend"] = (
@@ -424,8 +430,16 @@ class TrainerBuilderBase(abc.ABC):
            if self.cfg.torch_compile_mode:
                training_args_kwargs["torch_compile_mode"] = self.cfg.torch_compile_mode

+    def _configure_accelerator_config(self, training_args_kwargs: dict):
+        """Copy `cfg.accelerator_config` into the training args when it is set."""
+        accelerator_config = self.cfg.accelerator_config
+        if not accelerator_config:
+            return
+        training_args_kwargs["accelerator_config"] = accelerator_config
+
    def _configure_gradient_checkpointing(self, training_args_kwargs: dict):
-        if self.cfg.gradient_checkpointing:
+        if self.cfg.activation_offloading is True:
+            # don't use the HF gradient checkpointing, manually wrap
+            training_args_kwargs["gradient_checkpointing"] = False
+            training_args_kwargs["activation_offloading"] = True
+        elif self.cfg.gradient_checkpointing:
            training_args_kwargs["gradient_checkpointing"] = (
                self.cfg.gradient_checkpointing
            )
@@ -499,10 +513,15 @@ class TrainerBuilderBase(abc.ABC):
        if self.cfg.reward_model or self.cfg.rl:
            training_args_kwargs["max_length"] = self.cfg.sequence_len

+        if self.cfg.fsdp_config or self.cfg.fsdp:
+            training_args_kwargs["fsdp_config"] = self.cfg.fsdp_config
+            training_args_kwargs["fsdp"] = self.cfg.fsdp if self.cfg.fsdp else True
+
        self._configure_reporting(training_args_kwargs)
        self._configure_hub_parameters(training_args_kwargs)
        self._configure_scheduler(training_args_kwargs)
        self._configure_optimizer(training_args_kwargs, trainer_kwargs)
        self._configure_torch_compile(training_args_kwargs)
+        self._configure_accelerator_config(training_args_kwargs)

        return training_args_kwargs, trainer_kwargs
--- a/src/axolotl/core/builders/causal.py
+++ b/src/axolotl/core/builders/causal.py
@@ -151,14 +151,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        training_arguments_kwargs, trainer_kwargs = self._set_base_training_args(
            total_num_steps
        )
-
-        if self.cfg.fsdp:
-            training_arguments_kwargs["fsdp"] = self.cfg.fsdp
-            if self.cfg.fsdp_config:
-                training_arguments_kwargs["fsdp_config"] = {
-                    k.lstrip("fsdp_"): v for k, v in dict(self.cfg.fsdp_config).items()
-                }
-
        if self.cfg.adapter == "qlora":
            training_arguments_kwargs["qlora"] = True

@@ -245,14 +237,27 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        training_arguments_kwargs["curriculum_sampling"] = self.cfg.curriculum_sampling

        training_arguments_kwargs["sample_packing"] = bool(self.cfg.sample_packing)
+        training_arguments_kwargs["sample_packing_drop_attention_mask"] = bool(
+            self.cfg.flash_attention
+            or self.cfg.xformers_attention
+            or self.cfg.flex_attention
+        )
        training_arguments_kwargs["multipack_real_batches"] = (
            self.cfg.multipack_real_batches
            if self.cfg.multipack_real_batches is not None
-            else not self.cfg.flash_attention
+            else not (
+                self.cfg.flash_attention
+                or self.cfg.flex_attention
+                or self.cfg.xformers_attention
+            )
        )
        training_arguments_kwargs["eval_sample_packing"] = bool(
            self.cfg.eval_sample_packing
        )
+        if self.cfg.sample_packing_sequentially is not None:
+            training_arguments_kwargs["sample_packing_sequentially"] = (
+                self.cfg.sample_packing_sequentially
+            )
        if self.cfg.sample_packing_bin_size is not None:
            training_arguments_kwargs["sample_packing_bin_size"] = (
                self.cfg.sample_packing_bin_size
@@ -305,11 +310,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                self.cfg.neftune_noise_alpha
            )

-        if self.cfg.accelerator_config:
-            training_arguments_kwargs["accelerator_config"] = (
-                self.cfg.accelerator_config
-            )
-
        if self.cfg.image_size:
            training_arguments_kwargs["image_size"] = self.cfg.image_size
        if self.cfg.image_resize_algorithm:
@@ -413,7 +413,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                or self.cfg.micro_batch_size > 1
            ):
                return DataCollatorForSeq2Seq(self.tokenizer, **kwargs)
-            return None
+            if not (self.cfg.sample_packing and self.cfg.pretrain_multipack_attn):
+                return None

        if self.cfg.model_config_type == "mamba":
            return MambaDataCollator(tokenizer=self.tokenizer)
--- a/src/axolotl/core/builders/rl.py
+++ b/src/axolotl/core/builders/rl.py
@@ -208,7 +208,7 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
            callbacks=self.get_callbacks(),
            **trainer_kwargs,
        )
-        if self.cfg.fsdp:
+        if self.cfg.fsdp_config or self.cfg.fsdp:
            ensure_dtype(trainer.model, dtype=self.cfg.torch_dtype)
            if self.cfg.rl in [RLType.DPO, RLType.IPO] and trainer.ref_model:
                ensure_dtype(trainer.ref_model, dtype=self.cfg.torch_dtype)
@@ -218,21 +218,3 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
            trainer.add_callback(callback)

        return trainer
-
-
-class HFPPOTrainerBuilder(TrainerBuilderBase):
-    """
-    HF Factory class for PPO Trainer
-    """
-
-    def get_callbacks(self):
-        callbacks = super().get_callbacks()
-        return callbacks
-
-    def get_post_trainer_create_callbacks(self, trainer):
-        callbacks = super().get_post_trainer_create_callbacks(trainer=trainer)
-        return callbacks
-
-    def build(self, total_num_steps):
-        # TODO: build PPOConfig
-        raise NotImplementedError("PPO trainer builder is not implemented yet.")
--- a/src/axolotl/core/trainers/init.py
+++ b/src/axolotl/core/trainers/init.py
@@ -14,5 +14,4 @@ from .trl import (
    AxolotlORPOTrainer,
    AxolotlPRMTrainer,
    AxolotlRewardTrainer,
-    TRLPPOTrainer,
 )
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -25,8 +25,10 @@ from trl.trainer.utils import pad_to_length
 from typing_extensions import override

 from axolotl.core.trainers.mixins import (
+    ActivationOffloadingMixin,
    CheckpointSaveMixin,
    OptimizerMixin,
+    PackingMixin,
    RngLoaderMixin,
    SchedulerMixin,
 )
@@ -42,7 +44,13 @@ LOG = get_logger(__name__)


 class AxolotlTrainer(
-    SchedulerMixin, OptimizerMixin, RngLoaderMixin, CheckpointSaveMixin, Trainer
+    PackingMixin,
+    SchedulerMixin,
+    OptimizerMixin,
+    RngLoaderMixin,
+    CheckpointSaveMixin,
+    ActivationOffloadingMixin,
+    Trainer,
 ):
    """Extend the base Trainer for axolotl helpers"""

@@ -69,18 +77,6 @@ class AxolotlTrainer(
        if self.args.orpo_alpha:
            self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none")

-    def _wrap_model(self, model, training=True, dataloader=None):
-        if self.args.torch_compile:
-            torch._dynamo.config.accumulated_cache_size_limit = (  # pylint: disable=protected-access
-                256
-            )
-            model = torch.compile(
-                model,
-                backend=self.args.torch_compile_backend,
-                mode=self.args.torch_compile_mode,
-            )
-        return super()._wrap_model(model, training=training, dataloader=dataloader)
-
    def _create_multipack_sampler(
        self, base_sampler: Sampler, dataset: Dataset
    ) -> MultipackBatchSampler:
@@ -116,6 +112,7 @@ class AxolotlTrainer(
            sequential=self.args.sample_packing_sequentially,
            drop_last=True,
            num_processes=self.args.dataset_num_proc,
+            mp_start_method=self.args.sample_packing_mp_start_method or "fork",
        )

        len(sampler)
@@ -205,6 +202,14 @@ class AxolotlTrainer(

        if dataset.column_names and "length" in dataset.column_names:
            dataset = dataset.remove_columns(["length"])
+        if (
+            dataset.column_names
+            and "position_ids" in dataset.column_names
+            and "attention_mask" in dataset.column_names
+            and self.args.sample_packing
+            and self.args.sample_packing_drop_attention_mask
+        ):
+            dataset = dataset.remove_columns(["attention_mask"])

        if isinstance(dataset, datasets.Dataset):
            if is_training:
--- a/src/axolotl/core/trainers/dpo/init.py
+++ b/src/axolotl/core/trainers/dpo/init.py
@@ -28,7 +28,7 @@ class DPOStrategy:
        training_args_kwargs["max_completion_length"] = None
        training_args_kwargs["max_length"] = cfg.sequence_len
        training_args_kwargs["max_prompt_length"] = cfg.sequence_len
-        training_args_kwargs["generate_during_eval"] = cfg.use_wandb
+        training_args_kwargs["generate_during_eval"] = cfg.dpo_generate_during_eval
        if cfg.dpo_use_weighting is not None:
            training_args_kwargs["use_weighting"] = cfg.dpo_use_weighting
        if cfg.dpo_padding_free is not None:
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Dan Saunders	6f6d917a99	Revert "checkpoint model on first step callback (#2906 )" This reverts commit `10ba1622f7`.	2025-07-15 15:01:12 -04:00
Dan Saunders	10ba1622f7	checkpoint model on first step callback (#2906 ) * checkpoint model on first step callback * remove debug * add test cases; update existing tests not to save on first step * move test out of solo * delete * default to False * typo	2025-07-15 15:00:48 -04:00
Wing Lian	d320ef6199	fix for upstream refactor of KwargsForCausalLM (#2911 )	2025-07-15 11:28:41 -04:00
NanoCode012	354eaaf0d3	feat: add call method to mistral tokenizer wrapper (#2898 )	2025-07-14 22:33:35 -04:00
greenhestu	a061446540	Fix: Prevents merging of tool arguments during preprocessing (#2909 )	2025-07-14 22:33:10 -04:00
Wing Lian	cd079b5536	Tensor parallel w DeepSpeed AutoTP (#2574 ) * support for deepspeed autotup * bump to latest deepspeed that supports deepcompile too * add deepcompile support too * fix total steps calculation for TP * setup fixture for tp * update ds config to ensure weights are gathered for checkpoint * fix duplicate validation names * chore: lint	2025-07-14 21:33:48 -04:00
Wing Lian	5cc16040a8	move the plugin post trainer create to the setup trainer (#2907 ) * move the plugin post trainer create to the setup trainer * move post-train plugins to execute-training fn	2025-07-14 20:11:33 -04:00
Wing Lian	38359a8997	allow profiling in mid-training rather from the start (#2899 ) [skip ci] * allow profiling in mid-training rather from the start * simplify based on PR feedback * fix logic, improve saving at end, add tests	2025-07-14 20:11:11 -04:00
Wing Lian	7dc3ac6cb3	update nightlies builds (#2921 ) [skip ci]	2025-07-14 20:10:43 -04:00
Wing Lian	99187cd208	Activation Offloading w CUDA Streams (#2900 ) [skip ci] * use cuda streams for activation offloading * use torch native ops * update cfg schema for streams * fix literal constructor for set * use context for training step so it doesn't affect evals * disable streams * auto gc on eval steps * use activation_offloading config arg * add docs for gradient checkpointing * handle validation for gc/ao * use cuda streams for act offloading * add more validation for AC w/o GC * fix docs * move activation_offloading lower in definition so it doesn't break args/kwargs * fix kd due to import order	2025-07-14 20:10:20 -04:00
Wing Lian	aa684122f1	upgrade peft==0.16.0 and datasets==4.0.0 (#2917 ) [skip ci] * upgrade peft to 0.16.0 * upgrade datasets to 4.0.0 * refactor dupes from merge/rebase * fix check for fsdp1 + sharded_state_dict * use full state dict for ci	2025-07-14 20:09:26 -04:00
Wing Lian	ca4d4ef793	don't init distributed for deepspeed if preprocessing (#2920 ) * don't init distributed for deepspeed if preprocessing * add e2e test to validate preprocess cli with deepspeed * ignore duplicate code for cfg	2025-07-14 14:19:19 -04:00
Dan Saunders	37edbe4999	Remove extra torch.compile call (#2904 ) * debug * debug * debug * moving validation code to transformers * revert unneeded change * add accelerator config to base trainer builder * add back accumulated_cache_size_limit setting * lint	2025-07-14 12:32:45 -04:00
Wing Lian	e581c15d40	refactor dupes from merge/rebase (#2919 ) [skip ci]	2025-07-14 10:05:26 -04:00
Wing Lian	af92151a7b	FSDP2 fix validation and add tests (#2910 ) * fix validation and add tests * remove debugging and add more tests * remove migrate_fsdp	2025-07-14 09:25:44 -04:00
Wing Lian	80dc4c261a	fix xformers version for python 2.6 (#2916 ) [skip ci]	2025-07-14 09:24:29 -04:00
Wing Lian	7ccbbd8e77	upgrade liger to 0.6.0 (#2893 ) [skip ci]	2025-07-14 09:24:07 -04:00
Wing Lian	5081db7f8a	upgrade trl==0.19.1 (#2892 ) [skip ci] * upgrade trl==0.19.1 * add vllm for tests for grpo * fixes to work with latest trl * need data_parallel_size config too * support for vllm_mode for server / colocate * vllm settings for colocate * relax vllm version * bump min hf hub for latest vllm support * add hints on string literal for vllm mode * use latest transformers 4.53.2 * tweak acceptable loss on flaky test_ds_zero3_packed test * don't run flaky vllm/grpo tests for now	2025-07-14 09:23:42 -04:00
Wing Lian	41664c7c4c	fix ddp for incorrect steps (#2915 ) * fix ddp for incorrect steps * add test	2025-07-14 07:51:16 -04:00
Wing Lian	9a8073e73d	Liquid Foundation Model 2 support (#2905 ) * LFM2 support * docs * packing seems to work * update install to force install in case already on dev version * default to use chunked cross entropy	2025-07-12 11:41:34 -04:00
Jiawei Liu	7fb8441e0e	fix: customized dataset with simpo (#2894 ) [skip ci]	2025-07-12 11:40:30 -04:00
NanoCode012	4dc5910e1c	feat(doc): re-add docker 2.7.0 tag back (#2902 ) [skip ci]	2025-07-12 11:40:01 -04:00
Wing Lian	fb7bc9250d	move unmaintained examples to archive (#2903 ) [skip ci]	2025-07-12 11:39:51 -04:00
salman	d6e4a611e5	FSDP1 -> FSDP2 (#2760 ) * FSDP2 args migration implementation This commit implements the migration to FSDP2 arguments including: - FSDP2 support with LoRA training - DPO integration with FSDP2 - Model loading fixes and refactoring - CPU offloading and PEFT handling - Test updates and CI improvements - Bug fixes for dtype errors and various edge cases	2025-07-12 15:18:01 +01:00
Ed Sealing	eb662557a7	Register Plugins in Ray Workers (#2901 ) [skip ci] * Access plugins in ray cluster * Add comment * chore: lint --------- Co-authored-by: Ed Sealing <ed.sealing@patapsco.ai> Co-authored-by: Wing Lian <wing@axolotl.ai>	2025-07-11 16:59:59 -04:00
salman	03b2a113fe	Update doc preview workflow to use sticky comments (#2873 )	2025-07-11 14:08:35 +01:00
NanoCode012	9b95a625ab	feat: add devstral small 2507 (#2896 ) * feat: add devstral small 2507 * chore: update blog doc	2025-07-11 09:34:19 +07:00
Wing Lian	c370d0795c	[doc] Fix docs for text field mapping for completion datasets (#2890 ) * Fix docs for text field mapping for completion datasets * update another reference	2025-07-09 14:52:44 -04:00
Wing Lian	76aeb16156	tiled_mlp supports single gpu (#2891 ) * tiled_mlp supports single gpu * use checkpoint offloading for arctic training * patch torch checkpoint too * support for single gpu zero3 * add linkback to where it was copied from	2025-07-09 12:48:22 -04:00
Wing Lian	7c5ea0010f	bump dev version (#2889 ) [skip ci]	2025-07-09 09:43:42 -04:00
Wing Lian	c6d69d5c1b	release v0.11.0 (#2875 ) Some checks failed ci-cd / build-axolotl (<nil>, 126, 12.6.3, 3.11, 2.6.0) (push) Has been cancelled Details ci-cd / build-axolotl (<nil>, 126, 12.6.3, 3.11, 2.7.1) (push) Has been cancelled Details ci-cd / build-axolotl (<nil>, 128, 12.8.1, 3.11, 2.7.1) (push) Has been cancelled Details ci-cd / build-axolotl (vllm, 126, 12.6.3, 3.11, 2.7.0) (push) Has been cancelled Details publish pypi / Create Release (push) Has been cancelled Details ci-cd / build-axolotl-cloud (<nil>, 126, 12.6.3, 3.11, 2.7.0) (push) Has been cancelled Details ci-cd / build-axolotl-cloud (<nil>, 126, 12.6.3, 3.11, 2.7.1) (push) Has been cancelled Details ci-cd / build-axolotl-cloud (<nil>, 126, 12.6.3, true, 3.11, 2.6.0) (push) Has been cancelled Details ci-cd / build-axolotl-cloud (<nil>, 128, 12.8.1, 3.11, 2.7.1) (push) Has been cancelled Details ci-cd / build-axolotl-cloud-no-tmux (<nil>, 126, 12.6.3, 3.11, 2.6.0) (push) Has been cancelled Details publish pypi / Upload release to PyPI (push) Has been cancelled Details * release v0.11.0 * don't build vllm into release for now * remove 2.5.1 references * smollm3 multipack support * fix ordering of e2e tests	2025-07-09 09:22:35 -04:00
Wing Lian	4ff96a2526	fix xformers version (#2888 )	2025-07-09 08:43:40 -04:00
salman	89e99eaaa7	slowest durations (#2887 ) [skip ci]	2025-07-09 08:43:26 -04:00
Wing Lian	6ed501f6dc	add 2.7.0 torch images back to support vlllm (#2885 )	2025-07-08 16:28:14 -04:00
NanoCode012	8c6a6ea6eb	Feat: add devstral model support (#2880 ) [skip ci] * fix: do not add training and training_detail block by default * fixed: magistral docs * fix: address pad adding new fields and use built-in from_openai * feat: try enable multiprocessing * fix: check for keys before deleting attn_mask * feat: add mistral pad test * feat: add tool calling test * feat: add devstral tokenizer tests * fix: comma format * chore: remove unused support_preprocessing as tokenizer is pickable now * chore: update magistral doc * feat: add devstral readme and example * chore: refactor error handling	2025-07-08 11:01:19 -04:00
NanoCode012	78bff4925e	fix: set add_generation_prompt to False when apply chat template (#2859 ) [skip ci]	2025-07-08 11:00:44 -04:00
NanoCode012	b237c8a3f3	chore: update cce commit to include gemma3n fixes (#2881 ) [skip ci]	2025-07-08 10:59:35 -04:00
float-trip	1032e22650	Fix link in FSDP + QLoRA docs. (#2879 ) [skip ci]	2025-07-08 09:19:09 -04:00
Wing Lian	d68cc1e8ab	densemixer plugin integration (#2868 ) * densemixer plugin integration * update readme with usage docs * automatically find new integrations that aren't explicitly defined * make sure to import os	2025-07-07 17:05:19 -04:00
github-actions[bot]	21f1bf4805	chore: update pre-commit hooks (#2870 ) [skip ci] * chore: update pre-commit hooks * don't bandit huggingface hub downloads without revision --------- Co-authored-by: djsaunde <1245942+djsaunde@users.noreply.github.com> Co-authored-by: Wing Lian <wing@axolotl.ai>	2025-07-07 15:26:15 -04:00
Wing Lian	de2c5ba103	mark flaky geglu tests and add torch seed (#2876 ) [skip ci] * mark flaky geglu tests and add torch seed * restore accidental removal of seed	2025-07-07 15:24:16 -04:00
Wing Lian	9c0d7ee761	TiledMLP support (#2865 )	2025-07-07 15:23:49 -04:00
NanoCode012	22d4a838dc	feat(doc): add vllm and fa2 incompat error to faq (#2877 )	2025-07-07 14:13:37 -04:00
Wing Lian	a108e5db56	use latest version of cce fork for SP fix (#2871 ) [skip ci] * use latest version of cce fork for SP fix * latest sha to handle older transformers	2025-07-07 13:05:11 -04:00
Wing Lian	faff0cff41	manage jinja templates as nicely formatted files (#2795 ) * manage jinja templates as nicely formatted files * chore: lint * use path for templates relative to the module * fix template reformating * handle newlines in llama3 template * fix gemma3 jinja * fix templates * suport for passing jinja template file in yaml * handle file loading of jinja template outside of validation * fix typing and typo	2025-07-07 10:11:48 -04:00
Wing Lian	759cefb741	setup defaults for dataloader to ensure GPU is kept busy (#2632 ) [skip ci]	2025-07-07 10:10:58 -04:00
Wing Lian	69cd49a7aa	update transformers to 4.53.1 (#2844 ) [skip ci] * update transformers to 4.53.0 * remove attention_mask from signature columns if using packing * remove attention_mask column from dataloader * update signature of flash attn forward for ring attn patch * fix FSDP * patch ring-flash-attn with upstream signature fix * fix patch indentation level * fix the patch * add batch flattening smoke test with loss check that works in older transformers * fix patch * don't drop attention mask for flex * more fixes * patch create_causal_mask for packing w flex * global torch manual_seed fixture * tweak loss checks * fix patch and use single batch for flex * don't need to reload * fix causal mask patch * use transformers patch releasE * make sure env var is string * make sure to drop attention mask for flex w packing for latest transformers patch release * tweak loss * guard on signature columns before removing attention mask * bump loss * set remove isn't chainable * skip slow mistral test in 2.5.1	2025-07-07 09:35:22 -04:00
NanoCode012	5a961ecadf	Fix: do not call preprocess in multimodal or pretraining case (#2861 ) * fix: let users know to not call preprocess for vision mode * fix: improve ux for pretraining dataset and skip prepare ds * feat: add info to doc * Update src/axolotl/cli/preprocess.py following comment Co-authored-by: salman <salman.mohammadi@outlook.com> --------- Co-authored-by: salman <salman.mohammadi@outlook.com>	2025-07-06 21:55:33 -04:00
Wing Lian	b37ddf9778	don't use tokenizer parallelism when using packing (#2862 ) [skip ci]	2025-07-06 21:55:09 -04:00
Wing Lian	bf38e507fb	respect shuffle_merged_datasets for single dataset too (#2866 ) [skip ci] * respect shuffle_merged_datasets for single dataset too * update inline comment for behavior Co-authored-by: NanoCode012 <nano@axolotl.ai> --------- Co-authored-by: NanoCode012 <nano@axolotl.ai>	2025-07-06 21:20:41 -04:00
Wing Lian	a5946ff1f0	build fa2 from source for base image with torch2.6 and cu124 (#2867 )	2025-07-05 09:21:18 -04:00
Wing Lian	70ca1b2291	fix nightlies to use correct cache (#2848 ) [skip ci] * fix nightlies to use correct cache * fix for handling None for bf16	2025-07-03 12:21:39 -04:00
NanoCode012	8ae5a2311b	feat: update handling for mistraltokenizer decode and multiprocessing pickling fix (#2790 ) * feat: update handling for mistraltokenizer decode * fix: update mistral common package version * fix: to use correct release * fix triton path --------- Co-authored-by: Wing Lian <wing@axolotl.ai>	2025-07-02 08:07:18 -04:00
NanoCode012	6383630155	Fix: tokenize stall due to not shuffling dataset (#2845 ) * fix: shuffle dataset even if only one to fix tokenize stall * fix: warn if shuffling merged with curriculum sampling * chore: refactor	2025-07-02 08:06:00 -04:00
Vincenzo di Cicco	f2b352f2e5	Add sample_packing_sequentially to trainer args (#2853 ) [skip ci]	2025-07-02 08:05:35 -04:00
NanoCode012	bf5928d0ee	feat(doc): update docker tag examples (#2851 ) [skip ci] * feat(doc): update docker tag examples * chore: comment	2025-07-02 08:05:01 -04:00
Dhruv Mullick	d1224db8f4	Decouple generate_during_eval from wandb to support other visualizers (#2849 ) [skip ci] * Add generate_during_eval for mlflow for dpo * Decouple generate_during_eval from wandb	2025-07-02 08:04:40 -04:00
mhenrichsen	327b4e48e9	Add installation instructions for pip and Docker to README.md (#2854 ) * Add installation instructions for pip and Docker to README.md * Enhance README.md with Docker installation guidance for improved setup reliability.	2025-07-02 09:03:52 +02:00
Dan Saunders	35fdbce102	Ensure device mesh patching is applied (#2842 ) * move patches; make patch stronger * fix broken tests * guard sequence_parallel_degree comparison against none --------- Co-authored-by: Wing Lian <wing@axolotl.ai>	2025-06-29 22:16:32 -04:00
Wing Lian	cb811f8bf1	upgrade to flash-attn 2.8.0.post2 (#2828 ) * upgrade to flash-attn 2.8.0.post2 * use cu126 with torch 2.6 * seems vllm 0.8.5.post1 not compatible with cuda12.6.3 and torch 2.6 * cu126 + torch 2.6 as the default * use cu126 for multigpu w torch 2.6 too * drop vllm for now from ci for now	2025-06-29 22:11:16 -04:00
Wing Lian	7563e1bd30	set a different triton cache for each test to avoid blocking writes to cache (#2843 ) * set a different triton cache for each test to avoid blocking writes to cache * set log level * disable debug logging for filelock	2025-06-29 22:05:21 -04:00
Wing Lian	81893c775c	Accelerate 1.8.1 and BNB 0.46.0 update (#2815 ) * update accelerate to v1.8.0 * update bnb also * fix multigpu ci timeout * fix test set size * use latest accelerate 1.8.1 * disable default dtype	2025-06-28 15:29:19 -04:00
Wing Lian	a1a740608d	add assertion for packing patch to _get_unpad_data (#2840 )	2025-06-27 11:20:23 -04:00
kallewoof	ec15a7a691	Support --lora-on-cpu flag for DPO model merging (#2766 ) [skip ci] * Support --lora-on-cpu flag for DPO model merging * fix: use device=cpu in _convert_embedding_modules_dtype when lora_on_cpu is set	2025-06-27 11:19:24 -04:00
Wing Lian	0a7a216b60	allow for different sequence_len for evaluations (#2836 ) [skip ci] * allow for different sequence_len for evaluations * reversed 🤦 * add more information to filter msg	2025-06-27 11:02:51 -04:00
NanoCode012	d8280d45c1	feat: add chat_template kwargs (#2837 )	2025-06-27 10:38:46 -04:00
Wing Lian	24f2887e87	don't fail during preprocess for sampling from iterable dataset (#2825 ) [skip ci]	2025-06-27 10:37:53 -04:00
NanoCode012	29289a4de9	feat: replace old colab notebook with newer one (#2838 ) [skip ci] * feat: replace old colab notebook with newer one * fix: point to update cce fork	2025-06-27 10:35:47 -04:00
Wing Lian	a24957fa04	fix for iterable datasets and pickling (#2831 ) [skip ci] * fix for iterable datasets and pickling * more fixes for pretraining * can't pickle mock generator dataset	2025-06-27 10:35:23 -04:00
NanoCode012	927bf530bc	fix(doc): default messages example used wrong key (#2832 ) * fix(doc): default messages example used wrong key * feat: add links to SP, multi-gpu, multi-node on readme	2025-06-26 10:47:31 -04:00
github-actions[bot]	18954ba100	chore: update pre-commit hooks (#2821 ) [skip ci] Co-authored-by: djsaunde <1245942+djsaunde@users.noreply.github.com>	2025-06-26 10:46:53 -04:00
Wing Lian	d8cf66edbd	use fork for multiprocess start method for packing in parallel (#2830 )	2025-06-25 13:17:33 -04:00