diff --git a/.axolotl-complete.bash b/.axolotl-complete.bash new file mode 100644 index 000000000..9a51399e6 --- /dev/null +++ b/.axolotl-complete.bash @@ -0,0 +1,41 @@ +#!/bin/bash + +_axolotl_completions() { + local cur prev + COMPREPLY=() + cur="${COMP_WORDS[COMP_CWORD]}" + prev="${COMP_WORDS[COMP_CWORD-1]}" + + # If we're completing the first argument (the command) + if [[ $COMP_CWORD -eq 1 ]]; then + mapfile -t COMPREPLY < <(compgen -W "delinearize-llama4 fetch lm-eval merge-sharded-fsdp-weights quantize vllm-serve evaluate inference merge-lora preprocess train" -- "$cur") + return 0 + fi + + # Commands that should complete with directories and YAML files + local -a yaml_commands=("merge-sharded-fsdp-weights" "quantize" "vllm-serve" "evaluate" "inference" "merge-lora" "preprocess" "train") + + # Check if previous word is in our list + if [[ " ${yaml_commands[*]} " =~ (^|[[:space:]])$prev($|[[:space:]]) ]]; then + # Use filename completion which handles directories properly + compopt -o filenames + mapfile -t COMPREPLY < <(compgen -f -- "$cur") + + # Filter to only include directories and YAML files + local -a filtered=() + for item in "${COMPREPLY[@]}"; do + if [[ -d "$item" ]] || [[ "$item" == *.yaml ]] || [[ "$item" == *.yml ]]; then + filtered+=("$item") + fi + done + COMPREPLY=("${filtered[@]}") + + return 0 + fi + + # Default: no completion + return 0 +} + +# Remove the -o nospace option - let filenames handle it +complete -F _axolotl_completions axolotl diff --git a/.bandit b/.bandit index 2d81286ae..82e88e814 100644 --- a/.bandit +++ b/.bandit @@ -1,3 +1,3 @@ [bandit] exclude = tests -skips = B101 +skips = B101,B615 diff --git a/.coderabbit.yaml b/.coderabbit.yaml new file mode 100644 index 000000000..95c044f02 --- /dev/null +++ b/.coderabbit.yaml @@ -0,0 +1,16 @@ +# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json +language: "en-US" +early_access: false +reviews: + profile: "chill" + request_changes_workflow: false + high_level_summary: true + review_status: true + collapse_walkthrough: true + poem: false + sequence_diagrams: false + auto_review: + enabled: true + drafts: false +chat: + auto_reply: true diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 9e19114d7..160ed7df9 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -5,65 +5,83 @@ on: branches: - "main" paths: - - 'Dockerfile-base' + - 'docker/Dockerfile-base' + - 'docker/Dockerfile-uv-base' - '.github/workflows/base.yml' pull_request: paths: - - 'Dockerfile-base' + - 'docker/Dockerfile-base' + - 'docker/Dockerfile-uv-base' - '.github/workflows/base.yml' workflow_dispatch: jobs: build-base: - if: github.repository_owner == 'axolotl-ai-cloud' + if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }} + timeout-minutes: 480 # this job needs to be run on self-hosted GPU runners... 
- runs-on: axolotl-gpu-runner + runs-on: ubuntu-latest-m strategy: fail-fast: false matrix: include: - - cuda: "124" - cuda_version: 12.4.1 - cudnn_version: "" - python_version: "3.11" - pytorch: 2.5.1 - torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" - cuda: "124" cuda_version: 12.4.1 cudnn_version: "" python_version: "3.11" pytorch: 2.6.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-base" - cuda: "126" cuda_version: 12.6.3 cudnn_version: "" python_version: "3.11" pytorch: 2.6.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-base" - cuda: "126" cuda_version: 12.6.3 cudnn_version: "" python_version: "3.11" pytorch: 2.7.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" - - cuda: "128" + dockerfile: "Dockerfile-base" + - cuda: "126" cuda_version: 12.6.3 cudnn_version: "" python_version: "3.11" - pytorch: 2.7.0 + pytorch: 2.7.1 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-base" - cuda: "128" cuda_version: 12.8.1 cudnn_version: "" python_version: "3.11" - pytorch: nightly + pytorch: 2.7.1 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-base" - cuda: "128" cuda_version: 12.8.1 cudnn_version: "" python_version: "3.11" - pytorch: next + pytorch: 2.8.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-base" +# - cuda: "128" +# cuda_version: 12.8.1 +# cudnn_version: "" +# python_version: "3.11" +# pytorch: nightly +# torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" +# dockerfile: "Dockerfile-base-nightly" +# # "next" is for release candidates of pytorch +# - cuda: "128" +# cuda_version: 12.8.1 +# cudnn_version: "" +# python_version: "3.11" +# pytorch: next +# torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" +# dockerfile: "Dockerfile-base-next" steps: - name: Checkout uses: actions/checkout@v4 @@ -85,7 +103,74 @@ jobs: uses: docker/build-push-action@v4 with: context: . 
- file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || matrix.pytorch == 'next' && './docker/Dockerfile-base-next' || './docker/Dockerfile-base' }} + file: ./docker/${{ matrix.dockerfile }} + push: ${{ github.event_name != 'pull_request' }} + tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} + labels: ${{ steps.metadata.outputs.labels }} + build-args: | + CUDA_VERSION=${{ matrix.cuda_version }} + CUDNN_VERSION=${{ matrix.cudnn_version }} + CUDA=${{ matrix.cuda }} + PYTHON_VERSION=${{ matrix.python_version }} + PYTORCH_VERSION=${{ matrix.pytorch }} + TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }} + build-base-uv: + if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }} + timeout-minutes: 480 + runs-on: ubuntu-latest-m + strategy: + fail-fast: false + matrix: + include: + - cuda: "126" + cuda_version: 12.6.3 + cudnn_version: "" + python_version: "3.11" + pytorch: 2.6.0 + torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-uv-base" + - cuda: "126" + cuda_version: 12.6.3 + cudnn_version: "" + python_version: "3.11" + pytorch: 2.7.1 + torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-uv-base" + - cuda: "128" + cuda_version: 12.8.1 + cudnn_version: "" + python_version: "3.11" + pytorch: 2.7.1 + torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-uv-base" + - cuda: "128" + cuda_version: 12.8.1 + cudnn_version: "" + python_version: "3.11" + pytorch: 2.8.0 + torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-uv-base" + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Docker metadata + id: metadata + uses: docker/metadata-action@v5 + with: + images: | + axolotlai/axolotl-base-uv + - name: Login to Docker Hub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Build + uses: docker/build-push-action@v4 + with: + context: . + file: ./docker/${{ matrix.dockerfile }} push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} labels: ${{ steps.metadata.outputs.labels }} diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 2d3c209cc..5b5cc5489 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -23,7 +23,7 @@ jobs: - name: Install dependencies run: | python3 -m pip install jupyter quartodoc - python3 -m pip install -e . --no-deps + python3 -m pip install -e . 
- name: Build autodoc run: quartodoc build - name: Publish to GitHub Pages (and render) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index d85892b43..cf322f105 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -3,18 +3,21 @@ on: # check on PRs, and manual triggers merge_group: pull_request: + types: [opened, synchronize, reopened, ready_for_review] paths: - '**.py' - 'requirements.txt' - '.github/workflows/*.yml' - "*.[q]md" - "examples/**/*.y[a]?ml" + - ".pre-commit-config.yaml" workflow_dispatch: jobs: pre-commit: name: pre-commit runs-on: ubuntu-latest + if: ${{ !github.event.pull_request.draft }} steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 01606f902..3daf39e43 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -15,26 +15,26 @@ jobs: fail-fast: false matrix: include: - - cuda: 124 - cuda_version: 12.4.1 - python_version: "3.11" - pytorch: 2.5.1 - axolotl_extras: - - cuda: 124 - cuda_version: 12.4.1 + - cuda: 126 + cuda_version: 12.6.3 python_version: "3.11" pytorch: 2.6.0 - axolotl_extras: vllm - is_latest: true + axolotl_extras: - cuda: 126 cuda_version: 12.6.3 python_version: "3.11" pytorch: 2.7.0 axolotl_extras: + - cuda: 126 + cuda_version: 12.6.3 + python_version: "3.11" + pytorch: 2.7.1 + axolotl_extras: vllm + is_latest: true - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" - pytorch: 2.7.0 + pytorch: 2.7.1 axolotl_extras: runs-on: axolotl-gpu-runner steps: @@ -83,26 +83,32 @@ jobs: strategy: matrix: include: - - cuda: 124 - cuda_version: 12.4.1 - python_version: "3.11" - pytorch: 2.5.1 - axolotl_extras: - - cuda: 124 - cuda_version: 12.4.1 + - cuda: 126 + cuda_version: 12.6.3 python_version: "3.11" pytorch: 2.6.0 axolotl_extras: - is_latest: true - cuda: 126 cuda_version: 12.6.3 python_version: "3.11" pytorch: 2.7.0 axolotl_extras: + - cuda: 126 + cuda_version: 12.6.3 + python_version: "3.11" + pytorch: 2.7.1 + axolotl_extras: + is_latest: + - cuda: 126 + cuda_version: 12.6.3 + python_version: "3.11" + pytorch: 2.7.1 + axolotl_extras: vllm + is_latest: true - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" - pytorch: 2.7.0 + pytorch: 2.7.1 axolotl_extras: runs-on: axolotl-gpu-runner steps: @@ -146,11 +152,23 @@ jobs: strategy: matrix: include: - - cuda: 124 - cuda_version: 12.4.1 + - cuda: 126 + cuda_version: 12.6.3 python_version: "3.11" pytorch: 2.6.0 axolotl_extras: + - cuda: 126 + cuda_version: 12.6.3 + python_version: "3.11" + pytorch: 2.7.1 + axolotl_extras: + is_latest: + - cuda: 126 + cuda_version: 12.6.3 + python_version: "3.11" + pytorch: 2.7.1 + axolotl_extras: vllm + is_latest: true runs-on: axolotl-gpu-runner steps: - name: Checkout diff --git a/.github/workflows/multi-gpu-e2e.yml b/.github/workflows/multi-gpu-e2e.yml index 8c7692d13..308526151 100644 --- a/.github/workflows/multi-gpu-e2e.yml +++ b/.github/workflows/multi-gpu-e2e.yml @@ -21,22 +21,15 @@ concurrency: jobs: test-axolotl-multigpu: - if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }} + if: ${{ ! 
contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }} strategy: fail-fast: false matrix: include: - - cuda: 124 - cuda_version: 12.4.1 + - cuda: 126 + cuda_version: 12.6.3 python_version: "3.11" pytorch: 2.6.0 - axolotl_extras: vllm - num_gpus: 2 - nightly_build: "true" - - cuda: 124 - cuda_version: 12.4.1 - python_version: "3.11" - pytorch: 2.5.1 axolotl_extras: num_gpus: 2 nightly_build: "true" @@ -47,6 +40,13 @@ jobs: axolotl_extras: num_gpus: 2 nightly_build: "true" + - cuda: 126 + cuda_version: 12.6.3 + python_version: "3.11" + pytorch: 2.7.1 + axolotl_extras: vllm + num_gpus: 2 + nightly_build: "true" runs-on: [self-hosted, modal] timeout-minutes: 120 steps: @@ -59,7 +59,7 @@ jobs: - name: Install Modal run: | python -m pip install --upgrade pip - pip install modal==0.71.8 jinja2 + pip install modal==1.0.2 jinja2 - name: Update env vars run: | echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV diff --git a/.github/workflows/nightlies.yml b/.github/workflows/nightlies.yml index 4e61984fb..49bce470b 100644 --- a/.github/workflows/nightlies.yml +++ b/.github/workflows/nightlies.yml @@ -12,16 +12,16 @@ jobs: fail-fast: false matrix: include: - - cuda: 124 - cuda_version: 12.4.1 - python_version: "3.11" - pytorch: 2.5.1 - axolotl_extras: - - cuda: 124 - cuda_version: 12.4.1 + - cuda: 126 + cuda_version: 12.6.3 python_version: "3.11" pytorch: 2.6.0 axolotl_extras: + - cuda: 126 + cuda_version: 12.6.3 + python_version: "3.11" + pytorch: 2.7.1 + axolotl_extras: runs-on: axolotl-gpu-runner steps: - name: Checkout @@ -65,16 +65,16 @@ jobs: strategy: matrix: include: - - cuda: 124 - cuda_version: 12.4.1 - python_version: "3.11" - pytorch: 2.5.1 - axolotl_extras: - - cuda: 124 - cuda_version: 12.4.1 + - cuda: 126 + cuda_version: 12.6.3 python_version: "3.11" pytorch: 2.6.0 axolotl_extras: + - cuda: 126 + cuda_version: 12.6.3 + python_version: "3.11" + pytorch: 2.7.1 + axolotl_extras: runs-on: axolotl-gpu-runner steps: - name: Checkout diff --git a/.github/workflows/precommit-autoupdate.yml b/.github/workflows/precommit-autoupdate.yml index 921742211..10330f955 100644 --- a/.github/workflows/precommit-autoupdate.yml +++ b/.github/workflows/precommit-autoupdate.yml @@ -25,7 +25,6 @@ jobs: pre-commit autoupdate if [[ -n $(git status --porcelain) ]]; then echo "changes=true" >> $GITHUB_OUTPUT - git diff .pre-commit-config.yaml > pre-commit-update.diff fi - name: Create Pull Request @@ -39,11 +38,3 @@ jobs: commit-message: "chore: update pre-commit hooks" body: | Automated PR to update pre-commit hooks to their latest versions. - -
- Changes: - - ```diff - ${{ steps.update.outputs.diff }} - ``` -
diff --git a/.github/workflows/preview-docs.yml b/.github/workflows/preview-docs.yml index 5af70b0dc..db4abddce 100644 --- a/.github/workflows/preview-docs.yml +++ b/.github/workflows/preview-docs.yml @@ -2,13 +2,15 @@ name: Preview on: workflow_dispatch: pull_request: - types: [opened, synchronize, reopened] + types: [opened, synchronize, reopened, ready_for_review] # Run the workflow only when one of these files changes paths: - '**/*.md' # any Markdown file - '**/*.qmd' # any Quarto file - - '_quarto.yaml' + - '_quarto.yml' + - docs/scripts/generate_config_docs.py + - src/axolotl/utils/schemas/**.py permissions: checks: write @@ -23,9 +25,12 @@ permissions: jobs: preview: runs-on: ubuntu-latest + if: ${{ !github.event.pull_request.draft }} steps: - name: Check out repository uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} - name: Set up Quarto uses: quarto-dev/quarto-actions/setup@v2 @@ -38,7 +43,7 @@ jobs: - name: Install dependencies run: | python3 -m pip install jupyter quartodoc - python3 -m pip install -e . --no-deps + python3 -m pip install -e . - name: Build autodoc run: quartodoc build @@ -48,10 +53,12 @@ jobs: - name: Netlify Publish uses: nwtgck/actions-netlify@v3.0 + if: ${{ github.event.pull_request.head.repo.full_name == github.repository }} + id: netlify with: publish-dir: './_site' - enable-pull-request-comment: true - enable-github-deployment: true + enable-pull-request-comment: false + enable-github-deployment: false github-token: ${{ secrets.GITHUB_TOKEN }} deploy-message: "Deployed On Netlify" github-deployment-environment: 'preview' @@ -59,3 +66,13 @@ jobs: env: NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }} + + - name: Update PR with preview link + if: ${{ steps.netlify.outcome == 'success' }} + uses: marocchino/sticky-pull-request-comment@v2 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + message: | + 📖 **Documentation Preview**: ${{ steps.netlify.outputs.deploy-url }} + + Deployed on Netlify from commit ${{ github.event.pull_request.head.sha }} diff --git a/.github/workflows/tests-nightly.yml b/.github/workflows/tests-nightly.yml index 539f7f71b..fc6c2b396 100644 --- a/.github/workflows/tests-nightly.yml +++ b/.github/workflows/tests-nightly.yml @@ -18,116 +18,26 @@ jobs: env: SKIP: no-commit-to-branch - preload-cache: - name: Preload HF cache - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python_version: ["3.11"] - pytorch_version: ["2.6.0"] - timeout-minutes: 20 - - env: - AXOLOTL_IS_CI_CACHE_PRELOAD: "1" - - steps: - - name: Check out repository code - uses: actions/checkout@v4 - - - name: Restore HF cache - id: hf-cache-restore - uses: actions/cache/restore@v4 - with: - path: | - /home/runner/.cache/huggingface/hub/datasets--* - /home/runner/.cache/huggingface/hub/models--* - key: ${{ runner.os }}-hf-hub-cache-v2 - - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python_version }} - cache: 'pip' # caching pip dependencies - - - name: upgrade pip - run: | - pip3 install --upgrade pip - pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel - - - name: Install PyTorch - run: | - pip3 install torch==${{ matrix.pytorch_version }} - - - name: Install dependencies - run: | - pip3 show torch - pip3 install --no-build-isolation -U -e . 
- python scripts/unsloth_install.py | sh - python scripts/cutcrossentropy_install.py | sh - pip3 install -r requirements-dev.txt -r requirements-tests.txt - - - name: Make sure PyTorch version wasn't clobbered - run: | - python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__" - - - name: Ensure axolotl CLI was installed - run: | - axolotl --help - - - name: Pre-Download dataset fixture - run: | - huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures - - - name: Run tests - run: | - pytest -v tests/conftest.py - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v5 - with: - token: ${{ secrets.CODECOV_TOKEN }} - files: ./coverage.xml - flags: unittests,pytorch-${{ matrix.pytorch_version }} - fail_ci_if_error: false - - - name: cleanup pip cache - run: | - find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \; - - - name: Save HF cache - id: hf-cache - uses: actions/cache/save@v4 - with: - path: | - /home/runner/.cache/huggingface/hub/datasets--* - /home/runner/.cache/huggingface/hub/models--* - key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }} - pytest: name: PyTest runs-on: ubuntu-latest - needs: [preload-cache] strategy: fail-fast: false max-parallel: 2 matrix: python_version: ["3.11"] - pytorch_version: ["2.5.1", "2.6.0", "2.7.0"] + pytorch_version: ["2.6.0", "2.7.0"] timeout-minutes: 20 steps: - name: Check out repository code uses: actions/checkout@v4 - - name: Restore HF cache - id: hf-cache-restore - uses: actions/cache/restore@v4 - with: - path: | - /home/runner/.cache/huggingface/hub/datasets--* - /home/runner/.cache/huggingface/hub/models--* - key: ${{ runner.os }}-hf-hub-cache-v2 + - name: Restore Cache from S3 + id: hf-cache-restore-s3 + run: | + mkdir -p /home/runner/.cache/huggingface/hub + curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd - name: Setup Python uses: actions/setup-python@v5 @@ -142,7 +52,7 @@ jobs: - name: Install PyTorch run: | - pip3 install torch==${{ matrix.pytorch_version }} + pip3 install torch==${{ matrix.pytorch_version }} torchvision - name: Update requirements.txt run: | @@ -168,15 +78,11 @@ jobs: run: | axolotl --help - - name: Pre-Download dataset fixture - run: | - huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures - - name: Run tests run: | - pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ - pytest -v tests/patched/ - pytest -v tests/cli/ + pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ + pytest -v --durations=10 tests/patched/ + pytest -v --durations=10 tests/cli/ - name: cleanup pip cache run: | @@ -186,24 +92,24 @@ jobs: if: github.repository_owner == 'axolotl-ai-cloud' # this job needs to be run on self-hosted GPU runners... 
runs-on: [self-hosted, modal] - timeout-minutes: 60 + timeout-minutes: 120 needs: [pre-commit, pytest] strategy: fail-fast: false matrix: include: - - cuda: 124 - cuda_version: 12.4.1 + - cuda: 126 + cuda_version: 12.6.3 python_version: "3.11" - pytorch: 2.5.1 + pytorch: 2.6.0 num_gpus: 1 axolotl_extras: nightly_build: "true" - - cuda: 124 - cuda_version: 12.4.1 + - cuda: 126 + cuda_version: 12.6.3 python_version: "3.11" - pytorch: 2.6.0 + pytorch: 2.7.1 num_gpus: 1 axolotl_extras: nightly_build: "true" @@ -217,7 +123,7 @@ jobs: - name: Install Modal run: | python -m pip install --upgrade pip - pip install modal==0.71.8 jinja2 + pip install modal==1.0.2 jinja2 - name: Update env vars run: | echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV @@ -231,3 +137,45 @@ jobs: - name: Run tests job on Modal run: | modal run cicd.e2e_tests + docker-e2e-multigpu-tests: + if: github.repository_owner == 'axolotl-ai-cloud' + # this job needs to be run on self-hosted GPU runners... + runs-on: [self-hosted, modal] + timeout-minutes: 120 + needs: [pre-commit, pytest, docker-e2e-tests] + + strategy: + fail-fast: false + matrix: + include: + - cuda: 126 + cuda_version: 12.6.3 + python_version: "3.11" + pytorch: 2.7.1 + num_gpus: 2 + axolotl_extras: + nightly_build: "true" + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Install Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Install Modal + run: | + python -m pip install --upgrade pip + pip install modal==1.0.2 jinja2 + - name: Update env vars + run: | + echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV + echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV + echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV + echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV + echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV + echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV + echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV + echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV + - name: Run tests job on Modal + run: | + modal run cicd.multigpu diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 69f0a030d..912b3f1d6 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -13,6 +13,7 @@ on: - 'cicd/cicd.sh' - 'cicd/Dockerfile.jinja' pull_request: + types: [opened, synchronize, reopened, ready_for_review] paths: - '**.py' - 'requirements.txt' @@ -34,6 +35,7 @@ jobs: pre-commit: name: pre-commit runs-on: ubuntu-latest + if: ${{ !github.event.pull_request.draft }} steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 @@ -44,122 +46,22 @@ jobs: env: SKIP: no-commit-to-branch -# preload-cache: -# name: Preload HF cache -# runs-on: ubuntu-latest -# strategy: -# fail-fast: false -# matrix: -# python_version: ["3.11"] -# pytorch_version: ["2.6.0"] -# timeout-minutes: 20 -# -# env: -# AXOLOTL_IS_CI_CACHE_PRELOAD: "1" -# -# steps: -# - name: Check out repository code -# uses: actions/checkout@v4 -# -# - name: Restore HF cache -# id: hf-cache-restore -# uses: actions/cache/restore@v4 -# with: -# path: | -# /home/runner/.cache/huggingface/hub/datasets--* -# /home/runner/.cache/huggingface/hub/models--* -# key: ${{ runner.os }}-hf-hub-cache-v2 -# -# - name: Restore Cache from S3 -# id: hf-cache-restore-s3 -# run: | -# mkdir -p /home/runner/.cache/huggingface/hub -# curl -L 
https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd -# -# - name: Setup Python -# uses: actions/setup-python@v5 -# with: -# python-version: ${{ matrix.python_version }} -# cache: 'pip' # caching pip dependencies -# -# - name: upgrade pip -# run: | -# pip3 install --upgrade pip -# pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel -# -# - name: Install PyTorch -# run: | -# pip3 install torch==${{ matrix.pytorch_version }} -# -# - name: Install dependencies -# run: | -# pip3 show torch -# pip3 install --no-build-isolation -U -e . -# python scripts/unsloth_install.py | sh -# python scripts/cutcrossentropy_install.py | sh -# pip3 install -r requirements-dev.txt -r requirements-tests.txt -# -# - name: Make sure PyTorch version wasn't clobbered -# run: | -# python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__" -# -# - name: Ensure axolotl CLI was installed -# run: | -# axolotl --help -# -# - name: Pre-Download dataset fixture -# run: | -# huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures -# -# - name: Run tests -# run: | -# pytest -v tests/conftest.py -# -# - name: Upload coverage to Codecov -# uses: codecov/codecov-action@v5 -# with: -# token: ${{ secrets.CODECOV_TOKEN }} -# files: ./coverage.xml -# flags: unittests,pytorch-${{ matrix.pytorch_version }} -# fail_ci_if_error: false -# -# - name: cleanup pip cache -# run: | -# find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \; -# -# - name: Save HF cache -# id: hf-cache -# uses: actions/cache/save@v4 -# with: -# path: | -# /home/runner/.cache/huggingface/hub/datasets--* -# /home/runner/.cache/huggingface/hub/models--* -# key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }} - pytest: name: PyTest runs-on: ubuntu-latest + if: ${{ !github.event.pull_request.draft }} # needs: [preload-cache] strategy: fail-fast: false matrix: python_version: ["3.11"] - pytorch_version: ["2.5.1", "2.6.0", "2.7.0"] + pytorch_version: ["2.6.0", "2.7.0", "2.7.1"] timeout-minutes: 20 steps: - name: Check out repository code uses: actions/checkout@v4 -# - name: Restore HF cache -# id: hf-cache-restore -# uses: actions/cache/restore@v4 -# with: -# path: | -# /home/runner/.cache/huggingface/hub/datasets--* -# /home/runner/.cache/huggingface/hub/models--* -# key: ${{ runner.os }}-hf-hub-cache-v2 - - name: Restore Cache from S3 id: hf-cache-restore-s3 run: | @@ -179,7 +81,7 @@ jobs: - name: Install PyTorch run: | - pip3 install torch==${{ matrix.pytorch_version }} + pip3 install torch==${{ matrix.pytorch_version }} torchvision - name: Install dependencies run: | @@ -203,9 +105,10 @@ jobs: - name: Run tests run: | - pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ --cov=axolotl --cov-report=xml - pytest -v tests/patched/ --cov=axolotl --cov-append --cov-report=xml - pytest -v tests/cli/ --cov=axolotl --cov-append --cov-report=xml + pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml + pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml + pytest -v --durations=10 tests/patched/ --cov=axolotl --cov-append --cov-report=xml + pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml - name: Upload coverage to Codecov uses: codecov/codecov-action@v5 @@ -222,27 
+125,18 @@ jobs: pytest-sdist: name: PyTest from Source Dist runs-on: ubuntu-latest -# needs: [preload-cache] + if: ${{ !github.event.pull_request.draft }} strategy: fail-fast: false matrix: python_version: ["3.11"] - pytorch_version: ["2.5.1", "2.6.0", "2.7.0"] + pytorch_version: ["2.6.0", "2.7.0", "2.7.1"] timeout-minutes: 20 steps: - name: Check out repository code uses: actions/checkout@v4 -# - name: Restore HF cache -# id: hf-cache-restore -# uses: actions/cache/restore@v4 -# with: -# path: | -# /home/runner/.cache/huggingface/hub/datasets--* -# /home/runner/.cache/huggingface/hub/models--* -# key: ${{ runner.os }}-hf-hub-cache-v2 - - name: Restore Cache from S3 id: hf-cache-restore-s3 run: | @@ -262,7 +156,7 @@ jobs: - name: Install PyTorch run: | - pip3 install torch==${{ matrix.pytorch_version }} + pip3 install torch==${{ matrix.pytorch_version }} torchvision - name: Install dependencies run: | @@ -286,9 +180,9 @@ jobs: - name: Run tests run: | - pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ - pytest -v tests/patched/ - pytest -v tests/cli/ + pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml + pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml + pytest -v --durations=10 tests/cli/ - name: cleanup pip cache run: | @@ -296,22 +190,29 @@ jobs: docker-e2e-tests-1st: # Run this job first as a gate for running the remainder of the test matrix - if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }} + if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && !github.event.pull_request.draft }} # this job needs to be run on self-hosted GPU runners... runs-on: [self-hosted, modal] - timeout-minutes: 90 + timeout-minutes: 120 needs: [pre-commit, pytest, pytest-sdist] strategy: fail-fast: false matrix: include: - - cuda: 124 - cuda_version: 12.4.1 + - cuda: 126 + cuda_version: 12.6.3 + python_version: "3.11" + pytorch: 2.7.1 + num_gpus: 1 + axolotl_extras: + - cuda: 126 + cuda_version: 12.6.3 python_version: "3.11" pytorch: 2.6.0 num_gpus: 1 - axolotl_extras: vllm + axolotl_extras: + dockerfile: "Dockerfile-uv.jinja" steps: - name: Checkout uses: actions/checkout@v4 @@ -322,7 +223,7 @@ jobs: - name: Install Modal run: | python -m pip install --upgrade pip - pip install modal==0.71.8 jinja2 + pip install modal==1.0.2 jinja2 - name: Update env vars run: | echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV @@ -333,15 +234,16 @@ jobs: echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV + echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV - name: Run tests job on Modal run: | modal run cicd.e2e_tests docker-e2e-tests: - if: github.repository_owner == 'axolotl-ai-cloud' + if: ${{ github.repository_owner == 'axolotl-ai-cloud' && !github.event.pull_request.draft }} # this job needs to be run on self-hosted GPU runners... 
runs-on: [self-hosted, modal] - timeout-minutes: 90 + timeout-minutes: 120 # Only run the remainder of the matrix if the first e2e check passed; # this is to save on wasted compute costs for known failures that get caught in the first run needs: [pre-commit, pytest, docker-e2e-tests-1st] @@ -350,28 +252,16 @@ jobs: fail-fast: false matrix: include: - - cuda: 124 - cuda_version: 12.4.1 - python_version: "3.11" - pytorch: 2.6.0 - num_gpus: 1 - axolotl_extras: llmcompressor - - cuda: 124 - cuda_version: 12.4.1 - python_version: "3.11" - pytorch: 2.5.1 - num_gpus: 1 - axolotl_extras: - cuda: 126 cuda_version: 12.6.3 python_version: "3.11" - pytorch: 2.7.0 + pytorch: 2.6.0 num_gpus: 1 axolotl_extras: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" - pytorch: 2.7.0 + pytorch: 2.7.1 num_gpus: 1 axolotl_extras: steps: @@ -384,7 +274,7 @@ jobs: - name: Install Modal run: | python -m pip install --upgrade pip - pip install modal==0.71.8 jinja2 + pip install modal==1.0.2 jinja2 - name: Update env vars run: | echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV @@ -395,6 +285,7 @@ jobs: echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV + echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV - name: Run tests job on Modal run: | modal run cicd.e2e_tests @@ -403,6 +294,7 @@ jobs: runs-on: [self-hosted, modal] timeout-minutes: 90 needs: [docker-e2e-tests] + if: ${{ !github.event.pull_request.draft }} strategy: fail-fast: false @@ -413,7 +305,7 @@ jobs: python_version: "3.11" pytorch: 2.6.0 num_gpus: 1 - axolotl_extras: vllm + axolotl_extras: steps: - name: Checkout uses: actions/checkout@v4 @@ -424,7 +316,7 @@ jobs: - name: Install Modal run: | python -m pip install --upgrade pip - pip install modal==0.71.8 jinja2 + pip install modal==1.0.2 jinja2 - name: Update env vars run: | echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f627ec13f..4c9268529 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ default_language_version: repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v6.0.0 hooks: - id: check-yaml - id: end-of-file-fixer @@ -19,15 +19,15 @@ repos: hooks: - id: isort - repo: https://github.com/PyCQA/flake8 - rev: 7.1.2 + rev: 7.3.0 hooks: - id: flake8 - repo: https://github.com/pylint-dev/pylint - rev: v3.3.6 + rev: v3.3.8 hooks: - id: pylint - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.15.0 + rev: v1.17.1 hooks: - id: mypy additional_dependencies: @@ -36,7 +36,7 @@ repos: 'pydantic>=2.5.3', ] - repo: https://github.com/PyCQA/bandit - rev: 1.8.3 + rev: 1.8.6 hooks: - id: bandit args: [ diff --git a/.runpod/README.md b/.runpod/README.md index a631c3937..8042f4f91 100644 --- a/.runpod/README.md +++ b/.runpod/README.md @@ -119,14 +119,15 @@ datasets: ## Dataset Processing -| Option | Default | Description | -| ----------------------------- | -------------------------- | --------------------------------- | -| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset | -| `push_dataset_to_hub` | `""` | Push dataset to HF hub | -| `dataset_processes` | `4` | Number of preprocessing processes | -| `dataset_keep_in_memory` | `false` | Keep dataset in 
memory | -| `shuffle_merged_datasets` | `true` | Shuffle merged datasets | -| `dataset_exact_deduplication` | `true` | Deduplicate datasets | +| Option | Default | Description | +| --------------------------------- | -------------------------- | ----------------------------------- | +| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset | +| `push_dataset_to_hub` | `""` | Push dataset to HF hub | +| `dataset_processes` | `4` | Number of preprocessing processes | +| `dataset_keep_in_memory` | `false` | Keep dataset in memory | +| `shuffle_merged_datasets` | `true` | Shuffle merged datasets | +| `shuffle_before_merging_datasets` | `false` | Shuffle each dataset before merging | +| `dataset_exact_deduplication` | `true` | Deduplicate datasets | ## LoRA Configuration @@ -184,7 +185,6 @@ datasets: | `flash_attention` | `false` | Use flash attention | | `flash_attn_cross_entropy` | `false` | Flash attention cross entropy | | `flash_attn_rms_norm` | `false` | Flash attention RMS norm | -| `flash_attn_fuse_qkv` | `false` | Fuse QKV operations | | `flash_attn_fuse_mlp` | `false` | Fuse MLP operations | | `sdp_attention` | `false` | Use scaled dot product | | `s2_attention` | `false` | Use shifted sparse attention | @@ -328,7 +328,7 @@ The following optimizers are supported: - Use `gradient_checkpointing: true` to reduce memory usage - Adjust `micro_batch_size` and `gradient_accumulation_steps` based on your GPU memory -For more detailed information, please refer to the [documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html). +For more detailed information, please refer to the [documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/config-reference.html). ### Errors: diff --git a/.runpod/src/config/config.yaml b/.runpod/src/config/config.yaml index 4dff37cae..f482a7331 100644 --- a/.runpod/src/config/config.yaml +++ b/.runpod/src/config/config.yaml @@ -97,7 +97,7 @@ # # 'no_input_format' cannot include {input} # no_input_format: "{instruction} " -# # For `completion` datsets only, uses the provided field instead of `text` column +# # For `completion` datasets only, uses the provided field instead of `text` column # field: # # Axolotl attempts to save the dataset as an arrow after packing the data together so @@ -242,16 +242,12 @@ # early_stopping_patience: 3 # # Specify a scheduler and kwargs to use with the optimizer -# lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine +# lr_scheduler: # 'one_cycle' | empty for cosine # lr_scheduler_kwargs: # # For one_cycle optim # lr_div_factor: # Learning rate div factor -# # For log_sweep optim -# log_sweep_min_lr: -# log_sweep_max_lr: - # # Specify optimizer # # Valid values are driven by the Transformers OptimizerNames class, see: # # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134 @@ -300,7 +296,6 @@ # flash_attention: # flash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only # flash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use only -# flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation # flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation # # Whether to use scaled-dot-product attention # # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html @@ -545,7 +540,6 @@ xformers_attention: ${XFORMERS_ATTENTION} flash_attention: 
${FLASH_ATTENTION} flash_attn_cross_entropy: ${FLASH_ATTN_CROSS_ENTROPY} flash_attn_rms_norm: ${FLASH_ATTN_RMS_NORM} -flash_attn_fuse_qkv: ${FLASH_ATTN_FUSE_QKV} flash_attn_fuse_mlp: ${FLASH_ATTN_FUSE_MLP} sdp_attention: ${SDP_ATTENTION} s2_attention: ${S2_ATTENTION} diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 000000000..e6ecc7cb8 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,10 @@ +cff-version: 1.2.0 +type: software +title: "Axolotl: Post-Training for AI Models" +message: "If you use this software, please cite it as below." +authors: + - name: "Axolotl maintainers and contributors" +repository-code: "https://github.com/axolotl-ai-cloud/axolotl" +url: "https://axolotl.ai/" +license: Apache-2.0 +date-released: "2023-05-30" diff --git a/MANIFEST.in b/MANIFEST.in index 99324be3c..3fbb0edca 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,4 +2,5 @@ include requirements.txt include README.md include LICENSE include src/setuptools_axolotl_dynamic_dependencies.py +include src/axolotl/utils/chat_templates/templates/*.jinja recursive-include axolotl *.py diff --git a/README.md b/README.md index 56e45e3fe..117eb9b12 100644 --- a/README.md +++ b/README.md @@ -22,28 +22,45 @@ multigpu-semi-weekly tests

-Axolotl is a tool designed to streamline post-training for various AI models.
-Post-training refers to any modifications or additional training performed on
-pre-trained models - including full model fine-tuning, parameter-efficient tuning (like
-LoRA and QLoRA), supervised fine-tuning (SFT), instruction tuning, and alignment
-techniques. With support for multiple model architectures and training configurations,
-Axolotl makes it easy to get started with these techniques.
-Axolotl is designed to work with YAML config files that contain everything you need to
-preprocess a dataset, train or fine-tune a model, run model inference or evaluation,
-and much more.
+## 🎉 Latest Updates
+
+- 2025/07:
+  - ND Parallelism support has been added to Axolotl. Compose Context Parallelism (CP), Tensor Parallelism (TP), and Fully Sharded Data Parallelism (FSDP) within a single node and across multiple nodes. Check out the [blog post](https://huggingface.co/blog/accelerate-nd-parallel) for more info.
+  - Axolotl adds more models: [GPT-OSS](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/gpt-oss), [Gemma 3n](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/gemma3n), [Liquid Foundation Model 2 (LFM2)](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/lfm2), and [Arcee Foundation Models (AFM)](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/afm).
+  - FP8 finetuning with an FP8 gather op is now possible in Axolotl via `torchao`. Get started [here](https://docs.axolotl.ai/docs/mixed_precision.html#sec-fp8)!
+  - [Voxtral](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/voxtral), [Magistral 1.1](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral), and [Devstral](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/devstral) with mistral-common tokenizer support have been integrated into Axolotl!
+  - TiledMLP support for single-GPU to multi-GPU training with DDP, DeepSpeed, and FSDP has been added to support Arctic Long Sequence Training (ALST). See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst) for using ALST with Axolotl!
+- 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
+- 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning.
+
+
+<details>
+
+<summary>Expand older updates</summary>
+
+- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral) to start training your own Magistral models with Axolotl!
+- 2025/04: Llama 4 support has been added in Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-4) to start training your own Llama 4 models with Axolotl's linearized version!
+- 2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the [docs](https://docs.axolotl.ai/docs/multimodal.html) to fine-tune your own!
+- 2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the [docs](https://docs.axolotl.ai/docs/lora_optims.html) to give it a try.
+- 2025/02: Axolotl has added GRPO support. Dive into our [blog](https://huggingface.co/blog/axolotl-ai-co/training-llms-w-interpreter-feedback-wasm) and [GRPO example](https://github.com/axolotl-ai-cloud/grpo_code) and have some fun!
+- 2025/01: Axolotl has added Reward Modelling / Process Reward Modelling fine-tuning support. See [docs](https://docs.axolotl.ai/docs/reward_modelling.html).
+
+</details>
+ +## ✨ Overview + +Axolotl is a tool designed to streamline post-training for various AI models. Features: -- Train various Huggingface models such as llama, pythia, falcon, mpt -- Supports fullfinetune, lora, qlora, relora, and gptq -- Customize configurations using a simple yaml file or CLI overwrite -- Load different dataset formats, use custom formats, or bring your own tokenized datasets -- Integrated with [xformers](https://github.com/facebookresearch/xformers), flash attention, [liger kernel](https://github.com/linkedin/Liger-Kernel), rope scaling, and multipacking -- Works with single GPU or multiple GPUs via FSDP or Deepspeed -- Easily run with Docker locally or on the cloud -- Log results and optionally checkpoints to wandb, mlflow or Comet -- And more! +- **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more. We are compatible with HuggingFace transformers causal language models. +- **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO), Multimodal, and Reward Modelling (RM) / Process Reward Modelling (PRM). +- **Easy Configuration**: Re-use a single YAML file between dataset preprocess, training, evaluation, quantization, and inference. +- **Performance Optimizations**: [Multipacking](https://docs.axolotl.ai/docs/multipack.html), [Flash Attention](https://github.com/Dao-AILab/flash-attention), [Xformers](https://github.com/facebookresearch/xformers), [Flex Attention](https://pytorch.org/blog/flexattention/), [Liger Kernel](https://github.com/linkedin/Liger-Kernel), [Cut Cross Entropy](https://github.com/apple/ml-cross-entropy/tree/main), [Sequence Parallelism (SP)](https://docs.axolotl.ai/docs/sequence_parallelism.html), [LoRA optimizations](https://docs.axolotl.ai/docs/lora_optims.html), [Multi-GPU training (FSDP1, FSDP2, DeepSpeed)](https://docs.axolotl.ai/docs/multi-gpu.html), [Multi-node training (Torchrun, Ray)](https://docs.axolotl.ai/docs/multi-node.html), and many more! +- **Flexible Dataset Handling**: Load from local, HuggingFace, and cloud (S3, Azure, GCP, OCI) datasets. +- **Cloud Ready**: We ship [Docker images](https://hub.docker.com/u/axolotlai) and also [PyPI packages](https://pypi.org/project/axolotl/) for use on cloud platforms and local hardware. + + ## 🚀 Quick Start @@ -51,10 +68,12 @@ Features: - NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU - Python 3.11 -- PyTorch ≥2.4.1 +- PyTorch ≥2.6.0 ### Installation +#### Using pip + ```bash pip3 install -U packaging==23.2 setuptools==75.8.0 wheel ninja pip3 install --no-build-isolation axolotl[flash-attn,deepspeed] @@ -64,8 +83,29 @@ axolotl fetch examples axolotl fetch deepspeed_configs # OPTIONAL ``` +#### Using Docker + +Installing with Docker can be less error prone than installing in your own environment. +```bash +docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest +``` + Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html). +#### Cloud Providers + +
+ +- [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz) +- [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=github&utm_medium=developer_community&utm_campaign=template_launch_axolotl&utm_content=readme) +- [PRIME Intellect](https://app.primeintellect.ai/dashboard/create-cluster?image=axolotl&location=Cheapest&security=Cheapest&show_spot=true) +- [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl) +- [Novita](https://novita.ai/gpus-console?templateId=311) +- [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl) +- [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c) + +
+ ### Your First Fine-tune ```bash @@ -81,19 +121,12 @@ axolotl train examples/llama-3/lora-1b.yml That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough. -## ✨ Key Features - -- **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more -- **Training Methods**: Full fine-tuning, LoRA, QLoRA, and more -- **Easy Configuration**: Simple YAML files to control your training setup -- **Performance Optimizations**: Flash Attention, xformers, multi-GPU training -- **Flexible Dataset Handling**: Use various formats and custom datasets -- **Cloud Ready**: Run on cloud platforms or local hardware ## 📚 Documentation - [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments -- [Configuration Guide](https://docs.axolotl.ai/docs/config.html) - Full configuration options and examples +- [Configuration Guide](https://docs.axolotl.ai/docs/config-reference.html) - Full configuration options and examples +- [Dataset Loading](https://docs.axolotl.ai/docs/dataset_loading.html) - Loading datasets from various sources - [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them - [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) - [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) @@ -112,41 +145,24 @@ That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/ge Contributions are welcome! Please see our [Contributing Guide](https://github.com/axolotl-ai-cloud/axolotl/blob/main/.github/CONTRIBUTING.md) for details. -## Supported Models - -| | fp16/fp32 | lora | qlora | gptq | gptq w/flash attn | flash attn | xformers attn | -|-------------|:----------|:-----|-------|------|-------------------|------------|--------------| -| llama | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| Mistral | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| Mixtral-MoE | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ | -| Mixtral8X22 | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ | -| Pythia | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ | -| cerebras | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ | -| btlm | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ | -| mpt | ✅ | ❌ | ❓ | ❌ | ❌ | ❌ | ❓ | -| falcon | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ | -| gpt-j | ✅ | ✅ | ✅ | ❌ | ❌ | ❓ | ❓ | -| XGen | ✅ | ❓ | ✅ | ❓ | ❓ | ❓ | ✅ | -| phi | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ | -| RWKV | ✅ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ | -| Qwen | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ | -| Gemma | ✅ | ✅ | ✅ | ❓ | ❓ | ✅ | ❓ | -| Jamba | ✅ | ✅ | ✅ | ❓ | ❓ | ✅ | ❓ | - -✅: supported -❌: not supported -❓: untested - ## ❤️ Sponsors -Thank you to our sponsors who help make Axolotl possible: - -- [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl) - Modal lets you run -jobs in the cloud, by just writing a few lines of Python. Customers use Modal to deploy Gen AI models at large scale, -fine-tune large language models, run protein folding simulations, and much more. - Interested in sponsoring? Contact us at [wing@axolotl.ai](mailto:wing@axolotl.ai) +## 📝 Citing Axolotl + +If you use Axolotl in your research or projects, please cite it as follows: + +```bibtex +@software{axolotl, + title = {Axolotl: Post-Training for AI Models}, + author = {{Axolotl maintainers and contributors}}, + url = {https://github.com/axolotl-ai-cloud/axolotl}, + license = {Apache-2.0}, + year = {2023} +} +``` + ## 📜 License This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details. 
diff --git a/_quarto.yml b/_quarto.yml index df6992d92..934d393cb 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -1,5 +1,6 @@ project: type: website + pre-render: docs/scripts/generate_config_docs.py quartodoc: dir: docs/api @@ -17,7 +18,9 @@ quartodoc: - convert - prompt_tokenizers - logging_config - - core.trainer_builder + - core.builders.base + - core.builders.causal + - core.builders.rl - core.training_args - core.chat.messages - core.chat.format.chatml @@ -32,24 +35,30 @@ quartodoc: - cli.train - cli.evaluate - cli.args + - cli.art - cli.checks - cli.config + - cli.delinearize_llama4 - cli.inference - cli.merge_lora - cli.merge_sharded_fsdp_weights - cli.preprocess - - cli.sweeps - - cli.utils + - cli.quantize - cli.vllm_serve - cli.cloud.base - cli.cloud.modal_ + - cli.utils + - cli.utils.args + - cli.utils.fetch + - cli.utils.load + - cli.utils.sweeps + - cli.utils.train - title: Trainers desc: Training implementations contents: - core.trainers.base - core.trainers.trl - core.trainers.mamba - - core.trainers.relora - core.trainers.dpo.trainer - core.trainers.grpo.trainer - core.trainers.grpo.sampler @@ -126,7 +135,6 @@ quartodoc: - monkeypatch.trainer_fsdp_optim - monkeypatch.transformers_fa_utils - monkeypatch.unsloth_ - - monkeypatch.attention.mllama - monkeypatch.data.batch_dataset_fetcher - monkeypatch.mixtral - monkeypatch.gradient_checkpointing.offload_cpu @@ -147,6 +155,7 @@ quartodoc: - utils.optimizers.adopt - utils.data.pretraining - utils.data.sft + - utils.quantization - title: Schemas desc: Pydantic data models for Axolotl config contents: @@ -196,12 +205,14 @@ quartodoc: - utils.callbacks.lisa - utils.callbacks.mlflow_ - utils.callbacks.comet_ - + - utils.callbacks.qat website: title: "Axolotl" description: "We make fine-tuning accessible, scalable, and fun" favicon: favicon.jpg + google-analytics: "G-9KYCVJBNMQ" + navbar: logo: image/axolotl_logo_digital_white.svg title: false @@ -230,7 +241,7 @@ website: - docs/installation.qmd - docs/inference.qmd - docs/cli.qmd - - docs/config.qmd + - docs/config-reference.qmd - text: "API Reference" href: docs/api @@ -254,12 +265,16 @@ website: - docs/lr_groups.qmd - docs/lora_optims.qmd - docs/dataset_loading.qmd + - docs/qat.qmd + - docs/quantize.qmd - section: "Core Concepts" contents: - docs/batch_vs_grad.qmd - docs/dataset_preprocessing.qmd - docs/multipack.qmd + - docs/mixed_precision.qmd + - docs/optimizers.qmd - section: "Advanced Features" contents: @@ -268,6 +283,8 @@ website: - docs/torchao.qmd - docs/custom_integrations.qmd - docs/sequence_parallelism.qmd + - docs/gradient_checkpointing.qmd + - docs/nd_parallelism.qmd - section: "Troubleshooting" contents: diff --git a/cicd/Dockerfile-uv.jinja b/cicd/Dockerfile-uv.jinja new file mode 100644 index 000000000..860386187 --- /dev/null +++ b/cicd/Dockerfile-uv.jinja @@ -0,0 +1,52 @@ +FROM axolotlai/axolotl-base-uv:{{ BASE_TAG }} + +ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX" +ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}" +ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}" +ENV CUDA="{{ CUDA }}" +ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}" +ENV GITHUB_REF="{{ GITHUB_REF }}" +ENV GITHUB_SHA="{{ GITHUB_SHA }}" +ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}" +ENV HF_HOME="{{ HF_HOME }}" + +RUN apt-get update && \ + apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm + +WORKDIR /workspace + +RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git + 
+WORKDIR /workspace/axolotl + +RUN git fetch origin +$GITHUB_REF && \ + git checkout FETCH_HEAD + +# If AXOLOTL_EXTRAS is set, append it in brackets +RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \ + sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \ + sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \ + sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \ + sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \ + sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \ + fi + +RUN uv pip install packaging==23.2 setuptools==75.8.0 +RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ + uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \ + else \ + uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \ + fi + +RUN python scripts/unsloth_install.py --uv | sh +RUN python scripts/cutcrossentropy_install.py --uv | sh + +# So we can test the Docker image +RUN uv pip install -r requirements-dev.txt -r requirements-tests.txt + +# fix so that git fetch/pull from remote works +RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \ + git config --get remote.origin.fetch + +# helper for huggingface-login cli +RUN git config --global credential.helper store diff --git a/cicd/Dockerfile.jinja b/cicd/Dockerfile.jinja index 6988e092b..94c9a67e3 100644 --- a/cicd/Dockerfile.jinja +++ b/cicd/Dockerfile.jinja @@ -9,9 +9,10 @@ ENV GITHUB_REF="{{ GITHUB_REF }}" ENV GITHUB_SHA="{{ GITHUB_SHA }}" ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}" ENV HF_HOME="{{ HF_HOME }}" +ENV AXOLOTL_DATASET_PROCESSES="8" RUN apt-get update && \ - apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev + apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm WORKDIR /workspace diff --git a/cicd/e2e_tests.py b/cicd/e2e_tests.py index ce9c605c7..5d2b6fed1 100644 --- a/cicd/e2e_tests.py +++ b/cicd/e2e_tests.py @@ -6,7 +6,7 @@ from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd @app.function( image=cicd_image, gpu=GPU_CONFIG, - timeout=90 * 60, # 90 min + timeout=120 * 60, # 90 min cpu=8.0, memory=131072, volumes=VOLUME_CONFIG, diff --git a/cicd/multigpu.py b/cicd/multigpu.py index 7de4ae0a7..2c067f143 100644 --- a/cicd/multigpu.py +++ b/cicd/multigpu.py @@ -24,9 +24,9 @@ df_template = template_env.get_template("Dockerfile.jinja") df_args = { "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""), "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""), - "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"), - "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"), - "CUDA": os.environ.get("CUDA", "121"), + "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"), + "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"), + "CUDA": os.environ.get("CUDA", "126"), "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"), "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""), "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""), @@ -55,7 +55,7 @@ VOLUME_CONFIG = { } N_GPUS = int(os.environ.get("N_GPUS", 2)) -GPU_CONFIG = 
modal.gpu.H100(count=N_GPUS) +GPU_CONFIG = f"H100:{N_GPUS}" def run_cmd(cmd: str, run_folder: str): @@ -69,7 +69,7 @@ def run_cmd(cmd: str, run_folder: str): @app.function( image=cicd_image, gpu=GPU_CONFIG, - timeout=90 * 60, + timeout=120 * 60, cpu=16.0, memory=131072 * N_GPUS, volumes=VOLUME_CONFIG, diff --git a/cicd/multigpu.sh b/cicd/multigpu.sh index 1f74cd67d..3ec4456b9 100755 --- a/cicd/multigpu.sh +++ b/cicd/multigpu.sh @@ -2,7 +2,7 @@ set -e # Only run two tests at a time to avoid OOM on GPU (with coverage collection) -pytest -v -n2 \ +pytest -v --durations=10 -n2 \ --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \ --ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \ /workspace/axolotl/tests/e2e/multigpu/ \ @@ -19,5 +19,7 @@ pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \ --cov-append \ --cov-report=xml:multigpu-coverage.xml -# Upload coverage to Codecov -codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true +# Upload coverage to Codecov if CODECOV_TOKEN is available +if [ -n "$CODECOV_TOKEN" ]; then + codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true +fi diff --git a/cicd/single_gpu.py b/cicd/single_gpu.py index d46d970cf..eb34e1748 100644 --- a/cicd/single_gpu.py +++ b/cicd/single_gpu.py @@ -8,8 +8,9 @@ import tempfile import jinja2 import modal +import modal.experimental from jinja2 import select_autoescape -from modal import App, Image +from modal import App cicd_path = pathlib.Path(__file__).parent.resolve() @@ -17,19 +18,22 @@ template_loader = jinja2.FileSystemLoader(searchpath=cicd_path) template_env = jinja2.Environment( loader=template_loader, autoescape=select_autoescape() ) -df_template = template_env.get_template("Dockerfile.jinja") +dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja") +df_template = template_env.get_template(dockerfile) df_args = { "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""), "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""), - "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"), - "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"), - "CUDA": os.environ.get("CUDA", "121"), + "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"), + "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"), + "CUDA": os.environ.get("CUDA", "126"), "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"), "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""), "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""), "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""), "HF_HOME": "/workspace/data/huggingface-cache/hub", + "PYTHONUNBUFFERED": os.environ.get("PYTHONUNBUFFERED", "1"), + "DEEPSPEED_LOG_LEVEL": os.environ.get("DEEPSPEED_LOG_LEVEL", "WARNING"), } dockerfile_contents = df_template.render(**df_args) @@ -38,11 +42,11 @@ temp_dir = tempfile.mkdtemp() with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f: f.write(dockerfile_contents) -cicd_image = Image.from_dockerfile( +cicd_image = modal.experimental.raw_dockerfile_image( pathlib.Path(temp_dir) / "Dockerfile", - context_mount=None, + # context_mount=None, force_build=True, - gpu="A10G", + # gpu="A10G", ).env(df_args) app = App("Axolotl CI/CD", secrets=[]) @@ -55,12 +59,15 @@ VOLUME_CONFIG = { } N_GPUS = int(os.environ.get("N_GPUS", 1)) -GPU_CONFIG = modal.gpu.L40S(count=N_GPUS) +GPU_CONFIG = f"L40S:{N_GPUS}" def run_cmd(cmd: 
str, run_folder: str): import subprocess # nosec + sp_env = os.environ.copy() + sp_env["AXOLOTL_DATASET_PROCESSES"] = "8" + # Propagate errors from subprocess. - if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec + if exit_code := subprocess.call(cmd.split(), cwd=run_folder, env=sp_env): # nosec exit(exit_code) # pylint: disable=consider-using-sys-exit diff --git a/codecov.yml b/codecov.yml index 2741b1758..28921f9be 100644 --- a/codecov.yml +++ b/codecov.yml @@ -22,6 +22,7 @@ coverage: only_pulls: true flags: null paths: null + informational: true patch: default: # basic diff --git a/deepspeed_configs/zero2_torch_compile.json b/deepspeed_configs/zero2_torch_compile.json new file mode 100644 index 000000000..c3bcf98cf --- /dev/null +++ b/deepspeed_configs/zero2_torch_compile.json @@ -0,0 +1,31 @@ +{ + "compile": { + "disable": false, + "backend": "inductor" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "cpu" + }, + "contiguous_gradients": true, + "overlap_comm": true + }, + "bf16": { + "enabled": "auto" + }, + "fp16": { + "enabled": "auto", + "auto_cast": false, + "loss_scale": 0, + "initial_scale_power": 32, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} diff --git a/deepspeed_configs/zero3.json b/deepspeed_configs/zero3.json index 90ec3677e..f8c9cdfe0 100644 --- a/deepspeed_configs/zero3.json +++ b/deepspeed_configs/zero3.json @@ -7,9 +7,9 @@ "reduce_bucket_size": "auto", "stage3_prefetch_bucket_size": "auto", "stage3_param_persistence_threshold": "auto", - "stage3_max_live_parameters": 0, - "stage3_max_reuse_distance": 0, - "stage3_gather_16bit_weights_on_model_save": true + "max_live_parameters": 0, + "max_reuse_distance": 0, + "gather_16bit_weights_on_model_save": true }, "bf16": { "enabled": "auto" diff --git a/deepspeed_configs/zero3_bf16.json b/deepspeed_configs/zero3_bf16.json index 49fb75755..a69e13cf7 100644 --- a/deepspeed_configs/zero3_bf16.json +++ b/deepspeed_configs/zero3_bf16.json @@ -7,9 +7,9 @@ "reduce_bucket_size": "auto", "stage3_prefetch_bucket_size": "auto", "stage3_param_persistence_threshold": "auto", - "stage3_max_live_parameters": 0, - "stage3_max_reuse_distance": 0, - "stage3_gather_16bit_weights_on_model_save": true + "max_live_parameters": 0, + "max_reuse_distance": 0, + "gather_16bit_weights_on_model_save": true }, "bf16": { "enabled": true diff --git a/deepspeed_configs/zero3_bf16_cpuoffload_all.json b/deepspeed_configs/zero3_bf16_cpuoffload_all.json index 3ccc66db4..5112c570b 100644 --- a/deepspeed_configs/zero3_bf16_cpuoffload_all.json +++ b/deepspeed_configs/zero3_bf16_cpuoffload_all.json @@ -17,9 +17,9 @@ "reduce_bucket_size": "auto", "stage3_prefetch_bucket_size": "auto", "stage3_param_persistence_threshold": "auto", - "stage3_max_live_parameters": 0, - "stage3_max_reuse_distance": 0, - "stage3_gather_16bit_weights_on_model_save": true + "max_live_parameters": 0, + "max_reuse_distance": 0, + "gather_16bit_weights_on_model_save": true }, "bf16": { "enabled": true diff --git a/deepspeed_configs/zero3_bf16_cpuoffload_params.json b/deepspeed_configs/zero3_bf16_cpuoffload_params.json index fe21d35f8..a2ac82341 100644 --- a/deepspeed_configs/zero3_bf16_cpuoffload_params.json +++ b/deepspeed_configs/zero3_bf16_cpuoffload_params.json @@ -13,9 +13,9 @@ "reduce_bucket_size": "auto", 
"stage3_prefetch_bucket_size": "auto", "stage3_param_persistence_threshold": "auto", - "stage3_max_live_parameters": 0, - "stage3_max_reuse_distance": 0, - "stage3_gather_16bit_weights_on_model_save": true + "max_live_parameters": 0, + "max_reuse_distance": 0, + "gather_16bit_weights_on_model_save": true }, "bf16": { "enabled": true diff --git a/docker/Dockerfile b/docker/Dockerfile index e23a729d4..116361dcd 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -10,7 +10,9 @@ ARG PYTORCH_VERSION="2.1.2" ENV PYTORCH_VERSION=$PYTORCH_VERSION RUN apt-get update && \ - apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs + apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs && \ + rm -rf /var/cache/apt/archives && \ + rm -rf /var/lib/apt/lists/* WORKDIR /workspace @@ -23,17 +25,17 @@ RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \ else \ pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \ - fi + fi && \ + python scripts/unsloth_install.py | sh && \ + python scripts/cutcrossentropy_install.py | sh && \ + pip install pytest && \ + pip cache purge -RUN python scripts/unsloth_install.py | sh -RUN python scripts/cutcrossentropy_install.py | sh - -# So we can test the Docker image -RUN pip install pytest - -# fix so that git fetch/pull from remote works +# fix so that git fetch/pull from remote works with shallow clone RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \ - git config --get remote.origin.fetch + git config --get remote.origin.fetch && \ + git config --global credential.helper store -# helper for huggingface-login cli -RUN git config --global credential.helper store +COPY .axolotl-complete.bash /root/.axolotl-complete.bash +RUN chmod +x /root/.axolotl-complete.bash && \ + echo 'source /root/.axolotl-complete.bash' >> ~/.bashrc diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base index cf1af9682..0434a583f 100644 --- a/docker/Dockerfile-base +++ b/docker/Dockerfile-base @@ -16,12 +16,19 @@ ENV PYTHON_VERSION=$PYTHON_VERSION ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST RUN apt-get update \ - && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \ + && apt-get install -y --no-install-recommends \ + wget git build-essential ninja-build git-lfs libaio-dev pkg-config \ + ibverbs-providers ibverbs-utils infiniband-diags \ + librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm \ + && rm -rf /var/cache/apt/archives \ + && rm -rf /var/lib/apt/lists/* \ && wget \ https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ && mkdir /root/.conda \ && bash Miniconda3-latest-Linux-x86_64.sh -b \ && rm -f Miniconda3-latest-Linux-x86_64.sh \ + && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \ + && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \ && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}" ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}" @@ -31,13 +38,15 @@ WORKDIR /workspace RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \ python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && 
\ python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \ - python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" + python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \ + python3 -m pip cache purge RUN git lfs install --skip-repo && \ pip3 install awscli && \ # The base image ships with `pydantic==1.8.2` which is not working - pip3 install -U --no-cache-dir pydantic==1.10.10 + pip3 install -U --no-cache-dir pydantic==1.10.10 && \ + pip3 cache purge -RUN if [ "$PYTORCH_VERSION" = "2.7.0" ] ; then \ - pip3 install flash-attn==2.7.4.post1; \ +RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \ + FLASH_ATTENTION_FORCE_BUILD="TRUE" pip3 install --no-build-isolation flash-attn==2.8.0.post2; \ fi diff --git a/docker/Dockerfile-base-next b/docker/Dockerfile-base-next index a968b5913..85bac2516 100644 --- a/docker/Dockerfile-base-next +++ b/docker/Dockerfile-base-next @@ -29,7 +29,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}" WORKDIR /workspace RUN python3 -m pip install --upgrade pip && pip3 install packaging && \ - python3 -m pip install --no-cache-dir -U torch==2.7.0 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \ + python3 -m pip install --no-cache-dir -U torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \ python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \ python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" diff --git a/docker/Dockerfile-base-nightly b/docker/Dockerfile-base-nightly index 85805ea41..cc74e6bb9 100644 --- a/docker/Dockerfile-base-nightly +++ b/docker/Dockerfile-base-nightly @@ -22,18 +22,22 @@ RUN apt-get update \ && mkdir /root/.conda \ && bash Miniconda3-latest-Linux-x86_64.sh -b \ && rm -f Miniconda3-latest-Linux-x86_64.sh \ + && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \ + && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \ && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}" ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}" WORKDIR /workspace -RUN python3 -m pip install --upgrade pip && pip3 install packaging && \ +RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \ python3 -m pip install --no-cache-dir -U torch --extra-index-url https://download.pytorch.org/whl/nightly/cu$CUDA && \ python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \ - python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" + python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \ + python3 -m pip cache purge RUN git lfs install --skip-repo && \ pip3 install awscli && \ # The base image ships with `pydantic==1.8.2` which is not working - pip3 install -U --no-cache-dir pydantic==1.10.10 + pip3 install -U --no-cache-dir pydantic==1.10.10 && \ + pip3 cache purge diff --git a/docker/Dockerfile-cloud b/docker/Dockerfile-cloud index c84ea1dca..6ab090826 100644 --- a/docker/Dockerfile-cloud +++ b/docker/Dockerfile-cloud @@ -14,7 +14,10 @@ COPY scripts/motd /etc/motd RUN pip install jupyterlab notebook ipywidgets && \ jupyter lab 
clean -RUN apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \ +RUN apt update && \ + apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \ + rm -rf /var/cache/apt/archives && \ + rm -rf /var/lib/apt/lists/* && \ mkdir -p ~/.ssh && \ chmod 700 ~/.ssh && \ printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \ diff --git a/docker/Dockerfile-cloud-no-tmux b/docker/Dockerfile-cloud-no-tmux index 165063105..594559cfd 100644 --- a/docker/Dockerfile-cloud-no-tmux +++ b/docker/Dockerfile-cloud-no-tmux @@ -9,13 +9,15 @@ ENV HF_HUB_ENABLE_HF_TRANSFER="1" EXPOSE 8888 EXPOSE 22 -COPY scripts/cloud-entrypoint-term.sh /root/cloud-entrypoint.sh +COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh COPY scripts/motd /etc/motd RUN pip install jupyterlab notebook ipywidgets && \ jupyter lab clean -RUN apt install --yes --no-install-recommends openssh-server tmux sudo && \ - pip3 install -U --no-cache-dir grpcio ray[default]==2.9.3 && \ +RUN apt update && \ + apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm && \ + rm -rf /var/cache/apt/archives && \ + rm -rf /var/lib/apt/lists/* && \ mkdir -p ~/.ssh && \ chmod 700 ~/.ssh && \ printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \ diff --git a/docker/Dockerfile-uv-base b/docker/Dockerfile-uv-base new file mode 100644 index 000000000..4b08e55f8 --- /dev/null +++ b/docker/Dockerfile-uv-base @@ -0,0 +1,36 @@ +ARG CUDA_VERSION="12.6.3" +ARG CUDNN_VERSION="" +ARG UBUNTU_VERSION="22.04" +ARG MAX_JOBS=4 + +FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder + +ARG PYTHON_VERSION="3.11" +ARG PYTORCH_VERSION="2.6.0" +ARG CUDA="126" +ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX" + +ENV PYTHON_VERSION=$PYTHON_VERSION +ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST +ENV UV_TORCH_BACKEND="cu${CUDA}" + +RUN apt-get update \ + && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config curl && rm -rf /var/lib/apt/lists/* \ + && git lfs install --skip-repo \ + && curl -LsSf https://astral.sh/uv/install.sh | sh + +ENV PATH="/root/.local/bin:${PATH}" + +RUN uv python install ${PYTHON_VERSION} + +WORKDIR /workspace + +RUN uv venv --no-project --relocatable axolotl-venv + +ENV PATH="/workspace/axolotl-venv/bin:${PATH}" + +RUN uv pip install packaging setuptools wheel psutil \ + && uv pip install torch==${PYTORCH_VERSION} \ + && uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \ + && uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \ + && uv pip install awscli pydantic diff --git a/docs/.gitignore b/docs/.gitignore index 6c3cb2070..89407326f 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -2,3 +2,4 @@ _site/ /api/*.qmd /api/*.html +config-reference.qmd diff --git a/docs/cli.qmd b/docs/cli.qmd index 1003a210c..d9f26dbf8 100644 --- a/docs/cli.qmd +++ b/docs/cli.qmd @@ -23,6 +23,20 @@ axolotl [config.yml] [options] The config file can be local or a URL to a raw YAML file. 
+### Launcher Arguments + +For commands that support multi-GPU (`train`, `evaluate`, ...), you can pass launcher-specific arguments using the `--` separator: + +```bash +# Pass torchrun arguments +axolotl train config.yml --launcher torchrun -- --nproc_per_node=2 --nnodes=1 + +# Pass accelerate arguments +axolotl train config.yml --launcher accelerate -- --config_file=accelerate_config.yml --num_processes=4 +``` + +Arguments after `--` are passed directly to the launcher (torchrun, accelerate launch, etc.). + ## Command Reference ### fetch @@ -80,7 +94,11 @@ axolotl train config.yml \ --num-epochs 3 # Training without accelerate -axolotl train config.yml --no-accelerate +axolotl train config.yml --launcher python + +# Pass launcher-specific arguments using -- separator +axolotl train config.yml --launcher torchrun -- --nproc_per_node=2 --nnodes=1 +axolotl train config.yml --launcher accelerate -- --config_file=accelerate_config.yml # Resume training from checkpoint axolotl train config.yml --resume-from-checkpoint path/to/checkpoint @@ -175,6 +193,9 @@ Evaluates a model's performance (loss etc) on the train and eval datasets. ```bash # Basic evaluation axolotl evaluate config.yml + +# Evaluation with launcher arguments +axolotl evaluate config.yml --launcher torchrun -- --nproc_per_node=2 ``` ### lm-eval @@ -209,6 +230,16 @@ axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir This would be necessary to use with other frameworks. If you have an adapter, merge it with the non-quantized linearized model before delinearizing. +### quantize + +Quantizes a model using the quantization configuration specified in your YAML file. + +```bash +axolotl quantize config.yml +``` + +See [Quantization](./quantize.qmd) for more details. + ## Legacy CLI Usage @@ -277,9 +308,6 @@ axolotl preprocess config.yml --cloud cloud_config.yml # Train on cloud axolotl train config.yml --cloud cloud_config.yml -# Train without accelerate on cloud -axolotl train config.yml --cloud cloud_config.yml --no-accelerate - # Run lm-eval on cloud axolotl lm-eval config.yml --cloud cloud_config.yml ``` diff --git a/docs/custom_integrations.qmd b/docs/custom_integrations.qmd index 023f09732..8e1fdaa2e 100644 --- a/docs/custom_integrations.qmd +++ b/docs/custom_integrations.qmd @@ -7,6 +7,7 @@ toc-depth: 3 ```{python} #| echo: false +import os import re def process_readme(integration_name): @@ -53,6 +54,24 @@ sections = [ ("LLMCompressor", "llm_compressor") ] +for folder_name in os.listdir("../src/axolotl/integrations/"): + if folder_name in [path for name, path in sections]: + # skip if already in sections + continue + if os.path.exists(f"../src/axolotl/integrations/{folder_name}/README.md"): + # grab the first heading in README.md as the section name + with open(f"../src/axolotl/integrations/{folder_name}/README.md", "r") as f: + txt = f.read() + matches = re.search(r'^# (.*)\n?', txt, flags=re.MULTILINE) + if matches: + name = matches.group(1) + else: + continue + sections.append((name, folder_name)) + +# sort sections by name +sections = sorted(sections, key=lambda x: x[0]) + for section_name, folder_name in sections: print(print_section(section_name, folder_name)) ``` diff --git a/docs/dataset-formats/conversation.qmd b/docs/dataset-formats/conversation.qmd index 87c2941e6..d53c68598 100644 --- a/docs/dataset-formats/conversation.qmd +++ b/docs/dataset-formats/conversation.qmd @@ -9,10 +9,10 @@ order: 3 Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. 
Support using tokenizer's template, a supported template, or custom jinja2. ```{.json filename="data.jsonl"} -{"conversations": [{"role": "...", "content": "..."}]} +{"messages": [{"role": "...", "content": "..."}, {"role": "...", "content": "..."}, ...]} ``` -See [configs](../config.qmd) for full configs and supported templates. +See [configs](../config-reference.qmd) for full configs and supported templates. ### Migrating from sharegpt @@ -52,7 +52,9 @@ We recommend checking the below examples for other usecases. ### Examples -1. (Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message. +#### Training on last message + +(Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message. ```yaml datasets: @@ -66,7 +68,9 @@ datasets: If you receive an error like "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null.", it means the tokenizer does not have a default `chat_template`. Follow the examples below instead to set a custom `chat_template`. ::: -2. Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages. +#### Overriding default chat template + +Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages. ```yaml chat_template: gemma # this overwrites the tokenizer's chat_template @@ -76,7 +80,13 @@ datasets: roles_to_train: ["assistant"] # default value ``` -3. Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages. +::: {.callout-note} +If you want to use built-in chat_template, use `chat_template: tokenizer_default` (this is set by default). +::: + +#### Using default chat template with fallback + +Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages. ```yaml chat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer's chat_template @@ -85,7 +95,9 @@ datasets: type: chat_template ``` -4. Using a custom jinja template on OpenAI messages format, training on all assistant messages. +#### Custom Jinja template + +Using a custom jinja template on OpenAI messages format, training on all assistant messages. ```yaml # chat_template: jinja # `jinja` will be implied if the `chat_template_jinja` is set and this field is empty @@ -100,7 +112,9 @@ datasets: Please make sure that your `tokenizer.eos_token` is same as EOS (End-of-Sequence) token in template. Otherwise, set `eos_token` under `special_tokens: `. ::: -5. If you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: ` which defaults to turn. +#### Using template with different token for EOT and EOS + +- If you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: ` which defaults to turn. 
```yaml eot_tokens: @@ -116,16 +130,16 @@ datasets: ``` ::: {.callout-tip} -See [config documentation](../config.qmd) for detailed explanations of "turn", "last", and "all" options for training on tokens. +See [config documentation](../config-reference.qmd) for detailed explanations of "turn", "last", and "all" options for training on tokens. ::: ::: {.callout-note} Using `eot_tokens` requires each token that exists in `chat_template` to be a single token in the tokenizer. Otherwise, the tokenizer will split the token and cause unexpected behavior. -You can add those tokens as new tokens under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `. See [config](../config.qmd) for more details. +You can add those tokens as new tokens under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `. See [config](../config-reference.qmd) for more details. ::: -6. Continuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set `train_on_eos: last`. +- Continuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set `train_on_eos: last`. ```yaml eot_tokens: @@ -145,7 +159,76 @@ If EOS token only appears at the end of a prompt, `train_on_eos: last` is equiva ::: -7. (Advanced) Using fine-grained control over tokens and turns to train in a conversation +#### Using tool use + +Instead of passing `tools` via the system prompt, an alternative method would be to have the `tools` in a separate column and loaded via `chat_template` to let the template dynamically build it. + +```json +{ + "tools": [ + { + "type": "...", + "function": { + "name": "...", + "description": "...", + "parameters": { + "type": "...", + "properties": { + // ... + }, + "required": ["..."], + }, + }, + }, + ], + "messages": [ + // ... + { + "role": "assistant", // call the function via assistant + "tool_calls": [ + { + "id": "...", // required only for mistral + "type": "function", + "function": { + "name": "...", + "arguments": { + "...": "...", + } + } + } + ] + }, + { + "role": "tool", + "tool_call_id": "...", // required only for mistral + "name": "...", + "content": "..." + }, + ], +} +``` + +::: {.callout-note} +Tools need to follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step). +::: + +Example config for Llama4: +```yaml +chat_template: llama4 +datasets: + - path: Nanobit/text-tools-2k-test + type: chat_template + # field_tools: tools # default is `tools` +``` + +::: {.callout-tip} +Look into the `chat_template` you are using to see if it supports `tools` and what the expected role is for the tool answer. In the example above, the tool answer is expected to be in the `tool` or `ipython` role for `llama4` template. +::: + + +#### Using fine-grained control over token masking + +(Advanced) Using fine-grained control over tokens and turns to train in a conversation For a data sample that looks like: @@ -196,7 +279,9 @@ datasets: It is not necessary to set both `message_field_training` and `message_field_training_detail` at once. ::: -8. (For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template. +#### Reasoning split + +(For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template. 
```yaml datasets: diff --git a/docs/dataset-formats/index.qmd b/docs/dataset-formats/index.qmd index 9898bbc9b..a0113db07 100644 --- a/docs/dataset-formats/index.qmd +++ b/docs/dataset-formats/index.qmd @@ -36,10 +36,6 @@ It is typically recommended to save your dataset as `.jsonl` due to its flexibil Axolotl supports loading from a Hugging Face hub repo or from local files. -::: {.callout-important} -For pre-training only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts. -::: - ### Pre-training from Hugging Face hub datasets As an example, to train using a Hugging Face dataset `hf_org/name`, you can pass the following config: @@ -77,18 +73,21 @@ datasets: type: completion ``` -From local files (either example works): +From local files: ```yaml datasets: - path: A.jsonl type: completion - - path: json - data_files: ["A.jsonl", "B.jsonl", "C.jsonl"] + - path: B.jsonl type: completion ``` +::: {.callout-important} +For `completion` only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts. If you are interested in having this for `pretraining_dataset` too, please let us know or help make a PR! +::: + ### Pre-training dataset configuration tips #### Setting max_steps diff --git a/docs/dataset-formats/inst_tune.qmd b/docs/dataset-formats/inst_tune.qmd index d89c6adaf..f5bd7ab8f 100644 --- a/docs/dataset-formats/inst_tune.qmd +++ b/docs/dataset-formats/inst_tune.qmd @@ -186,4 +186,4 @@ datasets: no_input_format: "[INST] {instruction} [/INST]" ``` -See full config options under [here](../config.qmd). +See full config options under [here](../config-reference.qmd). diff --git a/docs/dataset_loading.qmd b/docs/dataset_loading.qmd index 09c8b0098..bcffe7f0f 100644 --- a/docs/dataset_loading.qmd +++ b/docs/dataset_loading.qmd @@ -36,7 +36,7 @@ This matches the API of [`datasets.load_dataset`](https://github.com/huggingface For HuggingFace's guide to load different dataset types, see [here](https://huggingface.co/docs/datasets/loading). -For full details on the config, see [config.qmd](config.qmd). +For full details on the config, see [config-reference.qmd](config-reference.qmd). ::: {.callout-note} @@ -54,7 +54,7 @@ datasets: #### Files -Usually, to load a JSON file, you would do something like this: +To load a JSON file, you would do something like this: ```python from datasets import load_dataset @@ -66,20 +66,12 @@ Which translates to the following config: ```yaml datasets: - - path: json - data_files: /path/to/your/file.jsonl -``` - -However, to make things easier, we have added a few shortcuts for loading local dataset files. - -You can just point the `path` to the file or directory along with the `ds_type` to load the dataset. The below example shows for a JSON file: - -```yaml -datasets: - - path: /path/to/your/file.jsonl + - path: data.json ds_type: json ``` +In the example above, it can be seen that we can just point the `path` to the file or directory along with the `ds_type` to load the dataset. + This works for CSV, JSON, Parquet, and Arrow files. ::: {.callout-tip} diff --git a/docs/docker.qmd b/docs/docker.qmd index d665eaf5b..da6184394 100644 --- a/docs/docker.qmd +++ b/docs/docker.qmd @@ -9,7 +9,7 @@ format: This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai). ::: {.callout-important} -For Blackwell GPUs, please use the tags with Pytorch 2.7.0 and CUDA 12.8. 
+For Blackwell GPUs, please use the tags with PyTorch 2.7.1 and CUDA 12.8. ::: ## Base @@ -32,11 +32,11 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version} Tags examples: -- `main-base-py3.11-cu128-2.7.0` +- `main-base-py3.11-cu128-2.7.1` +- `main-base-py3.11-cu126-2.7.1` - `main-base-py3.11-cu126-2.7.0` +- `main-base-py3.11-cu126-2.6.0` - `main-base-py3.11-cu124-2.6.0` -- `main-base-py3.11-cu124-2.5.1` -- `main-base-py3.11-cu124-2.4.1` ## Main @@ -74,15 +74,15 @@ There may be some extra tags appended to the image, like `-vllm` which installs Tags examples: +- `main-py3.11-cu128-2.7.1` +- `main-py3.11-cu126-2.7.1` - `main-py3.11-cu126-2.7.0` +- `main-py3.11-cu126-2.6.0` - `main-py3.11-cu124-2.6.0` -- `main-py3.11-cu124-2.5.1` -- `main-py3.11-cu124-2.4.1` - `main-latest` - `main-20250303-py3.11-cu124-2.6.0` -- `main-20250303-py3.11-cu124-2.5.1` -- `main-20250303-py3.11-cu124-2.4.1` -- `0.7.1` +- `main-20250303-py3.11-cu126-2.6.0` +- `0.10.1` ## Cloud diff --git a/docs/faq.qmd b/docs/faq.qmd index f586099e7..08d439af7 100644 --- a/docs/faq.qmd +++ b/docs/faq.qmd @@ -9,11 +9,11 @@ description: Frequently asked questions > A: Usually an issue with the GPUs communicating with each other. See the [NCCL doc](nccl.qmd) -**Q: Exitcode -9** +**Q: exitcode: -9** > A: This usually happens when you run out of system RAM. -**Q: Exitcode -7 while using deepspeed** +**Q: exitcode: -7 while using deepspeed** > A: Try upgrading deepspeed w: `pip install -U deepspeed` @@ -51,6 +51,18 @@ description: Frequently asked questions > pad_token: "..." > ``` +**Q: `IterableDataset error` or `KeyError: 'input_ids'` when using `preprocess` CLI** + +> A: This is because you may be using `preprocess` CLI with `pretraining_dataset:` or `skip_prepare_dataset: true` respectively. Please use `axolotl train` CLI directly instead as these datasets are prepared on demand. + +**Q: vLLM is not working with Axolotl** + +> A: We currently recommend torch 2.6.0 for use with `vllm`. Please ensure you use the right version. For Docker, please use the `main-py3.11-cu124-2.6.0` tag. + +**Q: FA2 2.8.0 `undefined symbol` runtime error on CUDA 12.4** + +> A: There seems to be a wheel issue with FA2 2.8.0 on CUDA 12.4. Try CUDA 12.6 instead or downgrade to FA2 2.7.4. Please refer to the upstream issue: https://github.com/Dao-AILab/flash-attention/issues/1717. + ### Chat templates **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`** @@ -110,3 +122,21 @@ description: Frequently asked questions > A: If `eot_tokens: ` is not provided, the default behavior is the same as before. EOS tokens used to delimit turns are masked/unmasked depending on whether the turn is trainable. > Internally, `eot_tokens: tokenizer.eos_token` and `train_on_eot: train_on_eos` (which defaults to `turn`). This transition helps clarify the naming and behavior of EOT/EOS tokens. + +**Q: `Data processing error: CAS service error`** + +> A: Try disabling XET with `export HF_HUB_DISABLE_XET=1` + +**Q: `torch._inductor.exc.LoweringException: NoValidChoicesError: No choices to select, please consider adding ATEN into max_autotune_gemm_backends config (defined in torch/_inductor/config.py) to allow at least one choice. 
`** +> A: Depending on the version of torch, you may need to include this in your YAML: + +> ```yaml +> flex_attn_compile_kwargs: +> dynamic: false +> mode: max-autotune-no-cudagraphs +> ``` + +**Q: `ValueError("Backward pass should have cleared tracker of all tensors")`** + +> A: This may happen due to edge cases in using the modern OffloadActivations context manager for CUDA streams. If you encounter this error, you may have success using the naive implementation with `offload_activations: legacy` in your YAML. diff --git a/docs/fsdp_qlora.qmd b/docs/fsdp_qlora.qmd index 7af2a3eba..2f1b0358f 100644 --- a/docs/fsdp_qlora.qmd +++ b/docs/fsdp_qlora.qmd @@ -20,7 +20,7 @@ To enable `QLoRA` with `FSDP`, you need to perform the following steps: > See the [example config](#example-config) file in addition to reading these instructions. 1. Set `adapter: qlora` in your axolotl config file. -2. Enable FSDP in your axolotl config, as [described here](https://github.com/axolotl-ai-cloud/axolotl?tab=readme-ov-file#fsdp). +2. Enable FSDP in your axolotl config, as [described here](multi-gpu.qmd#sec-fsdp). 3. Use one of the supported model types: `llama`, `mistral` or `mixtral`. ## Example Config diff --git a/docs/getting-started.qmd b/docs/getting-started.qmd index 6f1b54348..de059c397 100644 --- a/docs/getting-started.qmd +++ b/docs/getting-started.qmd @@ -55,7 +55,7 @@ output_dir: ./outputs/lora-out - To perform QLoRA finetuning, replace with `load_in_4bit: true` and `adapter: qlora`. ::: -See our [Config options](config.qmd) for more details. +See our [config options](config-reference.qmd) for more details. ### Training {#sec-training} @@ -179,7 +179,7 @@ Now that you have the basics, you might want to: Check our other guides for details on these topics: -- [Configuration Guide](config.qmd) - Full configuration options +- [Configuration Guide](config-reference.qmd) - Full configuration options - [Dataset Loading](dataset_loading.qmd) - Loading datasets from various sources - [Dataset Formats](dataset-formats) - Working with different data formats - [Multi-GPU Training](multi-gpu.qmd) diff --git a/docs/gradient_checkpointing.qmd b/docs/gradient_checkpointing.qmd new file mode 100644 index 000000000..25a887999 --- /dev/null +++ b/docs/gradient_checkpointing.qmd @@ -0,0 +1,29 @@ +--- +title: Gradient Checkpointing and Activation Offloading +--- + +Gradient checkpointing and activation offloading reduce the memory footprint of training by recomputing or offloading activations instead of keeping them all in GPU memory, making larger models and longer contexts feasible. + +### Enabling Gradient Checkpointing + +```yaml +gradient_checkpointing: true +``` + +### Enabling Activation Offloading + +```yaml +gradient_checkpointing: true # required for activation offloading +activation_offloading: true +``` + +Activation offloading variants: + +The default `activation_offloading: true` offloads activations to CPU and uses CUDA streams to overlap communication with computation while offloading. + +The `activation_offloading: legacy` option naively offloads activations to CPU without additional optimizations. + +For resource-constrained environments with limited CPU memory, `activation_offloading: disk` offloads activations to disk instead of CPU RAM so that much larger context lengths can be trained with minimal memory.
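To make the offloading variants concrete, here is a minimal sketch of a long-context run that pairs gradient checkpointing with disk offloading. The sequence length is purely illustrative and not a requirement of the feature.

```yaml
# Minimal sketch: long-context training with activations offloaded to disk
gradient_checkpointing: true   # required for any activation offloading mode
activation_offloading: disk    # alternatives: true (CPU + CUDA streams) or legacy (naive CPU offload)

sequence_len: 32768            # illustrative value; disk offloading mainly pays off at long context
```

Disk offloading trades extra I/O latency for the smallest memory footprint of the three modes, so it is usually reserved for runs where CPU RAM is also a bottleneck.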
diff --git a/docs/installation.qmd b/docs/installation.qmd index b429992b6..763539278 100644 --- a/docs/installation.qmd +++ b/docs/installation.qmd @@ -14,8 +14,8 @@ This guide covers all the ways you can install and set up Axolotl for your envir ## Requirements {#sec-requirements} - NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU -- Python ≥3.10 -- PyTorch ≥2.4.1 +- Python ≥3.11 +- PyTorch ≥2.6.0 ## Installation Methods {#sec-installation-methods} @@ -41,6 +41,40 @@ installed) in order not to clobber it, and so that we set the correct version of dependencies that are specific to the PyTorch version or other installed co-dependencies. +### uv Installation {#sec-uv} + +uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments. + +Install uv if not already installed +```{.bash} +curl -LsSf https://astral.sh/uv/install.sh | sh +source $HOME/.local/bin/env +``` + +Choose your CUDA version to use with PyTorch; e.g. `cu124`, `cu126`, `cu128`, +then create the venv and activate +```{.bash} +export UV_TORCH_BACKEND=cu126 +uv venv --no-project --relocatable +source .venv/bin/activate +``` + +Install PyTorch +- PyTorch 2.6.0 recommended +```{.bash} +uv pip install packaging setuptools wheel +uv pip install torch==2.6.0 +uv pip install awscli pydantic +``` + +Install axolotl from PyPi +```{.bash} +uv pip install --no-build-isolation axolotl[deepspeed,flash-attn] + +# optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO +uv pip install --no-build-isolation axolotl[deepspeed,flash-attn,vllm] +``` + ### Edge/Development Build {#sec-edge-build} For the latest features between releases: @@ -90,10 +124,13 @@ For providers supporting Docker: - Use `axolotlai/axolotl-cloud:main-latest` - Available on: - - [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c) - - [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl) - - [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz) - - [Novita](https://novita.ai/gpus-console?templateId=311) + - [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz) + - [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=axolotl&utm_medium=partner&utm_campaign=template_launch_july2025&utm_content=docs_link) + - [PRIME Intellect](https://app.primeintellect.ai/dashboard/create-cluster?image=axolotl&location=Cheapest&security=Cheapest&show_spot=true) + - [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl) + - [Novita](https://novita.ai/gpus-console?templateId=311) + - [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl) + - [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c) ### Google Colab {#sec-colab} @@ -119,7 +156,7 @@ We recommend using WSL2 (Windows Subsystem for Linux) or Docker. ### Conda/Pip venv {#sec-conda} -1. Install Python ≥3.10 +1. Install Python ≥3.11 2. Install PyTorch: https://pytorch.org/get-started/locally/ 3. Install Axolotl: ```{.bash} diff --git a/docs/lora_optims.qmd b/docs/lora_optims.qmd index 56d56e9fc..7cdf53975 100644 --- a/docs/lora_optims.qmd +++ b/docs/lora_optims.qmd @@ -84,6 +84,10 @@ lora_qkv_kernel: true lora_o_kernel: true ``` +::: {.callout-note} +Currently, LoRA kernels are not supported for RLHF training, only SFT. 
+::: + ## Requirements - One or more NVIDIA or AMD GPUs (in order to use the Triton kernels) diff --git a/docs/mixed_precision.qmd b/docs/mixed_precision.qmd new file mode 100644 index 000000000..7b77cd4bb --- /dev/null +++ b/docs/mixed_precision.qmd @@ -0,0 +1,149 @@ +--- +title: "Mixed Precision Training" +format: + html: + toc: true + toc-depth: 3 + number-sections: true + code-tools: true +execute: + enabled: false +--- + +Mixed precision training uses lower precision data types to reduce memory usage and increase training speed while maintaining model quality. Axolotl supports several mixed precision formats: + +- **FP16** - Half precision 16-bit (Pascal generation+) +- **BF16** - Brain Float 16-bit (Ampere generation+) +- **FP8** - 8-bit floating point (Hopper generation+) + +## FP16 Mixed Precision {#sec-fp16} + +### Overview {#sec-fp16-overview} + +FP16 is the traditional half-precision format, supported on older GPUs but can be less numerically stable than BF16. + +### Configuration {#sec-fp16-config} + +```{.yaml} +fp16: true +``` + +### FP16 Considerations {#sec-fp16-considerations} + +- May require gradient scaling to prevent underflow +- Less numerically stable than BF16 +- Can cause training instability with some model architectures +- Consider using BF16 if your hardware supports it + +## BF16 Mixed Precision {#sec-bf16} + +### Overview {#sec-bf16-overview} + +BF16 (Brain Float 16) offers better numerical stability than FP16 and is the recommended mixed precision format for modern GPUs. It provides the same dynamic range as FP32 while using half the memory. + +### Configuration {#sec-bf16-config} + +```{.yaml} +# Automatic BF16 detection (recommended) +bf16: auto + +# Or explicitly enable +bf16: true + +# For evaluation with BF16 +bf16: full # Equivalent to bf16_full_eval in the HF trainer +``` + +## FP8 Mixed Precision {#sec-fp8} + +::: {.callout-note} +FP8 support is experimental and requires compatible hardware (H100, H200) and recent PyTorch versions with TorchAO. +::: + +### What is FP8? {#sec-fp8-overview} + +FP8 (8-bit floating point) can provide significant time savings compared to FP16/BF16 while maintaining training stability. Axolotl's implementation uses PyTorch's TorchAO library with "tensorwise" scaling strategy. + +### Requirements {#sec-fp8-software} + +- Hopper+ GPUs (H100/H200) +- PyTorch 2.7+ (+ compatible TorchAO version) +- CUDA 12.4+ + +### Configuration {#sec-fp8-config} + +Add to your YAML config: + +```{.yaml} +# Enable FP8 mixed precision +fp8: true + +# Optional: Enable FP8 for FSDP all-gather operations +fp8_enable_fsdp_float8_all_gather: true + +# Enable torch.compile (almost always necessary for FP8 speedups) +torch_compile: true +``` + +::: {.callout-important} +**torch.compile is critical for FP8 performance** + +FP8 training requires `torch_compile: true` to see meaningful speedups. Without compilation, FP8 may actually be slower and use more memory than FP16/BF16. 
+::: + +### Advanced FP8 Configs {#sec-fp8-advanced} + +For [FSDP](multi-gpu.qmd#sec-fsdp) (Fully Sharded Data Parallel) training: + +```{.yaml} +fp8: true +fp8_enable_fsdp_float8_all_gather: true + +torch_compile: true + +# FSDP configuration +fsdp_version: 2 +fsdp_config: + offload_params: false + cpu_ram_efficient_loading: true + auto_wrap_policy: TRANSFORMER_BASED_WRAP + transformer_layer_cls_to_wrap: LlamaDecoderLayer + state_dict_type: FULL_STATE_DICT + reshard_after_forward: true +``` + +## Best Practices {#sec-best-practices} + +### Choosing Precision Format {#sec-choosing-format} + +- **Start with automatic detection**: `bf16: auto` +- **For Hopper+ (H100/H200)**: Try FP8 + torch.compile for maximum speed +- **For Ampere (A100/RTX 30/40)**: Use BF16 +- **For older Pascal/Turing GPUs**: Use FP16 with caution +- **For very old or unsupported GPUs**: Use FP32 + +### Validation and Testing {#sec-validation} + +Always validate your mixed precision setup: + +- **Start with a small dataset** to verify stability +- **Monitor loss curves** for irregularities +- **Compare with FP32 baseline** when possible +- **Test evaluation metrics** match expectations + +### FP8 Particulars {#sec-fp8-details} + +- Use cases + - Single GPU training + - Multi GPU training with FSDP2 or Deepspeed +- Speedups + - Please refer to the [TorchAO FP8 training benchmarks](https://github.com/pytorch/ao/tree/main/torchao/float8#rowwise-scaling) for expected matmul speedups for different (M, K, N) settings + - Concrete number for LLaMA 3 8B training can be found [here](https://github.com/pytorch/ao/tree/main/torchao/float8#training-benchmarks) +- Known issues: + - FP8 + DDP + `torch.compile` (causes [error](https://gist.github.com/djsaunde/0c1664c32e44a64d31b5e01b4aafe5c4)) + - FP8 + FSDP2 + `torch.compile` + FSDP2 activation checkpointing tends to be _slower_ than the BF16 equivalent training + - Flash Attention 2 does not play nicely with `torch.compile` + +See `examples/llama-3/3b-fp8-fsdp2.yaml` for an optimized example config. Enabling FP8 mixed precision + FP8 all-gather training results in ~10% faster iterations per second vs. BF16 for a relatively small (3B param) model + +For more information on multi-GPU training, see our [Multi-GPU guide](multi-gpu.qmd). diff --git a/docs/multi-gpu.qmd b/docs/multi-gpu.qmd index fee7d17e5..71676bc84 100644 --- a/docs/multi-gpu.qmd +++ b/docs/multi-gpu.qmd @@ -23,8 +23,6 @@ Axolotl supports several methods for multi-GPU training: ## DeepSpeed {#sec-deepspeed} -DeepSpeed is the recommended approach for multi-GPU training due to its stability and performance. It provides various optimization levels through ZeRO stages. - ### Configuration {#sec-deepspeed-config} Add to your YAML config: @@ -32,7 +30,6 @@ Add to your YAML config: ```{.yaml} deepspeed: deepspeed_configs/zero1.json ``` - ### Usage {#sec-deepspeed-usage} ```{.bash} @@ -66,9 +63,75 @@ Start from Stage 1 -> Stage 2 -> Stage 3. ::: -## FSDP {#sec-fsdp} +::: {.callout-tip} -### Basic FSDP Configuration {#sec-fsdp-config} +Using ZeRO Stage 3 with Single-GPU training + +ZeRO Stage 3 can be used for training on a single GPU by manually setting the environment variables: +`WORLD_SIZE=1 LOCAL_RANK=0 MASTER_ADDR=0.0.0.0 MASTER_PORT=29500` + +::: + +## Fully Sharded Data Parallel (FSDP) {#sec-fsdp} + +::: {.callout-note} + +FSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl. 
+ +::: + +### Migrating from FSDP1 to FSDP2 {#sec-migrate-fsdp1-fsdp2} + +To migrate your config from FSDP1 to FSDP2, you must use the `fsdp_version` top-level config field to specify the FSDP version, and +also follow the config field mapping below to update field names. + +#### Config mapping + +FSDP1 | FSDP2 +-------- | -------- +fsdp_sharding_strategy | reshard_after_forward +fsdp_backward_prefetch_policy | **REMOVED** +fsdp_backward_prefetch | **REMOVED** +fsdp_forward_prefetch | **REMOVED** +fsdp_sync_module_states | **REMOVED** +fsdp_cpu_ram_efficient_loading | cpu_ram_efficient_loading +fsdp_state_dict_type | state_dict_type +fsdp_use_orig_params | **REMOVED** + +For more details, please see the migration guide in the [torchtitan repo](https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md). In Axolotl, +if you were using the following FSDP1 config: + +```{.yaml} +fsdp_version: 1 +fsdp_config: + fsdp_offload_params: false + fsdp_cpu_ram_efficient_loading: true + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer + fsdp_state_dict_type: FULL_STATE_DICT + fsdp_sharding_strategy: FULL_SHARD +``` + +You can migrate to the following FSDP2 config: + +```{.yaml} +fsdp_version: 2 +fsdp_config: + offload_params: false + cpu_ram_efficient_loading: true + auto_wrap_policy: TRANSFORMER_BASED_WRAP + transformer_layer_cls_to_wrap: Qwen3DecoderLayer + state_dict_type: FULL_STATE_DICT + reshard_after_forward: true +``` + +### FSDP1 (deprecated) {#sec-fsdp-config} + +::: {.callout-note} + +Using `fsdp` to configure FSDP is deprecated and will be removed in an upcoming release of Axolotl. Please use `fsdp_config` as above instead. + +::: ```{.yaml} fsdp: @@ -80,6 +143,7 @@ fsdp_config: fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer ``` + ## Sequence parallelism {#sec-sequence-parallelism} We support sequence parallelism (SP) via the diff --git a/docs/multi-node.qmd b/docs/multi-node.qmd index cec8ff45d..16196a2d7 100644 --- a/docs/multi-node.qmd +++ b/docs/multi-node.qmd @@ -40,13 +40,13 @@ use_cpu: false Configure your model to use FSDP in the Axolotl yaml. For example: ```yaml -fsdp: - - full_shard - - auto_wrap +fsdp_version: 2 fsdp_config: - fsdp_offload_params: true - fsdp_state_dict_type: FULL_STATE_DICT - fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer + offload_params: true + state_dict_type: FULL_STATE_DICT + auto_wrap_policy: TRANSFORMER_BASED_WRAP + transformer_layer_cls_to_wrap: LlamaDecoderLayer + reshard_after_forward: true ``` All you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine. @@ -69,11 +69,19 @@ export NCCL_BUFFSIZE=2097152 Run the following on each node: +### Option 1: New Axolotl CLI with launcher args (Recommended) + +```bash +axolotl train config.yaml --launcher torchrun -- --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:$head_node_port" +``` + +### Option 2: Direct torchrun (Legacy) + ```bash torchrun --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:$head_node_port" -m axolotl.cli.train config.yaml ``` -Please make sure to substitute the placeholder variables. 
+Please make sure to substitute the placeholder variables: - `num_nodes`: Number of nodes (containing GPUs) - `gpu_per_node`: Number of gpus per node @@ -81,8 +89,6 @@ Please make sure to substitute the placeholder variables. - `head_node_port`: Port of the head node (make sure other machines can connect to this. Default 29400) - `rdzv_id`: A unique job ID that is used by the job across nodes. -::: {.callout-note} -You need to call `axolotl.cli.train` instead of `axolotl train` as the latter calls accelerate under the hood -::: +The new CLI approach (Option 1) is recommended as it provides consistent argument handling and works seamlessly with other Axolotl CLI features. More info on the available configs can be found on the Pytorch docs [here](https://pytorch.org/docs/stable/elastic/run.html) diff --git a/docs/multimodal.qmd b/docs/multimodal.qmd index 3506db340..dbb365f73 100644 --- a/docs/multimodal.qmd +++ b/docs/multimodal.qmd @@ -14,6 +14,7 @@ format: - [Llava-1.5](#sec-llava-15) - [Mistral-Small-3.1](#sec-mistral-small-31) - [Gemma-3](#sec-gemma-3) +- [Gemma-3n](#sec-gemma-3n) - [Qwen2-VL](#sec-qwen2-vl) - [Qwen2.5-VL](#sec-qwen25-vl) @@ -43,7 +44,7 @@ datasets: # leave the vision model and vision tower frozen # load_in_8bit: true adapter: lora -lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' +lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' # (optional) if you want to resize images to a set size image_size: 512 @@ -110,6 +111,22 @@ base_model: google/gemma-3-4b-it chat_template: gemma3 ``` +### Gemma-3n {#sec-gemma-3n} + +::: {.callout-warning} +The model's initial loss and grad norm will be very high. We suspect this to be due to the Conv in the vision layers. +::: + +::: {.callout-tip} +Please make sure to install `timm` via `pip3 install timm==1.0.17` +::: + +```yaml +base_model: google/gemma-3n-E2B-it + +chat_template: gemma3n +``` + ### Qwen2-VL {#sec-qwen2-vl} ```yaml @@ -132,7 +149,9 @@ For multi-modal datasets, we adopt an extended `chat_template` format similar to - A message is a list of `role` and `content`. - `role` can be `system`, `user`, `assistant`, etc. -- `content` is a list of `type` and (`text` or `image` or `path` or `url` or `base64`). +- `content` is a list of `type` and (`text`, `image`, `path`, `url`, `base64`, or `audio`). + +### Image ::: {.callout-note} For backwards compatibility: @@ -141,15 +160,29 @@ For backwards compatibility: - If `content` is a string, it will be converted to a list with `type` as `text`. ::: -::: {.callout-tip} For image loading, you can use the following keys within `content` alongside `"type": "image"`: - `"path": "/path/to/image.jpg"` - `"url": "https://example.com/image.jpg"` - `"base64": "..."` - `"image": PIL.Image` + +### Audio + +For audio loading, you can use the following keys within `content` alongside `"type": "audio"`: + +- `"path": "/path/to/audio.mp3"` +- `"url": "https://example.com/audio.mp3"` +- `"audio": np.ndarray` + +::: {.callout-tip} + +You may need to install `librosa` via `pip3 install librosa==0.11.0`. + ::: +### Example + Here is an example of a multi-modal dataset: ```json [ @@ -178,3 +211,9 @@ Here is an example of a multi-modal dataset: } ] ``` + +## FAQ + +1. `PIL.UnidentifiedImageError: cannot identify image file ...` + +`PIL` could not retrieve the file at `url` using `requests`. Please check for typo. One alternative reason is that the request is blocked by the server. 
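As a rough sketch of how the options on this page fit together, the config below combines the Gemma-3 example with a chat-template dataset and optional image resizing. The dataset path is a placeholder, and the `type: chat_template` line is an assumption based on the extended chat_template format described above; adjust both for your setup.

```yaml
# Sketch of a multi-modal fine-tune assembled from the options above (values are illustrative)
base_model: google/gemma-3-4b-it
chat_template: gemma3

datasets:
  - path: your-org/your-multimodal-dataset   # placeholder dataset in the extended chat_template format
    type: chat_template                      # assumed dataset type for multi-modal chat data

# (optional) resize images to a set size
image_size: 512
```

Model-specific details still apply as described above, for example extra dependencies (`timm` for Gemma-3n, `librosa` for audio) and whichever adapter or freezing strategy you choose.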
diff --git a/docs/nd_parallelism.qmd b/docs/nd_parallelism.qmd new file mode 100644 index 000000000..435e53e21 --- /dev/null +++ b/docs/nd_parallelism.qmd @@ -0,0 +1,108 @@ +--- +title: "N-D Parallelism (Beta)" +--- + +Axolotl enables training models at scale by composing different parallelism techniques. This is essential when: + +- A model's weights are too large to fit on a single GPU's memory. +- A model's activations, especially with very long contexts, are too large for a single GPU. +- You want to accelerate training by using multiple GPUs or nodes. + +or combinations of the above! + +## Core Concepts + +Parallelism strategies can be combined. The key is understanding how each one divides the workload. PyTorch's `DeviceMesh` is the modern way to manage these combinations, creating a logical grid of your GPUs and assigning different parallel strategies to different dimensions of the grid. + +### Data Parallelism {#sec-dp} + +Data Parallelism focuses on splitting the global data batch across GPUs. + +- Distributed Data Parallel (DDP): The classic approach. The full model is replicated on every GPU. Each GPU processes a different slice of the data batch. Gradients are then averaged across all GPUs after the backward pass to keep the models synchronized. This can substantially improve data throughput compared to single-device training, but requires that each GPU is able to hold the entire model, its gradients, and optimizer states. + +- [Fully Sharded Data Parallel (FSDP)](multi-gpu.qmd#fully-sharded-data-parallel-(fsdp)): A highly memory-efficient form of data parallelism (inspired by DeepSpeed's ZeRO). Instead of replicating the model, FSDP shards the model's *parameters, gradients, and optimizer states* across the GPUs in the data-parallel group. During computation, each GPU receives the specific parameters it needs via an `all_gather` operation just before they are used, and they can be discarded immediately after (`reshard-after-forward`). + - FSDP maps to ZeRO stages: + - ZeRO-2 (`reshard_after_forward=False`): Shards gradients and optimizer states. Model weights are replicated on each GPU. + - ZeRO-3 (`reshard_after_forward=True`): Shards gradients, optimizer states, AND model parameters. This provides the most memory savings at the cost of more communication (re-gathering parameters for both forward and backward passes). + +### [Experimental] Tensor Parallelism (TP) {#sec-tp} + +Also known as "horizontal model parallelism," as described in the [Megatron-LM paper](https://arxiv.org/pdf/1909.08053.pdf). Instead of splitting the batch, TP splits the model's layers themselves across GPUs. + +- How it works: For a linear layer `Y = XA`, the weight matrix `A` is split column-wise (`A = [A_1, A_2]`). The computation becomes `Y_1 = XA_1` and `Y_2 = XA_2`, which can happen in parallel on different GPUs. The final output `Y` is simply the concatenation of `Y_1` and `Y_2`. Check [this comment](https://github.com/huggingface/transformers/issues/10321#issuecomment-783543530) for more detailed info. +- Requirement: TP involves frequent, small communications within a forward/backward pass. It requires a very fast interconnect between GPUs (e.g., NVLink) and is typically not recommended across different nodes. + +### Context Parallelism (CP) {#sec-cp} + +Context Parallelism, also called [Sequence Parallelism](sequence_parallelism.qmd), addresses the memory bottleneck from long sequences. The input sequence itself is split along the sequence length dimension and distributed across GPUs. 
+ +- How it works: If you have a sequence of 8192 tokens and a `context_parallel_size` of 4, each GPU will only handle a chunk of 2048 tokens. +- The Challenge: Attention is not local; every token needs to "attend to" every other token. Splitting the sequence breaks this. +- The Solution (`ring-flash-attention`): An efficient communication protocol is used. To compute attention for its local sequence chunk, each GPU passes its Key-Value (KV) cache to its neighbor in a "ring." After `N-1` steps, every GPU has seen the KV-cache from all other GPUs, allowing it to compute the correct attention values for its chunk. This is implemented using the highly optimized `flash-attention` kernel at each step. + +### Hybrid Sharding Data Parallel (HSDP) {#sec-hsdp} + +HSDP is a 2D strategy that intelligently combines FSDP and DDP, typically for multi-node training. + +- Intra-Node (within a machine): Use FSDP. This is efficient because GPUs on the same node have fast interconnects (NVLink), making the `all_gather` operations for sharded parameters fast. +- Inter-Node (across machines): Use DDP. The gradient synchronization between nodes is less frequent than FSDP's parameter gathering, making it a better fit for the slower node-to-node network (e.g., Ethernet/Infiniband). +- Example: With 2 nodes of 8 GPUs each (16 total), you could have `dp_shard_size=8` (FSDP within each node) and `dp_replicate_size=2` (DDP across the two nodes). + +## Usage + +```yaml +# FSDP config. See https://docs.axolotl.ai/docs/multi-gpu.html#sec-fsdp +fsdp_version: 2 +fsdp_config: + # ... + +# The number of GPUs to shard the model parameters across (FSDP dimension). +dp_shard_size: 4 + +# The number of times to replicate the sharded model (DDP dimension). +dp_replicate_size: 2 + +# Number of GPUs for Tensor Parallelism. +tensor_parallel_size: 1 # (default is 1, no TP) + +# Number of GPUs for Context/Sequence Parallelism. +context_parallel_size: 1 # (default is 1, no CP) +``` + +Note: We recommend FSDP. DeepSpeed is only compatible with `tensor_parallel_size`. + +## Examples + +::: {.callout-tip} +See our example configs [here](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/distributed-parallel). +::: + +1. HSDP on 2 nodes with 4 GPUs each (8 GPUs total): + - You want FSDP within each node and DDP across nodes. + - Set `dp_shard_size: 4` and `dp_replicate_size: 2`. + +2. FSDP + TP on a single 8-GPU node: + - You want to split the model across 4 GPUs using FSDP, and further split each layer across 2 GPUs with TP. + - Set `dp_shard_size: 4` and `tensor_parallel_size: 2`. + +3. FSDP + CP on a single 8-GPU node for long context: + - You want to shard the model across all 8 GPUs and also split the sequence length across all 8 GPUs. + - Set `dp_shard_size: 8` and `context_parallel_size: 8`. Note: this means the data parallel group and context parallel group are the same. A more common setup might be to shard across a smaller group. + +## Support Matrix + +This matrix describes how different parallelism methods can be combined in Axolotl. + +| Combination | `dp_replicate_size` | `dp_shard_size` | `tp_size` | `cp_size` | Status & Notes | +| --- | :---: | :---: |:---:|:---:|---| +| **FSDP** (ZeRO-3) | 1 | >1 | 1 | 1 | ✅ Fully supported. Shards model across all GPUs. | +| **HSDP** | >1 | >1 | 1 | 1 | ✅ Fully supported. FSDP intra-node, DDP inter-node. | +| **FSDP + TP** | 1 | >1 | >1 | 1 | ✅ **2D Parallelism**. Shards the model across a `dp_shard` group, and TP-splits layers within the `tp` group. 
| +| **HSDP + TP** | >1 | >1 | >1 | 1 | ✅ **3D Parallelism**. A powerful but complex combination. | +| **FSDP + CP** | 1 | >1 | 1 | >1 | ✅ **2D Parallelism**. Combines FSDP with context parallelism. | +| **FSDP + TP + CP**| 1 | >1 | >1| >1| ✅ **3D Parallelism**. Another advanced combination. | +| DDP + TP/CP | >1 | 1 | >1 | >1 | ❌ **Not Supported**. The `ParallelismConfig` explicitly prevents this, as composing pure DDP with TP or CP is currently not supported. You should use FSDP + TP/CP instead (`dp_shard_size > 1`). | +| Just TP / CP | 1 | 1 | >1 | >1 | ✅ Supported. Useful for inference or when the model fits on one GPU but context is too long. | + +- `tp_size` refers to `tensor_parallel_size` +- `cp_size` refers to `context_parallel_size` diff --git a/docs/optimizers.qmd b/docs/optimizers.qmd new file mode 100644 index 000000000..45eea1d3a --- /dev/null +++ b/docs/optimizers.qmd @@ -0,0 +1,129 @@ +--- +title: Optimizers +description: Configuring optimizers +--- + +## Overview + +Axolotl supports all optimizers supported by [transformers OptimizerNames](https://github.com/huggingface/transformers/blob/51f94ea06d19a6308c61bbb4dc97c40aabd12bad/src/transformers/training_args.py#L142-L187) + +Here is a list of optimizers supported by transformers as of `v4.54.0`: + +- `adamw_torch` +- `adamw_torch_fused` +- `adamw_torch_xla` +- `adamw_torch_npu_fused` +- `adamw_apex_fused` +- `adafactor` +- `adamw_anyprecision` +- `adamw_torch_4bit` +- `adamw_torch_8bit` +- `ademamix` +- `sgd` +- `adagrad` +- `adamw_bnb_8bit` +- `adamw_8bit` # alias for adamw_bnb_8bit +- `ademamix_8bit` +- `lion_8bit` +- `lion_32bit` +- `paged_adamw_32bit` +- `paged_adamw_8bit` +- `paged_ademamix_32bit` +- `paged_ademamix_8bit` +- `paged_lion_32bit` +- `paged_lion_8bit` +- `rmsprop` +- `rmsprop_bnb` +- `rmsprop_bnb_8bit` +- `rmsprop_bnb_32bit` +- `galore_adamw` +- `galore_adamw_8bit` +- `galore_adafactor` +- `galore_adamw_layerwise` +- `galore_adamw_8bit_layerwise` +- `galore_adafactor_layerwise` +- `lomo` +- `adalomo` +- `grokadamw` +- `schedule_free_radam` +- `schedule_free_adamw` +- `schedule_free_sgd` +- `apollo_adamw` +- `apollo_adamw_layerwise` +- `stable_adamw` + + +## Custom Optimizers + +Enable custom optimizers by passing a string to the `optimizer` argument. Each optimizer will receive beta and epsilon args, however, some may accept additional args which are detailed below. + +### optimi_adamw + +```yaml +optimizer: optimi_adamw +``` + +### ao_adamw_4bit + +Deprecated: Please use `adamw_torch_4bit`. + +### ao_adamw_8bit + +Deprecated: Please use `adamw_torch_8bit`. 
+ +### ao_adamw_fp8 + + +```yaml +optimizer: ao_adamw_fp8 +``` + +### adopt_adamw + +GitHub: [https://github.com/iShohei220/adopt](https://github.com/iShohei220/adopt) +Paper: [https://arxiv.org/abs/2411.02853](https://arxiv.org/abs/2411.02853) + +```yaml +optimizer: adopt_adamw +``` + +### came_pytorch + +GitHub: [https://github.com/yangluo7/CAME/tree/master](https://github.com/yangluo7/CAME/tree/master) +Paper: [https://arxiv.org/abs/2307.02047](https://arxiv.org/abs/2307.02047) + +```yaml +optimizer: came_pytorch + +# optional args (defaults below) +adam_beta1: 0.9 +adam_beta2: 0.999 +adam_beta3: 0.9999 +adam_epsilon: 1e-30 +adam_epsilon2: 1e-16 +``` + +### muon + +Blog: [https://kellerjordan.github.io/posts/muon/](https://kellerjordan.github.io/posts/muon/) +Paper: [https://arxiv.org/abs/2502.16982v1](https://arxiv.org/abs/2502.16982v1) + +```yaml +optimizer: muon +``` + +### dion + +Microsoft's Dion (DIstributed OrthoNormalization) optimizer is a scalable and communication-efficient +orthonormalizing optimizer that uses low-rank approximations to reduce gradient communication. + +GitHub: [https://github.com/microsoft/dion](https://github.com/microsoft/dion) +Paper: [https://arxiv.org/pdf/2504.05295](https://arxiv.org/pdf/2504.05295) +Note: Implementation requires PyTorch 2.7+ for DTensor support + +```yaml +optimizer: dion +dion_lr: 0.01 +dion_momentum: 0.95 +lr: 0.00001 # learning rate for embeddings and parameters that fall back to AdamW +``` diff --git a/docs/qat.qmd b/docs/qat.qmd new file mode 100644 index 000000000..e0d000a79 --- /dev/null +++ b/docs/qat.qmd @@ -0,0 +1,32 @@ +--- +title: "Quantization Aware Training (QAT)" +back-to-top-navigation: true +toc: true +toc-expand: 2 +toc-depth: 4 +--- + +## Overview + +[Quantization Aware Training](https://pytorch.org/blog/introduction-to-quantization-on-pytorch/#quantization-aware-training) (QAT) is a technique for improving the accuracy of quantized models +by applying "fake" quantization to the model's weights (and optionally, activations) during training. This fake +quantization allows the model to adjust to the noise introduced by quantization, so that when the model is eventually +quantized, the accuracy loss is minimized. We use the quantization techniques implemented in [torchao](https://github.com/pytorch/ao) to provide +support for QAT and post-training quantization (PTQ) in axolotl. + +We recommend reviewing the excellent QAT tutorial in the [torchtune library](https://pytorch.org/torchtune/main/tutorials/qat_finetune.html#quantizing-the-qat-model), +and the QAT documentation in the [torchao library](https://github.com/pytorch/ao/tree/main/torchao/quantization/qat), for more details. + +## Configuring QAT in Axolotl + +To enable QAT in axolotl, add the following to your configuration file: + +```yaml +qat: + activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8" + weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8" + group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization + fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after +``` + +Once you have finished training, you must quantize your model using the same quantization configuration that you used during training. You can use the [`quantize`](./quantize.qmd) command to do this.
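For a concrete starting point, here is a minimal sketch of the QAT section with the fields above filled in; the values are illustrative rather than tuned recommendations.

```yaml
qat:
  weight_dtype: int4              # fake-quantize weights to int4
  activation_dtype: int8          # fake-quantize activations to int8
  group_size: 32                  # per-group quantization granularity
  fake_quant_after_n_steps: 1000  # illustrative: no fake quantization for the first 1000 steps
```

Delaying fake quantization with `fake_quant_after_n_steps` can give the model a stretch of ordinary full-precision training before quantization noise is introduced.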
diff --git a/docs/quantize.qmd b/docs/quantize.qmd new file mode 100644 index 000000000..113fcafbe --- /dev/null +++ b/docs/quantize.qmd @@ -0,0 +1,53 @@ +--- +title: "Quantization with torchao" +back-to-top-navigation: true +toc: true +toc-expand: 2 +toc-depth: 4 +--- + +Quantization is a technique to lower the memory footprint of your model, potentially at the cost of accuracy or model performance. We support quantizing your model using the [torchao](https://github.com/pytorch/ao) library. Both post-training quantization (PTQ) and quantization-aware training (QAT) are supported. + + +::: {.callout-note} + +We do not currently support quantization techniques such as GGUF, GPTQ, or EXL2. + +::: + +## Configuring Quantization in Axolotl + +Quantization is configured using the `quantization` key in your configuration file. + +```yaml +base_model: # The path to the model to quantize. +quantization: + weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8 + activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8" + group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization + quantize_embedding: # Optional[bool] = False. Whether to quantize the embedding layer. + +output_dir: # The path to the output directory. +``` + +Once quantization is complete, your quantized model will be saved in the `{output_dir}/quantized` directory. + +You may also use the `quantize` command to quantize a model which has been trained with [QAT](./qat.qmd). To do this, use the existing QAT configuration file which +you used to train the model: + +```yaml +# qat.yml +qat: + activation_dtype: int8 + weight_dtype: int8 + group_size: 256 + quantize_embedding: true + +output_dir: # The path to the output directory used during training where the final checkpoint has been saved. +``` + +```bash +axolotl quantize qat.yml +``` + +This ensures that an identical quantization configuration is used to quantize the model as was used to train it. diff --git a/docs/rlhf.qmd b/docs/rlhf.qmd index 3a8f87d71..4a67b7559 100644 --- a/docs/rlhf.qmd +++ b/docs/rlhf.qmd @@ -16,7 +16,7 @@ feedback. Various methods include, but not limited to: - [Identity Preference Optimization (IPO)](#ipo) - [Kahneman-Tversky Optimization (KTO)](#kto) - [Odds Ratio Preference Optimization (ORPO)](#orpo) -- Proximal Policy Optimization (PPO) (not yet supported in axolotl) +- [Group Relative Policy Optimization (GRPO)](#grpo) ## RLHF using Axolotl @@ -274,15 +274,14 @@ rl: dpo datasets: - path: ... split: train - type: user_defined.default - - field_prompt: "prompt" - field_system: "system" - field_chosen: "chosen" - field_rejected: "rejected" - prompt_format: "{prompt}" - chosen_format: "{chosen}" - rejected_format: "{rejected}" + type: + field_prompt: "prompt" + field_system: "system" + field_chosen: "chosen" + field_rejected: "rejected" + prompt_format: "{prompt}" + chosen_format: "{chosen}" + rejected_format: "{rejected}" ``` The input format is a simple JSON input with customizable fields based on the above config. @@ -475,14 +474,13 @@ rl: kto datasets: - path: ...
split: train - type: user_defined.default - - field_prompt: "prompt" - field_system: "system" - field_completion: "completion" - field_label: "label" - prompt_format: "{prompt}" - completion_format: "{completion}" + type: + field_prompt: "prompt" + field_system: "system" + field_completion: "completion" + field_label: "label" + prompt_format: "{prompt}" + completion_format: "{completion}" ``` The input format is a simple JSON input with customizable fields based on the above config. @@ -499,7 +497,7 @@ The input format is a simple JSON input with customizable fields based on the ab ### GRPO ::: {.callout-tip} -Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo). +Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/grpo_code). ::: In the latest GRPO implementation, `vLLM` is used to significantly speedup trajectory generation during training. In this example, we're using 4 GPUs - 2 for training, and 2 for vLLM: @@ -582,7 +580,20 @@ datasets: To see other examples of custom reward functions, please see [TRL GRPO Docs](https://github.com/huggingface/trl/blob/main/docs/source/grpo_trainer.md#using-a-custom-reward-function). -To see description of the configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/config/models/input/v0_4_1/trl.py). +To see all configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/v0.9.2/src/axolotl/utils/schemas/trl.py). + +#### GRPO with DAPO/Dr. GRPO loss + +The DAPO paper and subsequently Dr. GRPO paper proposed an alternative loss function for GRPO to remediate the penalty in longer responses. + +```yaml +trl: + loss_type: dr_grpo + # Normalizes loss based on max completion length (default: 256) + max_completion_length: +``` + +For more information, see [GRPO docs](https://huggingface.co/docs/trl/v0.17.0/en/grpo_trainer#loss-types). ### SimPO diff --git a/docs/scripts/generate_config_docs.py b/docs/scripts/generate_config_docs.py new file mode 100644 index 000000000..e22da7d05 --- /dev/null +++ b/docs/scripts/generate_config_docs.py @@ -0,0 +1,752 @@ +# type: ignore + +""" +Quarto documentation generation from Pydantic models. Uses Pydantic model source code +to automatically group fields, including inherited fields from parent classes. 
+""" + +import ast +import inspect +import textwrap +import types +import typing +from typing import Any, FrozenSet, Type, Union + +from pydantic import BaseModel + +from axolotl.utils.schemas.config import AxolotlInputConfig + + +class QuartoGenerator: + """Generate Quarto documentation from Pydantic models.""" + + def __init__(self): + self._class_fields_cache = {} + self._inheritance_map_cache = {} + self._nested_models_cache = {} + + def _get_direct_fields(self, cls: Type[BaseModel]) -> FrozenSet[str]: + """Get fields defined directly in a single class (not inherited).""" + if cls in self._class_fields_cache: + return self._class_fields_cache[cls] + + fields = set() + + # Get annotated fields + if hasattr(cls, "__annotations__"): + fields.update(cls.__annotations__.keys()) + + # Filter out private/special methods + fields = {f for f in fields if not f.startswith("_")} + + result = frozenset(fields) + self._class_fields_cache[cls] = result + return result + + def _is_pydantic_model(self, type_obj) -> bool: + """Check if a type is a Pydantic BaseModel.""" + return inspect.isclass(type_obj) and issubclass(type_obj, BaseModel) + + # pylint: disable=too-many-return-statements + def _extract_nested_type(self, field_type) -> Any: + """Extract the actual type from complex type annotations.""" + # Handle Annotated types (Python 3.9+) + if hasattr(typing, "get_origin") and hasattr(typing, "get_args"): + origin = typing.get_origin(field_type) + args = typing.get_args(field_type) + + if origin is not None: + # Handle Annotated[SomeType, ...] - extract the first argument + if hasattr(typing, "Annotated") and origin is typing.Annotated: + if args: + return self._extract_nested_type( + args[0] + ) # Recursively process the actual type + + # Handle list[SomeType], List[SomeType], etc. 
+ elif origin in (list, typing.List): + if args: + return self._extract_nested_type( + args[0] + ) # Extract element type + + # Handle Union types (including | syntax) + elif origin is typing.Union: + # Get non-None types from the Union + non_none_types = [arg for arg in args if arg is not type(None)] + if len(non_none_types) >= 1: + # Prioritize Pydantic models over primitive types + pydantic_models = [ + arg + for arg in non_none_types + if self._is_pydantic_model(arg) + ] + if pydantic_models: + # Return the first Pydantic model found + return self._extract_nested_type(pydantic_models[0]) + + # No Pydantic models, return the first non-None type + return self._extract_nested_type(non_none_types[0]) + + # Handle new Python 3.10+ union syntax (PeftConfig | None) + if hasattr(field_type, "__class__") and field_type.__class__ is types.UnionType: + # Get non-None types from the Union + non_none_types = [ + arg for arg in field_type.__args__ if arg is not type(None) + ] + if len(non_none_types) >= 1: + # Prioritize Pydantic models over primitive types + pydantic_models = [ + arg for arg in non_none_types if self._is_pydantic_model(arg) + ] + if pydantic_models: + return self._extract_nested_type(pydantic_models[0]) + return self._extract_nested_type(non_none_types[0]) + + # Handle old typing.Union syntax (fallback) + if hasattr(field_type, "__origin__"): + if field_type.__origin__ is Union: + # Get non-None types from the Union + non_none_types = [ + arg for arg in field_type.__args__ if arg is not type(None) + ] + if len(non_none_types) >= 1: + # Prioritize Pydantic models over primitive types + pydantic_models = [ + arg for arg in non_none_types if self._is_pydantic_model(arg) + ] + if pydantic_models: + return self._extract_nested_type(pydantic_models[0]) + return self._extract_nested_type(non_none_types[0]) + # Handle other generic types like dict[str, Any], etc. + elif hasattr(field_type, "__args__"): + return field_type + + return field_type + + # pylint: disable=too-many-return-statements + def _extract_all_pydantic_models_from_type( + self, field_type + ) -> list[type[BaseModel]]: + """Extract all Pydantic models from a type annotation, including from Unions.""" + models = [] + + if field_type is None: + return models + + # Handle Annotated types + if hasattr(typing, "get_origin") and hasattr(typing, "get_args"): + origin = typing.get_origin(field_type) + args = typing.get_args(field_type) + + if origin is not None: + # Handle Annotated[SomeType, ...] - extract from the first argument + if hasattr(typing, "Annotated") and origin is typing.Annotated: + if args: + models.extend( + self._extract_all_pydantic_models_from_type(args[0]) + ) + return models + + # Handle list[SomeType], List[SomeType], etc. 
+ if origin in (list, typing.List): + if args: + models.extend( + self._extract_all_pydantic_models_from_type(args[0]) + ) + return models + + # Handle Union types + if origin is typing.Union: + for arg in args: + if arg is not type(None): # Skip None type + models.extend( + self._extract_all_pydantic_models_from_type(arg) + ) + return models + + # Handle new Python 3.10+ union syntax + if hasattr(field_type, "__class__") and field_type.__class__ is types.UnionType: + for arg in field_type.__args__: + if arg is not type(None): # Skip None type + models.extend(self._extract_all_pydantic_models_from_type(arg)) + return models + + # Handle old typing.Union syntax (fallback) + if hasattr(field_type, "__origin__") and field_type.__origin__ is Union: + for arg in field_type.__args__: + if arg is not type(None): # Skip None type + models.extend(self._extract_all_pydantic_models_from_type(arg)) + return models + + # Check if this type itself is a Pydantic model + if self._is_pydantic_model(field_type): + models.append(field_type) + + return models + + def _get_nested_models( + self, model_class: type[BaseModel], visited=None + ) -> dict[str, type[BaseModel]]: + """Get all nested Pydantic models from a model class.""" + if visited is None: + visited = set() + + # Avoid infinite recursion + if model_class in visited: + return {} + + if model_class in self._nested_models_cache: + return self._nested_models_cache[model_class] + + visited.add(model_class) + nested_models = {} + + # Check all fields in the model + for field_info in model_class.model_fields.values(): + field_type = self._extract_nested_type(field_info.annotation) + + if self._is_pydantic_model(field_type): + nested_models[field_type.__name__] = field_type + # Recursively get nested models from this nested model + deeper_nested = self._get_nested_models(field_type, visited.copy()) + nested_models.update(deeper_nested) + + self._nested_models_cache[model_class] = nested_models + return nested_models + + def _build_inheritance_map(self, child_class: Type[BaseModel]): + """Build inheritance map for a class and all its parents.""" + if child_class in self._inheritance_map_cache: + return self._inheritance_map_cache[child_class] + + inheritance_map = {} + + # Get MRO and filter out BaseModel and object + mro_classes = [ + cls + for cls in child_class.__mro__ + if cls not in (BaseModel, object) and hasattr(cls, "__annotations__") + ] + + # Process each class in the MRO + for cls in mro_classes: + inheritance_map[cls] = self._get_direct_fields(cls) + + self._inheritance_map_cache[child_class] = inheritance_map + return inheritance_map + + def _wrap_comment(self, text: str, width: int = 88) -> list[str]: + """Wrap a comment to specified width, accounting for '# ' prefix.""" + if not text.strip(): + return ["#"] + + # Account for "# " prefix (2 characters) + content_width = width - 2 + wrapped_lines = textwrap.wrap(text, width=content_width) + return [f"# {line}" for line in wrapped_lines] + + def _extract_type_from_source( + self, model_class: type[BaseModel], field_name: str + ) -> str: + """Extract the actual type annotation text from source code, checking inheritance chain.""" + # Use inheritance map to check classes efficiently + inheritance_map = self._build_inheritance_map(model_class) + + # Check classes in MRO order + for cls in model_class.__mro__: + if cls in inheritance_map and field_name in inheritance_map[cls]: + type_annotation = self._get_type_from_class_source(cls, field_name) + if type_annotation != "unknown": + return 
type_annotation + + return "unknown" + + def _get_type_from_class_source(self, class_obj: type, field_name: str) -> str: + """Extract type annotation from a specific class's source code.""" + try: + source = inspect.getsource(class_obj) + tree = ast.parse(source) + except (OSError, TypeError): + return "unknown" + + # Find the class definition + for node in tree.body: + if isinstance(node, ast.ClassDef) and node.name == class_obj.__name__: + # Find the field assignment + for body_node in node.body: + if isinstance(body_node, ast.AnnAssign) and isinstance( + body_node.target, ast.Name + ): + if body_node.target.id == field_name and body_node.annotation: + return ast.unparse(body_node.annotation) + break + + return "unknown" + + def _extract_field_groups_from_all_classes( + self, model_class: type[BaseModel] + ) -> list[dict]: + """Extract field groups from all classes in the inheritance hierarchy.""" + all_groups = [] + inheritance_map = self._build_inheritance_map(model_class) + + # Get all Pydantic base classes in MRO order (most specific first) + # This puts AxolotlInputConfig fields first, then parent class fields + pydantic_classes = [ + cls + for cls in model_class.__mro__ + if cls in inheritance_map and inheritance_map[cls] + ] + + # Extract groups from each class + for cls in pydantic_classes: + class_groups = self._extract_field_groups_from_source(cls) + for group in class_groups: + all_groups.append(group) + + # If no groups found, create a default grouping by class + if not all_groups: + for cls in pydantic_classes: + fields_in_class = inheritance_map[cls] + if fields_in_class: + all_groups.append( + { + "fields": list(fields_in_class), + } + ) + + return all_groups + + # pylint: disable=too-many-return-statements + def _extract_field_groups_from_source( + self, model_class: type[BaseModel] + ) -> list[dict]: + """Extract field groups from source code based on blank lines and comments.""" + try: + source = inspect.getsource(model_class) + tree = ast.parse(source) + except (OSError, TypeError): + # Fallback if we can't get source code + fields_in_class = self._get_direct_fields(model_class) + if fields_in_class: + return [ + { + "fields": list(fields_in_class), + } + ] + return [] + + groups = [] + current_group_fields = [] + current_group_comment = None + + # Find the class definition + class_node = None + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef) and node.name == model_class.__name__: + class_node = node + break + + if not class_node: + fields_in_class = self._get_direct_fields(model_class) + if fields_in_class: + return [ + { + "fields": list(fields_in_class), + } + ] + return [] + + # Parse the source lines to detect groupings + source_lines = source.split("\n") + + # Get fields that are actually defined in this specific class + fields_in_class = self._get_direct_fields(model_class) + + # Find assignments that correspond to model fields for THIS class only + field_assignments = [] + for node in class_node.body: + if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name): + field_name = node.target.id + if field_name in fields_in_class: + field_assignments.append( + { + "name": field_name, + "lineno": node.lineno, + "end_lineno": getattr(node, "end_lineno", node.lineno), + } + ) + + if not field_assignments: + if fields_in_class: + return [ + { + "fields": list(fields_in_class), + } + ] + return [] + + # Sort by line number + field_assignments.sort(key=lambda x: x["lineno"]) + + # Group fields based on blank lines and comments + for i, 
field_info in enumerate(field_assignments): + field_name = field_info["name"] + current_line = field_info["lineno"] + + # Check if this starts a new group (blank line before or significant gap) + is_new_group = False + + if i == 0: + is_new_group = True + else: + prev_end_line = field_assignments[i - 1]["end_lineno"] + + # Check for blank lines or comments between fields + lines_between = source_lines[prev_end_line : current_line - 1] + has_blank_line = any(line.strip() == "" for line in lines_between) + has_comment = any( + line.strip().startswith("#") for line in lines_between + ) + + # Start new group if there's a blank line or comment, or significant gap + if has_blank_line or has_comment or (current_line - prev_end_line > 3): + is_new_group = True + + if is_new_group and current_group_fields: + # Save the previous group + groups.append( + { + "fields": current_group_fields.copy(), + "description": current_group_comment, + } + ) + current_group_fields = [] + current_group_comment = None + + current_group_fields.append(field_name) + + # Add the final group + if current_group_fields: + groups.append( + { + "fields": current_group_fields, + "description": current_group_comment, + } + ) + + return groups + + def _generate_field_documentation( + self, + model_class: type[BaseModel], + field_name: str, + field_info: dict, + field_type_str: str, + is_required: bool, + indent_level: int = 0, + visited_models: set = None, + ) -> list[str]: + """Generate documentation for a single field, expanding nested models inline.""" + if visited_models is None: + visited_models = set() + + lines = [] + indent = " " * indent_level + + # Get the actual field type for nested model detection + if field_name in model_class.model_fields: + pydantic_field_info = model_class.model_fields[field_name] + actual_field_type = pydantic_field_info.annotation + else: + actual_field_type = None + + # Add description comment if available + description = field_info.get("description", "") + if description: + wrapped_lines = self._wrap_comment(description, width=88 - len(indent)) + for line in wrapped_lines: + lines.append(f"{indent}{line}") + + # Extract nested Pydantic models from the type annotation + nested_models = self._extract_all_pydantic_models_from_type(actual_field_type) + + # Filter out already visited models to prevent infinite recursion + expandable_models = [ + model for model in nested_models if model not in visited_models + ] + + if expandable_models: + # This field contains Pydantic models that can be expanded + + # Show the field with its full type annotation + field_line = f"{indent}{field_name}: {field_type_str}" + if field_info.get("default") is not None: + field_line += f" = {field_info['default']}" + if is_required: + field_line += " (required)" + lines.append(field_line) + + # Add to visited to prevent infinite recursion + new_visited = visited_models.copy() + new_visited.update(expandable_models) + + # Expand each nested Pydantic model + for i, nested_model in enumerate(expandable_models): + if i > 0: + lines.append("\n") + lines.append(f"{indent} # For {nested_model.__name__}:") + + # Get nested model schema + try: + nested_schema = nested_model.model_json_schema() + nested_properties = nested_schema.get("properties", {}) + nested_required = nested_schema.get("required", []) + except Exception: # pylint: disable=broad-exception-caught + # Fallback: use model fields directly + nested_properties = {} + nested_required = [] + for ( + nested_field_name, + nested_field_info, + ) in 
nested_model.model_fields.items(): + nested_description = "" + if ( + hasattr(nested_field_info, "json_schema_extra") + and nested_field_info.json_schema_extra + ): + nested_description = ( + nested_field_info.json_schema_extra.get( + "description", "" + ) + ) + elif ( + hasattr(nested_field_info, "description") + and nested_field_info.description + ): + nested_description = nested_field_info.description + + nested_default_val = None + if ( + hasattr(nested_field_info, "default") + and nested_field_info.default is not None + ): + if str(nested_field_info.default) != "PydanticUndefined": + nested_default_val = nested_field_info.default + + nested_properties[nested_field_name] = { + "type": "unknown", + "description": nested_description, + "default": nested_default_val, + } + + if nested_field_info.is_required(): + nested_required.append(nested_field_name) + + # Get field groups for the nested model + nested_field_groups = self._extract_field_groups_from_all_classes( + nested_model + ) + + # Generate nested fields with increased indentation + for i, group in enumerate(nested_field_groups): + if not group["fields"]: + continue + + # Add blank line between groups (except before first group) + if i > 0: + lines.append("") + + # Process nested fields + for nested_field_name in group["fields"]: + if nested_field_name not in nested_properties: + continue + + nested_field_info = nested_properties[nested_field_name] + nested_field_type = self._extract_type_from_source( + nested_model, nested_field_name + ) + nested_is_required = nested_field_name in nested_required + + # Recursively generate documentation for nested field + nested_lines = self._generate_field_documentation( + nested_model, + nested_field_name, + nested_field_info, + nested_field_type, + nested_is_required, + indent_level + 1, + new_visited, + ) + lines.extend(nested_lines) + else: + # Regular field (no expandable nested models) + field_line = f"{indent}{field_name}: {field_type_str}" + if field_info.get("default") is not None: + field_line += f" = {field_info['default']}" + if is_required: + field_line += " (required)" + lines.append(field_line) + + return lines + + def generate_qmd( + self, + model_class: type[BaseModel], + title: str | None = None, + expand_nested: bool = True, + ) -> str: + """Auto-generate config reference documentation including inherited fields.""" + + if title is None: + title = f"{model_class.__name__} Reference" + + # Try to get JSON schema, with fallback for serialization issues + try: + schema = model_class.model_json_schema() + properties = schema.get("properties", {}) + required = schema.get("required", []) + except Exception as e: # pylint: disable=broad-exception-caught + print( + f"Warning: Could not generate JSON schema ({e}). Using model fields instead." 
+ ) + # Fallback: use model fields directly + properties = {} + required = [] + for field_name, field_info in model_class.model_fields.items(): + # Extract description from json_schema_extra or field info + description = "" + if ( + hasattr(field_info, "json_schema_extra") + and field_info.json_schema_extra + ): + description = field_info.json_schema_extra.get("description", "") + elif hasattr(field_info, "description") and field_info.description: + description = field_info.description + + # Get default value + default_val = None + if hasattr(field_info, "default") and field_info.default is not None: + # Handle special Pydantic default markers + if str(field_info.default) != "PydanticUndefined": + default_val = field_info.default + + properties[field_name] = { + "type": "unknown", + "description": description, + "default": default_val, + } + + if field_info.is_required(): + required.append(field_name) + + # Extract field groups from all classes in inheritance hierarchy + field_groups = self._extract_field_groups_from_all_classes(model_class) + + # Start building QMD content + qmd_lines = [ + "---", + f"title: {title}", + "description: A complete list of all configuration options.", + "---", + "", + ] + + # Generate one big code block with all fields (inline nested expansion) + qmd_lines.append("```yaml") + + for i, group in enumerate(field_groups): + if not group["fields"]: + continue + + # Add blank line between groups (except before first group) + if i > 0: + qmd_lines.append("") + + # Process fields in the order they appear in source + for field_name in group["fields"]: + if field_name not in properties: + continue + + field_info = properties[field_name] + field_type = self._extract_type_from_source(model_class, field_name) + is_required = field_name in required + + if expand_nested: + # Check if this field has nested models + if field_name in model_class.model_fields: + pydantic_field_info = model_class.model_fields[field_name] + nested_models = self._extract_all_pydantic_models_from_type( + pydantic_field_info.annotation + ) + has_nested = bool(nested_models) + else: + has_nested = False + + # Add blank line before nested config + if has_nested: + qmd_lines.append("") + + # Use the new inline generation method + field_lines = self._generate_field_documentation( + model_class, + field_name, + field_info, + field_type, + is_required, + indent_level=0, + visited_models=set(), + ) + qmd_lines.extend(field_lines) + + # Add blank line after nested config + if has_nested: + qmd_lines.append("") + else: + # Original simple approach + description = field_info.get("description", "") + default = field_info.get("default") + + # Add wrapped comment for description + if description: + wrapped_lines = self._wrap_comment(description) + qmd_lines.extend(wrapped_lines) + + line = f"{field_name}: {field_type}" + if default is not None: + line += f" = {default}" + if is_required: + line += " (required)" + qmd_lines.append(line) + + qmd_lines.append("```") + + # Join all lines and clean up any double newlines + content = "\n".join(qmd_lines) + + # Replace multiple consecutive newlines with just two newlines (one blank line) + import re + + content = re.sub(r"\n{3,}", "\n\n", content) + + # Ensure single newline at the very end + content = content.rstrip("\n") + "\n" + + return content + + +def main(): + generator = QuartoGenerator() + + print("Generating config reference content...") + qmd_content = generator.generate_qmd(AxolotlInputConfig, "Config Reference", True) + + print("Writing to file...") + with 
open("docs/config-reference.qmd", "w", encoding="utf-8") as f: + f.write(qmd_content) + print("Done!") + + +if __name__ == "__main__": + main() diff --git a/docs/sequence_parallelism.qmd b/docs/sequence_parallelism.qmd index b98206135..d1933a145 100644 --- a/docs/sequence_parallelism.qmd +++ b/docs/sequence_parallelism.qmd @@ -22,7 +22,7 @@ To enable sequence parallelism, add the following to your configuration file: ```yaml # Set to a divisor (> 1) of the number of GPUs available -sequence_parallel_degree: 4 # Split sequences across 4 GPUs +context_parallel_size: 4 # Split sequences across 4 GPUs # Optional; strides across the key dimension. Larger values use more memory but should make training faster. heads_k_stride: 1 # Optional; one of "varlen_llama3" or "batch_ring". Defaults to @@ -30,7 +30,7 @@ heads_k_stride: 1 ring_attn_func: ``` -The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example: +The `context_parallel_size` should be a divisor of the total number of GPUs. For example: - With 8 GPUs, valid values would be 2, 4, or 8 - With 4 GPUs, valid values would be 2 or 4 @@ -66,7 +66,7 @@ sequence_len: 8192 ... -sequence_parallel_degree: 4 # Split each sequence into 4 parts, one per GPU +context_parallel_size: 4 # Split each sequence into 4 parts, one per GPU # Optional; strides across the key dimension. Larger values use more memory but should make training faster. heads_k_stride: 1 # Optional; one of "varlen_llama3" or "batch_ring". Defaults to @@ -89,12 +89,12 @@ Sequence parallelism is compatible with Axolotl's sample packing functionality. ## Effect on Batch Size -When using sequence parallelism, your effective global batch size is **divided** by the `sequence_parallel_degree`. This happens because: +When using sequence parallelism, your effective global batch size is **divided** by the `context_parallel_size`. This happens because: -- Each group of `sequence_parallel_degree` GPUs works on the same batch (just different parts of each sequence) +- Each group of `context_parallel_size` GPUs works on the same batch (just different parts of each sequence) - The number of batches processed per step decreases For example: - With 8 GPUs and no sequence parallelism: 8 different batches processed per step -- With 8 GPUs and `sequence_parallel_degree=4`: Only 2 different batches processed per step (each split across 4 GPUs) +- With 8 GPUs and `context_parallel_size=4`: Only 2 different batches processed per step (each split across 4 GPUs) - If your per-GPU `micro_batch_size` is 2, the global batch size decreases from 16 to 4 diff --git a/examples/alst/README.md b/examples/alst/README.md new file mode 100644 index 000000000..7f194d299 --- /dev/null +++ b/examples/alst/README.md @@ -0,0 +1,9 @@ +# Arctic Long Sequence Training (ALST) + +Artic Long Sequence Training (ALST) is a technique for training long context models using a variety of optimization +techniques. It is a combination of: +- TiledMLP: Leverage tiling over the sequence dimension on MLP layers to reduce memory usage +- Tiled Loss: Using optimized loss functions like Liger-Kernel or Cut Cross Entropy to reduce memory usage +- Activation Offloading: Offload activations to CPU RAM to reduce memory usage + +For more information, you can check out the ALST paper [here](https://www.arxiv.org/abs/2506.13996). 
diff --git a/examples/alst/llama3-8b-deepspeed-alst.yaml b/examples/alst/llama3-8b-deepspeed-alst.yaml new file mode 100644 index 000000000..dea23c5ee --- /dev/null +++ b/examples/alst/llama3-8b-deepspeed-alst.yaml @@ -0,0 +1,53 @@ +base_model: meta-llama/Llama-3.1-8B +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +datasets: + - path: togethercomputer/Long-Data-Collections + type: completion + field: text + data_files: + - pretrain/rp_sub.jsonl.zst + - path: princeton-nlp/TextbookChapters + type: completion + field: chapter +dataset_prepared_path: last_run_prepared +val_set_size: 0.0 +output_dir: ./outputs/out + +sequence_len: 500_000 +min_sample_len: 200_000 +sample_packing: true + +tiled_mlp: true +context_parallel_size: 8 +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +gradient_accumulation_steps: 1 +micro_batch_size: 1 +num_epochs: 1 +optimizer: adamw_torch_8bit +lr_scheduler: cosine +learning_rate: 2e-5 + +bf16: auto +tf32: true + +gradient_checkpointing: true +activation_offloading: legacy + +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_steps: 100 +saves_per_epoch: 1 +evals_per_epoch: 2 +weight_decay: 0.0 +special_tokens: + pad_token: <|end_of_text|> + +deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_all.json + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/alst/llama3-8b-fsdp2-alst.yaml b/examples/alst/llama3-8b-fsdp2-alst.yaml new file mode 100644 index 000000000..c8a978264 --- /dev/null +++ b/examples/alst/llama3-8b-fsdp2-alst.yaml @@ -0,0 +1,59 @@ +base_model: meta-llama/Llama-3.1-8B +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +datasets: + - path: togethercomputer/Long-Data-Collections + type: completion + field: text + data_files: + - pretrain/rp_sub.jsonl.zst + - path: princeton-nlp/TextbookChapters + type: completion + field: chapter +dataset_prepared_path: last_run_prepared +val_set_size: 0.0 +output_dir: ./outputs/out + +sequence_len: 500_000 +min_sample_len: 200_000 +sample_packing: true + +tiled_mlp: true +context_parallel_size: 8 +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +gradient_accumulation_steps: 1 +micro_batch_size: 1 +num_epochs: 1 +optimizer: adamw_torch_8bit +lr_scheduler: cosine +learning_rate: 2e-5 + +bf16: auto +tf32: true + +gradient_checkpointing: true +activation_offloading: legacy + +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_steps: 100 +saves_per_epoch: 1 +evals_per_epoch: 2 +weight_decay: 0.0 +special_tokens: + pad_token: <|end_of_text|> + +fsdp_version: 2 +fsdp_config: + offload_params: false # offloading is currently not compatible with SP + torchao optimizer + state_dict_type: SHARDED_STATE_DICT + auto_wrap_policy: TRANSFORMER_BASED_WRAP + transformer_layer_cls_to_wrap: LlamaDecoderLayer + reshard_after_forward: true + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/arcee/README.md b/examples/arcee/README.md new file mode 100644 index 000000000..217893306 --- /dev/null +++ b/examples/arcee/README.md @@ -0,0 +1,53 @@ +# Finetune ArceeAI's AFM with Axolotl + +[Arcee Foundation Models (AFM)](https://huggingface.co/collections/arcee-ai/afm-45b-68823397c351603014963473) are a family of 4.5B parameter open weight models trained by Arcee.ai. 
+ +This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking. + +Thanks to the team at Arcee.ai for using Axolotl in supervised fine-tuning the AFM model. + +## Getting started + +1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as AFM is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html). + + Here is an example of how to install from main for pip: + +```bash +# Ensure you have Pytorch installed (Pytorch 2.6.0 min) +git clone https://github.com/axolotl-ai-cloud/axolotl.git +cd axolotl + +pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja +pip3 install --no-build-isolation -e '.[flash-attn]' +``` + +2. Run the finetuning example: + +```bash +axolotl train examples/arcee/afm-4.5b-qlora.yaml +``` + +This config uses about 7.8GiB VRAM. + +Let us know how it goes. Happy finetuning! 🚀 + +### TIPS + +- For inference, the official Arcee.ai team recommends `top_p: 0.95`, `temperature: 0.5`, `top_k: 50`, and `repeat_penalty: 1.1`. +- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. +- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). +- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). + +## Optimization Guides + +- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) +- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) +- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) + +## Related Resources + +- [AFM Blog](https://docs.arcee.ai/arcee-foundation-models/introduction-to-arcee-foundation-models) +- [Axolotl Docs](https://docs.axolotl.ai) +- [Axolotl Website](https://axolotl.ai) +- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) +- [Axolotl Discord](https://discord.gg/7m9sfhzaf3) diff --git a/examples/arcee/afm-4.5b-qlora.yaml b/examples/arcee/afm-4.5b-qlora.yaml new file mode 100644 index 000000000..2cb42cacd --- /dev/null +++ b/examples/arcee/afm-4.5b-qlora.yaml @@ -0,0 +1,64 @@ +base_model: arcee-ai/AFM-4.5B + +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +load_in_8bit: false +load_in_4bit: true + +datasets: + - path: fozziethebeat/alpaca_messages_2k_test + type: chat_template + +dataset_prepared_path: last_run_prepared +val_set_size: 0.1 +output_dir: ./outputs/lora-out + +adapter: qlora +lora_model_dir: + +sequence_len: 2048 +sample_packing: true + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_target_modules: + - gate_proj + - down_proj + - up_proj + - q_proj + - v_proj + - k_proj + - o_proj + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 1 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/archived/README.md b/examples/archived/README.md new file mode 100644 index 
000000000..da797c552 --- /dev/null +++ b/examples/archived/README.md @@ -0,0 +1,5 @@ +# Archived Examples + +This directory contains examples that are no longer maintained and may no longer be functional. + +We keep them around for archival purposes in case they are useful to others. diff --git a/examples/cerebras/btlm-ft.yml b/examples/archived/cerebras/btlm-ft.yml similarity index 98% rename from examples/cerebras/btlm-ft.yml rename to examples/archived/cerebras/btlm-ft.yml index c9878779d..c3495d287 100644 --- a/examples/cerebras/btlm-ft.yml +++ b/examples/archived/cerebras/btlm-ft.yml @@ -66,7 +66,7 @@ flash_optimum: gptq_groupsize: gptq_model_v1: -warmup_steps: 32 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 save_total_limit: diff --git a/examples/cerebras/qlora.yml b/examples/archived/cerebras/qlora.yml similarity index 98% rename from examples/cerebras/qlora.yml rename to examples/archived/cerebras/qlora.yml index 55cc597f1..4598a8338 100644 --- a/examples/cerebras/qlora.yml +++ b/examples/archived/cerebras/qlora.yml @@ -43,7 +43,7 @@ xformers_attention: true flash_attention: gptq_groupsize: gptq_model_v1: -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 diff --git a/examples/code-llama/13b/lora.yml b/examples/archived/code-llama/13b/lora.yml similarity index 96% rename from examples/code-llama/13b/lora.yml rename to examples/archived/code-llama/13b/lora.yml index 0ed2382ba..ace94b619 100644 --- a/examples/code-llama/13b/lora.yml +++ b/examples/archived/code-llama/13b/lora.yml @@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora lora_model_dir: @@ -47,7 +47,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 diff --git a/examples/code-llama/13b/qlora.yml b/examples/archived/code-llama/13b/qlora.yml similarity index 96% rename from examples/code-llama/13b/qlora.yml rename to examples/archived/code-llama/13b/qlora.yml index 22bd1691b..f4ed17af5 100644 --- a/examples/code-llama/13b/qlora.yml +++ b/examples/archived/code-llama/13b/qlora.yml @@ -20,7 +20,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 @@ -48,7 +48,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 diff --git a/examples/code-llama/34b/lora.yml b/examples/archived/code-llama/34b/lora.yml similarity index 96% rename from examples/code-llama/34b/lora.yml rename to examples/archived/code-llama/34b/lora.yml index 25dc9f421..0a1d71467 100644 --- a/examples/code-llama/34b/lora.yml +++ b/examples/archived/code-llama/34b/lora.yml @@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora lora_model_dir: @@ -47,7 +47,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 diff --git a/examples/code-llama/34b/qlora.yml b/examples/archived/code-llama/34b/qlora.yml similarity index 96% rename from examples/code-llama/34b/qlora.yml rename to examples/archived/code-llama/34b/qlora.yml index 0e33e2a45..ec17bf200 100644 --- a/examples/code-llama/34b/qlora.yml +++ b/examples/archived/code-llama/34b/qlora.yml @@ -20,7 +20,7 @@ lora_model_dir: 
sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 @@ -48,7 +48,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 diff --git a/examples/code-llama/7b/lora.yml b/examples/archived/code-llama/7b/lora.yml similarity index 95% rename from examples/code-llama/7b/lora.yml rename to examples/archived/code-llama/7b/lora.yml index d288b9f65..174c17d2c 100644 --- a/examples/code-llama/7b/lora.yml +++ b/examples/archived/code-llama/7b/lora.yml @@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora lora_model_dir: @@ -47,7 +47,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 diff --git a/examples/code-llama/7b/qlora.yml b/examples/archived/code-llama/7b/qlora.yml similarity index 96% rename from examples/code-llama/7b/qlora.yml rename to examples/archived/code-llama/7b/qlora.yml index de41c0123..08e67d8c2 100644 --- a/examples/code-llama/7b/qlora.yml +++ b/examples/archived/code-llama/7b/qlora.yml @@ -20,7 +20,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 @@ -48,7 +48,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 diff --git a/examples/code-llama/README.md b/examples/archived/code-llama/README.md similarity index 100% rename from examples/code-llama/README.md rename to examples/archived/code-llama/README.md diff --git a/examples/dbrx/16bit-lora.yaml b/examples/archived/dbrx/16bit-lora.yaml similarity index 98% rename from examples/dbrx/16bit-lora.yaml rename to examples/archived/dbrx/16bit-lora.yaml index 852654d49..05946dfe9 100644 --- a/examples/dbrx/16bit-lora.yaml +++ b/examples/archived/dbrx/16bit-lora.yaml @@ -54,7 +54,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 diff --git a/examples/dbrx/8bit-lora.yaml b/examples/archived/dbrx/8bit-lora.yaml similarity index 98% rename from examples/dbrx/8bit-lora.yaml rename to examples/archived/dbrx/8bit-lora.yaml index 0b9402194..f159bf7fa 100644 --- a/examples/dbrx/8bit-lora.yaml +++ b/examples/archived/dbrx/8bit-lora.yaml @@ -57,7 +57,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 diff --git a/examples/dbrx/README.md b/examples/archived/dbrx/README.md similarity index 100% rename from examples/dbrx/README.md rename to examples/archived/dbrx/README.md diff --git a/examples/dbrx/fft-ds-zero3.yaml b/examples/archived/dbrx/fft-ds-zero3.yaml similarity index 98% rename from examples/dbrx/fft-ds-zero3.yaml rename to examples/archived/dbrx/fft-ds-zero3.yaml index e42c16673..13cd0d997 100644 --- a/examples/dbrx/fft-ds-zero3.yaml +++ b/examples/archived/dbrx/fft-ds-zero3.yaml @@ -41,7 +41,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 diff --git a/examples/deepcoder/deepcoder-14B-preview-lora.yml b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml similarity index 95% rename from examples/deepcoder/deepcoder-14B-preview-lora.yml rename to 
examples/archived/deepcoder/deepcoder-14B-preview-lora.yml index 9e92c0a07..2202091d5 100644 --- a/examples/deepcoder/deepcoder-14B-preview-lora.yml +++ b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml @@ -21,7 +21,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: @@ -51,7 +51,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 diff --git a/examples/falcon/config-7b-lora.yml b/examples/archived/falcon/config-7b-lora.yml similarity index 98% rename from examples/falcon/config-7b-lora.yml rename to examples/archived/falcon/config-7b-lora.yml index 391d4dd94..f4fedbede 100644 --- a/examples/falcon/config-7b-lora.yml +++ b/examples/archived/falcon/config-7b-lora.yml @@ -47,7 +47,7 @@ xformers_attention: true flash_attention: gptq_groupsize: gptq_model_v1: -warmup_steps: 40 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 diff --git a/examples/falcon/config-7b-qlora.yml b/examples/archived/falcon/config-7b-qlora.yml similarity index 99% rename from examples/falcon/config-7b-qlora.yml rename to examples/archived/falcon/config-7b-qlora.yml index a9af8574c..a44cc40a6 100644 --- a/examples/falcon/config-7b-qlora.yml +++ b/examples/archived/falcon/config-7b-qlora.yml @@ -77,7 +77,7 @@ xformers_attention: true flash_attention: gptq_groupsize: gptq_model_v1: -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.000001 diff --git a/examples/falcon/config-7b.yml b/examples/archived/falcon/config-7b.yml similarity index 98% rename from examples/falcon/config-7b.yml rename to examples/archived/falcon/config-7b.yml index 3cc553daa..5481fb236 100644 --- a/examples/falcon/config-7b.yml +++ b/examples/archived/falcon/config-7b.yml @@ -44,7 +44,7 @@ xformers_attention: true flash_attention: gptq_groupsize: gptq_model_v1: -warmup_steps: 40 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 diff --git a/examples/gemma/qlora.yml b/examples/archived/gemma/qlora.yml similarity index 97% rename from examples/gemma/qlora.yml rename to examples/archived/gemma/qlora.yml index 2738112b4..80829b3c9 100644 --- a/examples/gemma/qlora.yml +++ b/examples/archived/gemma/qlora.yml @@ -25,7 +25,7 @@ lora_target_linear: true sequence_len: 4096 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/gptj/qlora.yml b/examples/archived/gptj/qlora.yml similarity index 98% rename from examples/gptj/qlora.yml rename to examples/archived/gptj/qlora.yml index c3cf9f973..6348566c2 100644 --- a/examples/gptj/qlora.yml +++ b/examples/archived/gptj/qlora.yml @@ -40,7 +40,7 @@ xformers_attention: true flash_attention: gptq_groupsize: gptq_model_v1: -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 diff --git a/examples/jeopardy-bot/config.yml b/examples/archived/jeopardy-bot/config.yml similarity index 98% rename from examples/jeopardy-bot/config.yml rename to examples/archived/jeopardy-bot/config.yml index 3609bd97e..ab1d19784 100644 --- a/examples/jeopardy-bot/config.yml +++ b/examples/archived/jeopardy-bot/config.yml @@ -41,7 +41,7 @@ xformers_attention: true flash_attention: gptq_groupsize: gptq_model_v1: -warmup_steps: 20 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 diff --git 
a/examples/mpt-7b/README.md b/examples/archived/mpt-7b/README.md similarity index 100% rename from examples/mpt-7b/README.md rename to examples/archived/mpt-7b/README.md diff --git a/examples/mpt-7b/config.yml b/examples/archived/mpt-7b/config.yml similarity index 98% rename from examples/mpt-7b/config.yml rename to examples/archived/mpt-7b/config.yml index e7485fad7..1fff51b6e 100644 --- a/examples/mpt-7b/config.yml +++ b/examples/archived/mpt-7b/config.yml @@ -42,7 +42,7 @@ logging_steps: 5 flash_attention: gptq_groupsize: gptq_model_v1: -warmup_steps: 20 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0001 diff --git a/examples/openllama-3b/README.md b/examples/archived/openllama-3b/README.md similarity index 100% rename from examples/openllama-3b/README.md rename to examples/archived/openllama-3b/README.md diff --git a/examples/openllama-3b/config.yml b/examples/archived/openllama-3b/config.yml similarity index 98% rename from examples/openllama-3b/config.yml rename to examples/archived/openllama-3b/config.yml index 17eeb73ae..63056ed6d 100644 --- a/examples/openllama-3b/config.yml +++ b/examples/archived/openllama-3b/config.yml @@ -42,7 +42,7 @@ logging_steps: 1 flash_attention: true gptq_groupsize: gptq_model_v1: -warmup_steps: 20 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 diff --git a/examples/openllama-3b/lora.yml b/examples/archived/openllama-3b/lora.yml similarity index 98% rename from examples/openllama-3b/lora.yml rename to examples/archived/openllama-3b/lora.yml index 073117f11..b70821ce2 100644 --- a/examples/openllama-3b/lora.yml +++ b/examples/archived/openllama-3b/lora.yml @@ -50,7 +50,7 @@ logging_steps: 1 flash_attention: true gptq_groupsize: gptq_model_v1: -warmup_steps: 20 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 diff --git a/examples/openllama-3b/qlora.yml b/examples/archived/openllama-3b/qlora.yml similarity index 98% rename from examples/openllama-3b/qlora.yml rename to examples/archived/openllama-3b/qlora.yml index b4fca2c07..a34f2964b 100644 --- a/examples/openllama-3b/qlora.yml +++ b/examples/archived/openllama-3b/qlora.yml @@ -43,7 +43,7 @@ logging_steps: 1 flash_attention: true gptq_groupsize: gptq_model_v1: -warmup_steps: 20 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 diff --git a/examples/pythia-12b/README.md b/examples/archived/pythia-12b/README.md similarity index 100% rename from examples/pythia-12b/README.md rename to examples/archived/pythia-12b/README.md diff --git a/examples/pythia-12b/config.yml b/examples/archived/pythia-12b/config.yml similarity index 100% rename from examples/pythia-12b/config.yml rename to examples/archived/pythia-12b/config.yml diff --git a/examples/pythia/lora.yml b/examples/archived/pythia/lora.yml similarity index 100% rename from examples/pythia/lora.yml rename to examples/archived/pythia/lora.yml diff --git a/examples/qwen/README.md b/examples/archived/qwen/README.md similarity index 100% rename from examples/qwen/README.md rename to examples/archived/qwen/README.md diff --git a/examples/qwen/lora.yml b/examples/archived/qwen/lora.yml similarity index 98% rename from examples/qwen/lora.yml rename to examples/archived/qwen/lora.yml index 9a2843236..29de25611 100644 --- a/examples/qwen/lora.yml +++ b/examples/archived/qwen/lora.yml @@ -49,7 +49,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 diff 
--git a/examples/qwen/qlora.yml b/examples/archived/qwen/qlora.yml similarity index 98% rename from examples/qwen/qlora.yml rename to examples/archived/qwen/qlora.yml index 5f85b44dd..d46669444 100644 --- a/examples/qwen/qlora.yml +++ b/examples/archived/qwen/qlora.yml @@ -49,7 +49,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 diff --git a/examples/qwen/qwen2-moe-lora.yaml b/examples/archived/qwen/qwen2-moe-lora.yaml similarity index 98% rename from examples/qwen/qwen2-moe-lora.yaml rename to examples/archived/qwen/qwen2-moe-lora.yaml index afce443a0..1d5e1b524 100644 --- a/examples/qwen/qwen2-moe-lora.yaml +++ b/examples/archived/qwen/qwen2-moe-lora.yaml @@ -45,7 +45,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 diff --git a/examples/qwen/qwen2-moe-qlora.yaml b/examples/archived/qwen/qwen2-moe-qlora.yaml similarity index 98% rename from examples/qwen/qwen2-moe-qlora.yaml rename to examples/archived/qwen/qwen2-moe-qlora.yaml index 92a6842cf..08731441b 100644 --- a/examples/qwen/qwen2-moe-qlora.yaml +++ b/examples/archived/qwen/qwen2-moe-qlora.yaml @@ -48,7 +48,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 diff --git a/examples/redpajama/README.md b/examples/archived/redpajama/README.md similarity index 100% rename from examples/redpajama/README.md rename to examples/archived/redpajama/README.md diff --git a/examples/redpajama/config-3b.yml b/examples/archived/redpajama/config-3b.yml similarity index 98% rename from examples/redpajama/config-3b.yml rename to examples/archived/redpajama/config-3b.yml index 3e2999df9..c5b229c3d 100644 --- a/examples/redpajama/config-3b.yml +++ b/examples/archived/redpajama/config-3b.yml @@ -43,7 +43,7 @@ logging_steps: 5 flash_attention: gptq_groupsize: gptq_model_v1: -warmup_steps: 20 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0001 diff --git a/examples/replit-3b/config-lora.yml b/examples/archived/replit-3b/config-lora.yml similarity index 98% rename from examples/replit-3b/config-lora.yml rename to examples/archived/replit-3b/config-lora.yml index 5a02ba10c..d8561762c 100644 --- a/examples/replit-3b/config-lora.yml +++ b/examples/archived/replit-3b/config-lora.yml @@ -41,7 +41,7 @@ logging_steps: 1 flash_attention: gptq_groupsize: gptq_model_v1: -warmup_steps: 20 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0 diff --git a/examples/stablelm-2/1.6b/fft.yml b/examples/archived/stablelm-2/1.6b/fft.yml similarity index 93% rename from examples/stablelm-2/1.6b/fft.yml rename to examples/archived/stablelm-2/1.6b/fft.yml index 9b45b399f..585888f43 100644 --- a/examples/stablelm-2/1.6b/fft.yml +++ b/examples/archived/stablelm-2/1.6b/fft.yml @@ -16,7 +16,7 @@ output_dir: ./outputs/out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora_model_dir: @@ -47,10 +47,9 @@ logging_steps: 1 flash_attention: true flash_attn_cross_entropy: false flash_attn_rms_norm: true -flash_attn_fuse_qkv: false flash_attn_fuse_mlp: true -warmup_steps: 100 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 diff --git a/examples/stablelm-2/1.6b/lora.yml b/examples/archived/stablelm-2/1.6b/lora.yml similarity index 96% rename from examples/stablelm-2/1.6b/lora.yml 
rename to examples/archived/stablelm-2/1.6b/lora.yml index 31e5ad933..6d358bdd8 100644 --- a/examples/stablelm-2/1.6b/lora.yml +++ b/examples/archived/stablelm-2/1.6b/lora.yml @@ -19,7 +19,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora lora_model_dir: @@ -51,7 +51,7 @@ flash_attention: true flash_attn_cross_entropy: false flash_attn_rms_norm: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 diff --git a/examples/stablelm-2/README.md b/examples/archived/stablelm-2/README.md similarity index 100% rename from examples/stablelm-2/README.md rename to examples/archived/stablelm-2/README.md diff --git a/examples/starcoder2/qlora.yml b/examples/archived/starcoder2/qlora.yml similarity index 95% rename from examples/starcoder2/qlora.yml rename to examples/archived/starcoder2/qlora.yml index 18d85f9c3..fecf98d23 100644 --- a/examples/starcoder2/qlora.yml +++ b/examples/archived/starcoder2/qlora.yml @@ -19,7 +19,7 @@ lora_model_dir: sequence_len: 8192 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 @@ -48,7 +48,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 20 +warmup_ratio: 0.1 evals_per_epoch: 4 eval_steps: saves_per_epoch: 4 diff --git a/examples/tiny-llama/README.md b/examples/archived/tiny-llama/README.md similarity index 100% rename from examples/tiny-llama/README.md rename to examples/archived/tiny-llama/README.md diff --git a/examples/tiny-llama/lora-mps.yml b/examples/archived/tiny-llama/lora-mps.yml similarity index 95% rename from examples/tiny-llama/lora-mps.yml rename to examples/archived/tiny-llama/lora-mps.yml index 66cf7cfb3..125090a78 100644 --- a/examples/tiny-llama/lora-mps.yml +++ b/examples/archived/tiny-llama/lora-mps.yml @@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + eval_sample_packing: false adapter: lora @@ -49,7 +49,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: false -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 0 saves_per_epoch: 1 weight_decay: 0.0 diff --git a/examples/tiny-llama/lora.yml b/examples/archived/tiny-llama/lora.yml similarity index 95% rename from examples/tiny-llama/lora.yml rename to examples/archived/tiny-llama/lora.yml index 90998880f..817481e18 100644 --- a/examples/tiny-llama/lora.yml +++ b/examples/archived/tiny-llama/lora.yml @@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: @@ -47,7 +47,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 diff --git a/examples/tiny-llama/pretrain.yml b/examples/archived/tiny-llama/pretrain.yml similarity index 97% rename from examples/tiny-llama/pretrain.yml rename to examples/archived/tiny-llama/pretrain.yml index 5b3706bcb..f15c6ce19 100644 --- a/examples/tiny-llama/pretrain.yml +++ b/examples/archived/tiny-llama/pretrain.yml @@ -38,7 +38,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 diff --git a/examples/tiny-llama/qlora.yml b/examples/archived/tiny-llama/qlora.yml similarity index 95% rename from examples/tiny-llama/qlora.yml rename to examples/archived/tiny-llama/qlora.yml index 
8b2a4565a..d3ff59cb8 100644 --- a/examples/tiny-llama/qlora.yml +++ b/examples/archived/tiny-llama/qlora.yml @@ -21,7 +21,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 @@ -49,7 +49,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 diff --git a/examples/xgen-7b/xgen-7b-8k-qlora.yml b/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml similarity index 99% rename from examples/xgen-7b/xgen-7b-8k-qlora.yml rename to examples/archived/xgen-7b/xgen-7b-8k-qlora.yml index 48066b130..fc09a1e7b 100644 --- a/examples/xgen-7b/xgen-7b-8k-qlora.yml +++ b/examples/archived/xgen-7b/xgen-7b-8k-qlora.yml @@ -75,7 +75,7 @@ xformers_attention: true flash_attention: gptq_groupsize: gptq_model_v1: -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 diff --git a/examples/yi-34B-chat/README.md b/examples/archived/yi-34B-chat/README.md similarity index 100% rename from examples/yi-34B-chat/README.md rename to examples/archived/yi-34B-chat/README.md diff --git a/examples/yi-34B-chat/qlora.yml b/examples/archived/yi-34B-chat/qlora.yml similarity index 98% rename from examples/yi-34B-chat/qlora.yml rename to examples/archived/yi-34B-chat/qlora.yml index a0a95d86f..ba8d12fc8 100644 --- a/examples/yi-34B-chat/qlora.yml +++ b/examples/archived/yi-34B-chat/qlora.yml @@ -20,7 +20,7 @@ special_tokens: datasets: - path: mhenrichsen/alpaca_2k_test type: alpaca -warmup_steps: 10 +warmup_ratio: 0.1 # Iterations num_epochs: 1 diff --git a/examples/cohere/command-r-7b-qlora.yml b/examples/cohere/command-r-7b-qlora.yml index 4a30e9a77..b4741636b 100644 --- a/examples/cohere/command-r-7b-qlora.yml +++ b/examples/cohere/command-r-7b-qlora.yml @@ -27,7 +27,7 @@ lora_target_linear: true sequence_len: 2048 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: @@ -35,7 +35,6 @@ wandb_watch: wandb_name: wandb_log_model: - gradient_accumulation_steps: 4 micro_batch_size: 1 num_epochs: 4 @@ -56,3 +55,5 @@ evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/colab-notebooks/colab-axolotl-example.ipynb b/examples/colab-notebooks/colab-axolotl-example.ipynb index 0b373c28c..69881997e 100644 --- a/examples/colab-notebooks/colab-axolotl-example.ipynb +++ b/examples/colab-notebooks/colab-axolotl-example.ipynb @@ -1,357 +1,9934 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setting up" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "OPLSwmgdrB7g" + }, + "source": [ + "# Fine-Tune Qwen3 14B with Axolotl\n", + "\n", + "[\"Built](https://github.com/axolotl-ai-cloud/axolotl)\n", + "\n", + "Axolotl is the most performant LLM post-training framework available, delivering faster training with efficient, consistent and stable performance. 
Train your workload and ship your product 30% faster, saving you both time and money.\n", + "\n", + "- ⭐ us on [GitHub](https://github.com/axolotl-ai-cloud/axolotl)\n", + "- 📜 Read the [Docs](http://docs.axolotl.ai/)\n", + "- 💬 Chat with us on [Discord](https://discord.gg/mnpEYgRUmD)\n", + "- 📰 Get updates on [X/Twitter](https://x.com/axolotl_ai)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rVjKD7CbxIP3" + }, + "source": [ + "# Installation\n", + "\n", + "Axolotl is easy to install from [pip](https://pypi.org/project/axolotl/), or use our [pre-built Docker images](http://docs.axolotl.ai/docs/docker.html) for a hassle-free dependency experience. See our [docs](http://docs.axolotl.ai/docs/installation.html) for more information." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "msOCO4NRmRLa" + }, + "outputs": [], + "source": [ + "%%capture\n", + "# This step can take ~5-10 minutes to install dependencies\n", + "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n", + "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0ee9ee8\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "N0OW0YeksDLr" + }, + "source": [ + "## Demo: Talk Like a Pirate\n", + "\n", + "In this demo, we are training the model ***to respond like a pirate***. This was chosen as an easy way to show how to train a model to respond in a certain style of your choosing (without being prompted), and it is easy to validate within the scope of a Colab." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8Du2fANTsNCK" + }, + "source": [ + "### Upload your own dataset or use a Huggingface dataset\n", + "\n", + "You can choose to use your own JSONL file from [Google Drive](https://drive.google.com/drive/home); for example, by downloading the [Pirate-Ultrachat JSONL](https://huggingface.co/datasets/winglian/pirate-ultrachat-10k/blob/main/train.jsonl) to your Google Drive. 
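For illustration only, here is a minimal sketch (not taken from the linked dataset; the record contents and the `train.jsonl` filename are placeholders) of how one line of such a JSONL file, in the OpenAI-style chat format described below, could be written:

```python
import json

# Hypothetical single training example in the OpenAI-style chat format:
# one JSON object per line, each holding a list of role/content messages.
record = {
    "messages": [
        {"role": "system", "content": "Ye always answer like a pirate."},
        {"role": "user", "content": "How do I boil an egg?"},
        {"role": "assistant", "content": "Arr, drop it in the bubblin' pot fer ten minutes, matey!"},
    ]
}

# Append the record as one line of the JSONL training file.
with open("train.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(record) + "\n")
```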
JSONL datasets should be formatted similar to the [OpenAI dataset format](https://cookbook.openai.com/examples/chat_finetuning_data_prep).\n", + "\n", + "You can also simply use the [`winglian/pirate-ultrachat-10k`](https://huggingface.co/datasets/winglian/pirate-ultrachat-10k) dataset directly.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fGEEjyQ-r_IV" + }, + "outputs": [], + "source": [ + "# Default to HF dataset location\n", + "dataset_id = \"winglian/pirate-ultrachat-10k\"\n", + "uploaded = {}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c5MyYqk7vIsG" + }, + "outputs": [], + "source": [ + "import os\n", + "# Optionally, upload your own JSONL to your Google Drive\n", + "GOOGLE_DRIVE_PATH = \"\" # ex: \"MyDrive/Colab\\ Notebooks/train.jsonl\"\n", + "\n", + "# \"Select All\" permissions, or you may get the error:\n", + "# \"MessageError: Error: credential propagation was unsuccessful\"\n", + "if GOOGLE_DRIVE_PATH:\n", + " from google.colab import drive\n", + " # Mount your Google Drive\n", + " GOOGLE_DRIVE_MNT = \"/content/drive/\"\n", + " drive.mount(GOOGLE_DRIVE_MNT, force_remount=True)\n", + " tmp_path = os.path.join(GOOGLE_DRIVE_MNT, GOOGLE_DRIVE_PATH.lstrip(\"/\"))\n", + " # make sure file exists\n", + " if not os.path.isfile(tmp_path):\n", + " raise ValueError(f\"File {tmp_path} does not exist\")\n", + " dataset_id = tmp_path\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U6pTk3A9xj1W" + }, + "source": [ + "# Configure for Supervised Fine-Tuning (SFT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 151, + "referenced_widgets": [ + "388f618924274d21a066f098f4f1e744", + "7c95f85a2b1f47a1bd846d110c47bb3c", + "083f9cda8d754c168beee10d2f8955a2", + "62e1a65582f446a78612eaa804e08a7d", + "487a177d020f4605834878b2fdc7afa3", + "7fd44cf9ca6e4726bfd7ac21846d6a14", + "366a343b62fa47d8985a3bd464d99f9e", + "a0a11e929edd4189b79723d618522c33", + "e87ea87fcff247b5bbcc331ba79a8dc2", + "5e18768f7ad6434ba8b8b8a2e853e204", + "bb33aec33a6447078c31bfd728942994" + ] + }, + "id": "fdRioqytmTtX", + "outputId": "f0acdcec-4b41-4a3f-ffed-c2d2d929158e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-05-08 13:40:27,488] [INFO] [root.register:348] [PID:174] Attempting to load plugin: axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n", + "[2025-05-08 13:40:27,493] [INFO] [root.register:351] [PID:174] Plugin loaded successfully: axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n", + "[2025-05-08 13:40:27,959] [INFO] [axolotl.utils.schemas.config.check_eval_packing:721] [PID:174] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`\u001b[39m\n", + "[2025-05-08 13:40:27,960] [INFO] [axolotl.utils.schemas.config.hint_sample_packing_padding:514] [PID:174] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing\u001b[39m\n", + "[2025-05-08 13:40:27,961] [INFO] [axolotl.utils.schemas.config.check_bf16:1251] [PID:174] [RANK:0] bf16 support detected, but not enabled for this configuration.\u001b[39m\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "388f618924274d21a066f098f4f1e744", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "config.json: 0%| | 0.00/728 [00:00\"],\n", + " }\n", + " ],\n", + " dataloader_prefetch_factor = 8, # 
dataloader optimizations\n", + " dataloader_num_workers = 2,\n", + " dataloader_pin_memory = True,\n", + " )\n", + "\n", + "# validates the configuration\n", + "cfg = load_cfg(config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "715UpvnSoBIS" + }, + "outputs": [], + "source": [ + "from axolotl.utils import patch_optimized_env\n", + "# speedup downloads from HF 🤗 and set \"PYTORCH_CUDA_ALLOC_CONF\" env to save memory\n", + "patch_optimized_env()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Vc6MC-hwyH-n" + }, + "source": [ + "# Datasets\n", + "\n", + "Axolotl has a robust suite of loaders and transforms to parse most open datasets of any format into the appropriate chat template for your model. Axolotl will mask input tokens from the user's prompt so that the train loss is only calculated against the model's response. For more information, [see our documentation](http://docs.axolotl.ai/docs/dataset-formats/conversation.html) on dataset preparation.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "b82aa8c57f7c422a9a9c90f333ed2a99", + "c0991cf63ee6458b96e9a75e7a88b61a", + "71c8af139cd248b1b51101fd46a93f35", + "1d5117195d4b49eb8f1a73b18419f7ce", + "3c21e4a511b4441192c03b7f1d0976e9", + "ed28e2e0410d4e0b855467e798e53d66", + "d93f134f802b4b69b575bdaf07dbd27c", + "d0e9dce55cec4c1ca619a0ccf209d924", + "4c727d40ef0443449afc31724ee79f0c", + "0dea5caa27384f5689e3cab51f558727", + "a6f48410b9964fefba0c3009a77dc838", + "95caff42f08a4c2aa14c867b8f37f231", + "de7c37ee83e24f0c889e84d07279c2ec", + "9d4897eefb5f48259ffb2d23e332f752", + "253017b0d0534e54ab44e181f6d7c82d", + "27beaf06e41b472abdb544a43c720c5a", + "34cf3df51fbc41cabfdbba153c007f0e", + "ac764024cf1c4e08ba7749afd2cd20ac", + "30a81da86f8043eca301e86a8651201a", + "e8b7a81040904c1e89e58978223b1737", + "1c6f1f10667545aaab958016ba7e2c94", + "e6e969610738449887259063967f82b0", + "a138859f19b74fc0928dc236ab5359db", + "9b42e08b3c9548818488268768a118b1", + "12b56912736849fea2ad8124456fdc5c", + "879c8ab5873847a8833bd74123be90a4", + "20352e5f58d24bb8b1f3940efd14fe4a", + "d955dcaa0e944e719f3a06139dd54a03", + "d3de2662c7964f1ba96e58da382af720", + "97e36007e1304e1583fd81bfb13f0edd", + "c65dc74c7d6f4bab8f7dd28455161dd8", + "ef223e8504b64e3592589880326aaf41", + "598da69727bd4fb8b1caf465ac736d7a", + "5f86cd894de94c3280fadc1e2fd0ee13", + "a20927bf5f2c41f58c1e31ac858ab36c", + "0a46ad75c198463d843fb35e813642cb", + "09007681cf8d42aeb8c1d2f6a74e470a", + "ebc80d1a55fa47f4a5ea2756588569ec", + "1811cda0644e4190a9469d1774435d82", + "35c811d2ae8e43f3b5cecbdd3cfa857f", + "b8e39e4dddc3497fbc29ae45c66da759", + "63b4e563e85c4f03b1b72beda9577bcc", + "b195f160ca20442fadd8b5aed0ee41af", + "ca65e32eb52f48c09a84b33cb18f22cd", + "7cd0b85ebd204b7aba908417811ce4e0", + "7baeab52d6694c32b1efd1ea1a0a7782", + "519a7b154022443db6703f04a9142bae", + "d4183e9715f34d249942b8271cca3bdf", + "da2347ac94764a3fa2743343cf0d3cd2", + "93a44a11aa4846fa8efc6c1413ef1627", + "a55060adc3564407ac81ad7297d34aaa", + "d02274afd47b462291c745f261209d42", + "0f417447a7bd4a33acca96fa37aec877", + "63580b6fb30642479fe3000915bf551a", + "8f726dbfb45d4528afa33e36a6313267", + "03b093d592ba4386aa61f7b8483da660", + "b8766a88716948cf968f4563531a76d9", + "6f3a28b912714c6e931003549664bfa3", + "16d1283741404b7bb319094c992fce01", + "2a5bb0e818ab47be8cf6465988328503", + "2b3a2659b12244bd8548320320016dbf", + 
"0cd7efffbb3c4c4b972e63749f61ab97", + "5ca240f31e6b44e3882c5eb37cd5a309", + "5eb06edeb58e4930b1affef2a59eae81", + "a4e5789584564049b83df7c6c54a3e08", + "ff3a94b146a948b6907f5d80c7157f99", + "258b7c635c1045329d4669e48c46ccd5", + "6f68ed9889f54ad2ae8a3b95ac263a83", + "80366349d81e4dcc892db6cd56e384f3", + "c73055099c084dca996159e23e162d0b", + "977f799afaac4a55b2dc1cffa7d5b63b", + "41f3b32c2f6b4034ae7a3b9124e28bc7", + "a10d0a76010f4e508c65a9b69ebc5156", + "f8ef805b776145c3bfa9ba8d90972058", + "cc587493c33c4f118d1b1170f85be24c", + "e40d1c1ac9494b3bade9858324e7ffdf", + "d65b6b060d9845779299491ac5599c31", + "0f6907ebbc6242c8bde059cef1e1bd29", + "5bdfd87fc6cd4f9dabef7cfee29c8060", + "64f54d4a744a4627a07c3c0120276f3b", + "65b75b9b8bc143cf997796af68ff6668", + "d6fe74e4255444368f8f90a62157d869", + "4d468f96ec924681ad65eb671674b93e", + "ad7599de524549c48bf2d3124ad4b299", + "0546d04aae644dde846c58a4afb598a6", + "897b77a56c09479bb11d7f2a30997e55", + "81c3db71ac704280ad030072655f1537", + "042e091f75694c47aee761e760e76773", + "ef0a3c7a6f14460fb4da096928ae249e", + "07fb3a2c8315494e97b447e672dfae06", + "ec030fc3c346426f9abc3a89892258d3", + "e3fb3fc6afe04b3c9b7ac61809ce78fa", + "c3be9109d63c485d9c0ef4f9bc0f9218", + "12815f401eba44658caa7b2e490137a8", + "30e02aa2d0d241979369e598287f2639", + "dfd2a2649b8341ef913207526708aff1", + "4f1977d7e4824ef1a14b65f0f42bba10", + "c6164e05a1914ae48083db9ad7f4ef7c", + "813621384dc748b0ad06775e22761c0b", + "dc892a596f6942d7973c616c38f0eebb", + "c84cc07789be48aebb322c23d355289e", + "bed8726b8069434687c75452e21f19e5", + "16a188a0b06d45f980dcf3933509fe0a", + "60c1a0d765c14a1d888317e6a507e4ea", + "0077aedc3d174560bce924ee89e9c006", + "00321cce58884f6f9b3855a21fcd9187", + "fa864b41586f4a7aa56aeafd1d84eb75", + "3225603166b54e7aab766b9964a2f660", + "349eee9f56d64f0cba6fc24ff2c50c9b", + "7e5d3774060e4589aa65982da5ea4ef4", + "7c2485c6cdfe463da6fdb35982a1070d", + "ad1236893754446881e153adc9d5c962", + "daee63fd167e4441a32324b51b00ad2b", + "fe41858c6bd04c58840112b67c19a336", + "d262c82138024169b9f3aa034ca756fa", + "62e302ebdad64aada0ffe64ae1c873f3", + "bd1b0dfed6d34d16af33a4a58330f5ec", + "d07c8b97d3314f1c852e44bdd40f61ed", + "ebb69a2c3d0a4299a484698287b3087c", + "e5a82df528bb4e408797a3b6c2758f4a", + "f113ebd8c1c34806bea4dd7ed3035173" + ] + }, + "id": "KQQhgK8FoDfF", + "outputId": "f69441d8-95f9-4885-c306-6c8709090ff6" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b82aa8c57f7c422a9a9c90f333ed2a99", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/9.68k [00:00\u001b[39m\n", + "[2025-05-08 13:41:00,845] [DEBUG] [axolotl.utils.models.load_tokenizer:442] [PID:174] [RANK:0] BOS: None / None\u001b[39m\n", + "[2025-05-08 13:41:00,846] [DEBUG] [axolotl.utils.models.load_tokenizer:443] [PID:174] [RANK:0] PAD: 151643 / <|endoftext|>\u001b[39m\n", + "[2025-05-08 13:41:00,847] [DEBUG] [axolotl.utils.models.load_tokenizer:444] [PID:174] [RANK:0] UNK: None / None\u001b[39m\n", + "[2025-05-08 13:41:00,869] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:271] [PID:174] [RANK:0] Unable to find prepared dataset in last_run_prepared/97037817611d38b3a9c681753c3c4c95\u001b[39m\n", + "[2025-05-08 13:41:00,870] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:272] [PID:174] [RANK:0] Loading raw datasets...\u001b[39m\n", + "\u001b[33m[2025-05-08 13:41:00,870] [WARNING] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:274] [PID:174] [RANK:0] Processing datasets during 
training can lead to VRAM instability. Please pre-process your dataset.\u001b[39m\n", + "[2025-05-08 13:41:00,871] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:281] [PID:174] [RANK:0] No seed provided, using default seed of 42\u001b[39m\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7cd0b85ebd204b7aba908417811ce4e0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "train.jsonl: 0%| | 0.00/27.3M [00:00system\\n' }}\n", + " {%- if messages[0].role == 'system' %}\n", + " {{- messages[0].content + '\\n\\n' }}\n", + " {%- endif %}\n", + " {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n", + " {%- for tool in tools %}\n", + " {{- \"\\n\" }}\n", + " {{- tool | tojson }}\n", + " {%- endfor %}\n", + " {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n", + "{%- else %}\n", + " {%- if messages[0].role == 'system' %}\n", + " {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n", + " {%- endif %}\n", + "{%- endif %}\n", + "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n", + "{%- for message in messages[::-1] %}\n", + " {%- set index = (messages|length - 1) - loop.index0 %}\n", + " {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('') and message.content.endswith('')) %}\n", + " {%- set ns.multi_step_tool = false %}\n", + " {%- set ns.last_query_index = index %}\n", + " {%- endif %}\n", + "{%- endfor %}\n", + "{%- for message in messages %}\n", + " {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n", + " {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n", + " {%- elif message.role == \"assistant\" %}\n", + " {%- set content = message.content %}\n", + " {%- set reasoning_content = '' %}\n", + " {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n", + " {%- set reasoning_content = message.reasoning_content %}\n", + " {%- else %}\n", + " {%- if '' in message.content %}\n", + " {%- set content = message.content.split('')[-1].lstrip('\\n') %}\n", + " {%- set reasoning_content = message.content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n", + " {%- endif %}\n", + " {%- endif %}\n", + " {%- if loop.index0 > ns.last_query_index %}\n", + " {%- if loop.last or (not loop.last and reasoning_content) %}\n", + " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content.strip('\\n') + '\\n\\n\\n' + content.lstrip('\\n') }}\n", + " {%- else %}\n", + " {{- '<|im_start|>' + message.role + '\\n' + content }}\n", + " {%- endif %}\n", + " {%- else %}\n", + " {{- '<|im_start|>' + message.role + '\\n' + content }}\n", + " {%- endif %}\n", + " {%- if message.tool_calls %}\n", + " {%- for tool_call in message.tool_calls %}\n", + " {%- if (loop.first and content) or (not loop.first) %}\n", + " {{- '\\n' }}\n", + " {%- endif %}\n", + " {%- if tool_call.function %}\n", + " {%- set tool_call = tool_call.function %}\n", + " {%- endif %}\n", + " {{- '\\n{\"name\": \"' }}\n", + " {{- tool_call.name }}\n", + " {{- '\", \"arguments\": ' }}\n", + " {%- if tool_call.arguments is string %}\n", + " {{- tool_call.arguments }}\n", + " {%- else %}\n", + " {{- tool_call.arguments | tojson }}\n", + " 
{%- endif %}\n", + " {{- '}\\n' }}\n", + " {%- endfor %}\n", + " {%- endif %}\n", + " {{- '<|im_end|>\\n' }}\n", + " {%- elif message.role == \"tool\" %}\n", + " {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n", + " {{- '<|im_start|>user' }}\n", + " {%- endif %}\n", + " {{- '\\n\\n' }}\n", + " {{- message.content }}\n", + " {{- '\\n' }}\n", + " {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n", + " {{- '<|im_end|>\\n' }}\n", + " {%- endif %}\n", + " {%- endif %}\n", + "{%- endfor %}\n", + "{%- if add_generation_prompt %}\n", + " {{- '<|im_start|>assistant\\n' }}\n", + " {%- if enable_thinking is defined and enable_thinking is false %}\n", + " {{- '\\n\\n\\n\\n' }}\n", + " {%- endif %}\n", + "{%- endif %}\n", + "---\u001b[39m\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "258b7c635c1045329d4669e48c46ccd5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Tokenizing Prompts (num_proc=2): 0%| | 0/9985 [00:00\n", + " \n", + " \n", + " [25/25 09:25, Epoch 0/1]\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Step | Training Loss
   1 | 1.092300
   2 | 1.554200
   3 | 1.041400
   4 | 1.733800
   5 | 1.430000
   6 | 1.258500
   7 | 1.343600
   8 | 1.101700
   9 | 1.086500
  10 | 0.813200
  11 | 0.689600
  12 | 0.826700
  13 | 1.541800
  14 | 0.948000
  15 | 1.357000
  16 | 1.085800
  17 | 1.516800
  18 | 1.146800
  19 | 0.834800
  20 | 0.968000
  21 | 1.388800
  22 | 1.511500
  23 | 1.338500
  24 | 1.206600
  25 | 1.504600

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-05-07 22:12:42,746] [INFO] [axolotl.callbacks.on_step_end:128] [PID:1336] [RANK:0] cuda memory usage while training: 9.768GB (+3.287GB cache, +0.646GB misc)\u001b[39m\n", + "[2025-05-07 22:21:46,859] [INFO] [axolotl.train.save_trained_model:231] [PID:1336] [RANK:0] Training completed! Saving pre-trained model to ./outputs/qwen-sft-pirate-rrr.\u001b[39m\n" + ] + } + ], + "source": [ + "from axolotl.train import train\n", + "\n", + "# just train the first 25 steps for demo.\n", + "# This is sufficient to align the model as we've used packing to maximize the trainable samples per step.\n", + "cfg.max_steps = 25\n", + "model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "j1b9ypF78eCb" + }, + "source": [ + "# Inferencing the trained model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "r3_vHhif8YEs", + "outputId": "e5050605-f6c9-421c-98f9-bde56a281eae" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ahoy there, matey! Shiver me timbers, ye be lookin' for the Pythagorean theorem, eh? Well, hold yer horses and listen up, for I'll be tellin' ye all about it in me own special way.\n", + "\n", + "The Pythagorean theorem be a real gem of a mathematical trick that helps ye find the length of a side of a right triangle. Now, a right triangle be a triangle with a right angle, which be that little corner that looks like a square. \n", + "\n", + "The theorem be named after a clever fellow named Pythagoras, who be a mathematician from ancient Greece. He discovered that if ye have a right triangle, the square of the length of the hypotenuse (that be the side opposite the right angle) be equal to the sum of the squares of the other two sides. 
\n", + "\n", + "In other words, if ye have a triangle with sides of length a, b, and c (\n" + ] + } + ], + "source": [ + "import torch\n", + "from transformers import TextStreamer\n", + "\n", + "messages = [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"Explain the Pythagorean theorem to me.\",\n", + " },\n", + "]\n", + "\n", + "prompt = tokenizer.apply_chat_template(\n", + " messages,\n", + " add_generation_prompt=True,\n", + " tokenize=False,\n", + " enable_thinking = False,\n", + ")\n", + "\n", + "outputs = model.generate(\n", + " **tokenizer(prompt, return_tensors = \"pt\").to(\"cuda\"),\n", + " max_new_tokens = 192,\n", + " temperature = 1.0, top_p = 0.8, top_k = 32,\n", + " streamer = TextStreamer(tokenizer, skip_prompt = True),\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HoGwT2JRSIjA" + }, + "source": [ + "# Saving your trained model\n", + "\n", + "Axolotl automatically saves checkpoints to the `output_dir` path.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5BmSbiy6NaaS", + "outputId": "f5e1d913-7d55-42d2-8340-f9f1b0bc2b38" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 506M\n", + "-rw-r--r-- 1 root root 845 May 7 22:21 adapter_config.json\n", + "-rw-r--r-- 1 root root 491M May 7 22:21 adapter_model.safetensors\n", + "-rw-r--r-- 1 root root 707 May 7 22:11 added_tokens.json\n", + "drwxr-xr-x 2 root root 4.0K May 7 22:17 checkpoint-13\n", + "drwxr-xr-x 2 root root 4.0K May 7 22:21 checkpoint-25\n", + "-rw-r--r-- 1 root root 1.2K May 7 22:11 config.json\n", + "-rw-r--r-- 1 root root 1.6M May 7 22:11 merges.txt\n", + "-rw-r--r-- 1 root root 2.6K May 7 22:21 README.md\n", + "-rw-r--r-- 1 root root 613 May 7 22:11 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 9.5K May 7 22:11 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 11M May 7 22:11 tokenizer.json\n", + "-rw-r--r-- 1 root root 2.7M May 7 22:11 vocab.json\n" + ] + } + ], + "source": [ + "# Show the saved checkpoints in the output_dir\n", + "!ls -lh \"./outputs/qwen-sft-pirate-rrr\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_PCIFWxuOZd6" + }, + "source": [ + "Setting `hub_model_id: ` in the original config would have automatically uploaded the model to HuggingFace Hub (e.g. `hub_model_id: username/model_id`)\n", + "\n", + "If you prefer to manually upload the training artifacts, we can still upload the entire final checkpoint to HuggingFace from the CLI." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 955, + "referenced_widgets": [ + "c12ea43372ac4d57bb9605f1a429b397", + "86816687746246b4a6105e8010384e25", + "6f05e9bebf7b40c9835808e77de6c236", + "c7433acd3c4841e6958ae8f7e87b1808", + "19c1e38389fa46c7b7e2152a56e1df34", + "0e067d8db8ed48308a718d5f57683fd1", + "131065f118274a1586ac38e39ed84ef0", + "8640ac440fbc4644b9a3af7ba3ae7183", + "5cea7996f02040b187ece0bb2d6a8d1f", + "2e257c8be2da40b4bb67a9e4ab6811f3", + "56e3768bef5a4b9db4168c5c17f509c2", + "62c028fdef904dedb9cdeca2b3bda725", + "a7cf477e80fc43e0ad82c7997b076dce", + "835bcc28a5564fb9b3d651bc8e32dc46", + "9f1c9a0695384bdaa6f8b847ef89bee8", + "b1bea589efa14258a9982071b87938bf", + "590eef89881545aa8bbef9a8bbe7fb00", + "4b1f04ff63d14a118fdd15814dff50e4", + "39789237703c4a418134243055c9cbf5", + "a3a945817f684328b34651fe052393ec" + ] + }, + "id": "2yw8pLvlSMl8", + "outputId": "6e489ab2-4abe-4e28-84ca-959f912433a4" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c12ea43372ac4d57bb9605f1a429b397", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HTML(value='

\n", + " sys.exit(main())\n", + " ^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/huggingface_cli.py\", line 57, in main\n", + " service.run()\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/upload.py\", line 207, in run\n", + " print(self._upload())\n", + " ^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/upload.py\", line 302, in _upload\n", + " return self.api.upload_folder(\n", + " ^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n", + " return fn(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 1633, in _inner\n", + " return fn(self, *args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4942, in upload_folder\n", + " commit_info = self.create_commit(\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n", + " return fn(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 1633, in _inner\n", + " return fn(self, *args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4202, in create_commit\n", + " self.preupload_lfs_files(\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4483, in preupload_lfs_files\n", + " _upload_xet_files(**upload_kwargs, create_pr=create_pr) # type: ignore [arg-type]\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n", + " return fn(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/_commit_api.py\", line 592, in _upload_xet_files\n", + " with progress_cm as progress:\n", + " File \"/usr/local/lib/python3.11/dist-packages/tqdm/std.py\", line 1138, in __exit__\n", + " def __exit__(self, exc_type, exc_value, traceback):\n", + "\n", + "KeyboardInterrupt\n", + "^C\n" + ] + } + ], + "source": [ + "from huggingface_hub import notebook_login\n", + "# remove the partial epoch checkpoints\n", + "!rm -rf \"./outputs/qwen-sft-pirate-rrr/checkpoint-*\"\n", + "\n", + "# HF Notebook login widget\n", + "notebook_login()\n", + "\n", + "# upload the LoRA adapter for your model to HF, remember to update the username/model-name below\n", + "!huggingface-cli upload --repo-type=model winglian/pirate-qwen-14B \"./outputs/qwen-sft-pirate-rrr\"" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "00321cce58884f6f9b3855a21fcd9187": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "StyleView", + "description_width": "" + } + }, + "004d9177a6a14118a5930dc3cc13147b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a80410b919e442c49aea15acc1ce1a72", + "IPY_MODEL_c6e00f5224364822bc4239b176686919", + "IPY_MODEL_ec11d1e5ae7b42c883d9b1f38a65356e" + ], + "layout": "IPY_MODEL_734185351eb543fa9a00a881dcbb9fe7" + } + }, + "0077aedc3d174560bce924ee89e9c006": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "03a3c744d716431488163b4358b80f92": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "03b093d592ba4386aa61f7b8483da660": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, 
+ "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_b8766a88716948cf968f4563531a76d9", + "IPY_MODEL_6f3a28b912714c6e931003549664bfa3", + "IPY_MODEL_16d1283741404b7bb319094c992fce01" + ], + "layout": "IPY_MODEL_2a5bb0e818ab47be8cf6465988328503" + } + }, + "042e091f75694c47aee761e760e76773": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0546d04aae644dde846c58a4afb598a6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "054c8dffadba48c6b895a6cc62448ecc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "07fb3a2c8315494e97b447e672dfae06": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_12815f401eba44658caa7b2e490137a8", + "placeholder": "​", + "style": "IPY_MODEL_30e02aa2d0d241979369e598287f2639", + "value": "Drop Samples with Zero Trainable Tokens (num_proc=2): 100%" + } + }, + "083f9cda8d754c168beee10d2f8955a2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a0a11e929edd4189b79723d618522c33", + "max": 728, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e87ea87fcff247b5bbcc331ba79a8dc2", + "value": 728 + } + }, + "09007681cf8d42aeb8c1d2f6a74e470a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b195f160ca20442fadd8b5aed0ee41af", + "placeholder": "​", + "style": "IPY_MODEL_ca65e32eb52f48c09a84b33cb18f22cd", + "value": " 11.4M/11.4M [00:00<00:00, 21.8MB/s]" + } + }, + "0a46ad75c198463d843fb35e813642cb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b8e39e4dddc3497fbc29ae45c66da759", + "max": 11422654, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_63b4e563e85c4f03b1b72beda9577bcc", + "value": 11422654 + } + }, + "0aa8ab56b85f4171a79c3bc210594025": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0b4c9753a7cb4354b8e5f187e6e1ad7c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0cd7efffbb3c4c4b972e63749f61ab97": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0dea5caa27384f5689e3cab51f558727": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0e067d8db8ed48308a718d5f57683fd1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b1bea589efa14258a9982071b87938bf", + "placeholder": "​", + "style": "IPY_MODEL_590eef89881545aa8bbef9a8bbe7fb00", + "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks.
" + } + }, + "0e50870ed0c643e0b6c18cc5d7ddae7f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bfcdbba993b74972a9e3e575f86908ff", + "placeholder": "​", + "style": "IPY_MODEL_6ebb2ec171414e47a14765505f64bb3c", + "value": " 3.84G/3.84G [00:09<00:00, 664MB/s]" + } + }, + "0e936d9dbf9c4fdd86bbfe9730dedc47": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0f417447a7bd4a33acca96fa37aec877": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0f480e3a0b0a45d2a2d2dec3cad923f3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0f6907ebbc6242c8bde059cef1e1bd29": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5bdfd87fc6cd4f9dabef7cfee29c8060", + "IPY_MODEL_64f54d4a744a4627a07c3c0120276f3b", + "IPY_MODEL_65b75b9b8bc143cf997796af68ff6668" + ], + "layout": 
"IPY_MODEL_d6fe74e4255444368f8f90a62157d869" + } + }, + "114dece49dba437c8572ef94b23c3b1e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "12815f401eba44658caa7b2e490137a8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "12b56912736849fea2ad8124456fdc5c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_97e36007e1304e1583fd81bfb13f0edd", + "max": 1671853, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_c65dc74c7d6f4bab8f7dd28455161dd8", + "value": 1671853 + } + }, + "131065f118274a1586ac38e39ed84ef0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": "center", + "align_self": null, + "border": null, + "bottom": null, + "display": "flex", + "flex": null, + "flex_flow": "column", + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "50%" + } + }, + "158c8b85dbf34de6a94b4e35e2fc7d5a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "16a188a0b06d45f980dcf3933509fe0a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_349eee9f56d64f0cba6fc24ff2c50c9b", + "placeholder": "​", + "style": "IPY_MODEL_7e5d3774060e4589aa65982da5ea4ef4", + "value": " 9985/9985 [00:04<00:00, 2604.11 examples/s]" + } + }, + "16d1283741404b7bb319094c992fce01": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a4e5789584564049b83df7c6c54a3e08", + "placeholder": "​", + "style": "IPY_MODEL_ff3a94b146a948b6907f5d80c7157f99", + "value": " 9985/0 
[00:00<00:00, 50763.46 examples/s]" + } + }, + "1811cda0644e4190a9469d1774435d82": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "18357b321ce44d7b8bd9d1c886f69275": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e366ae3fceec4566b9ed303d6c5f90af", + "placeholder": "​", + "style": "IPY_MODEL_5dd7d150dbe04f08b165ce7f2c27cd11", + "value": "model-00008-of-00008.safetensors: 100%" + } + }, + "19127c7bb1554ccbac877059f9a82db0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e400cbf14bcc446a9d33b210cd93550b", + "max": 3963750880, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_71002199df6b40c9a1ac40df5fb27a1b", + "value": 3963750502 + } + }, + "19c1e38389fa46c7b7e2152a56e1df34": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ButtonView", + "button_style": "", + "description": "Login", + "disabled": false, + "icon": "", + "layout": "IPY_MODEL_835bcc28a5564fb9b3d651bc8e32dc46", + "style": "IPY_MODEL_9f1c9a0695384bdaa6f8b847ef89bee8", + "tooltip": "" + } + }, + "1bec6297c90242a88672d195bc09d429": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1c6f1f10667545aaab958016ba7e2c94": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1d5117195d4b49eb8f1a73b18419f7ce": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0dea5caa27384f5689e3cab51f558727", + "placeholder": "​", + "style": "IPY_MODEL_a6f48410b9964fefba0c3009a77dc838", + "value": " 9.68k/9.68k [00:00<00:00, 812kB/s]" + } + }, + "1f7d30f71bbd4547a9150d21da071055": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + 
"flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "200df5e79b9244849e589ecb0250a520": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f4a1795dc7514a718f478245f521f0ba", + "placeholder": "​", + "style": "IPY_MODEL_5e746eb25bbe416fb585fa24e79f5177", + "value": "model-00002-of-00008.safetensors: 100%" + } + }, + "20352e5f58d24bb8b1f3940efd14fe4a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "253017b0d0534e54ab44e181f6d7c82d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1c6f1f10667545aaab958016ba7e2c94", + "placeholder": "​", + "style": "IPY_MODEL_e6e969610738449887259063967f82b0", + "value": " 2.78M/2.78M [00:00<00:00, 17.8MB/s]" + } + }, + "258b7c635c1045329d4669e48c46ccd5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_6f68ed9889f54ad2ae8a3b95ac263a83", + "IPY_MODEL_80366349d81e4dcc892db6cd56e384f3", + "IPY_MODEL_c73055099c084dca996159e23e162d0b" + ], + "layout": "IPY_MODEL_977f799afaac4a55b2dc1cffa7d5b63b" + } + }, + "279937fe03bc4e4eb25b472d7e9df163": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b634bb73cfa743d09a5999101b840976", + "max": 1912371880, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_742b1030acfd414bbd9d5327b7e3826d", + "value": 1912371698 + } + }, + "27beaf06e41b472abdb544a43c720c5a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2860e3bb3baf4f7da058465850e800c5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_3efd18ea8eaa41918894883da9541bfa", + "IPY_MODEL_e09f1bcbb9d94c09be53e5e1303642c2", + "IPY_MODEL_82177df57a494de8900c14c2f5185175" + ], + "layout": "IPY_MODEL_ccfcdc95baf646f8aeb3d516742383f2" + } + }, + "2a51b36be41745468e4c2d7a21b1c0d2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + 
"align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2a5bb0e818ab47be8cf6465988328503": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2b3a2659b12244bd8548320320016dbf": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2e257c8be2da40b4bb67a9e4ab6811f3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + 
"_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2e2b0c1599c341a198f632f46a40c90e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_be724f04b03942b2a033a7e8898bb4fd", + "placeholder": "​", + "style": "IPY_MODEL_fcbab4d8dced41a18dfccce81e3a45a0", + "value": "model-00005-of-00008.safetensors: 100%" + } + }, + "3036608c71904ce9ae4bb2a9fa8802d9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5ca6be24acb548cea130bd58e9954c7c", + "placeholder": "​", + "style": "IPY_MODEL_5cfb02ee044b4011a378efa8b54a370f", + "value": " 3.96G/3.96G [00:10<00:00, 531MB/s]" + } + }, + "30a81da86f8043eca301e86a8651201a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": 
null, + "visibility": null, + "width": null + } + }, + "30e02aa2d0d241979369e598287f2639": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3225603166b54e7aab766b9964a2f660": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "33b3b1d0295646edaac7b4822761aeb0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "349eee9f56d64f0cba6fc24ff2c50c9b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "34c9c0137b504cd799c6bd6de69507c2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, 
+ "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "34cf3df51fbc41cabfdbba153c007f0e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "35c811d2ae8e43f3b5cecbdd3cfa857f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "35cc989ca3374e7dba0cb166febc4bde": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "366a343b62fa47d8985a3bd464d99f9e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "37de928300e34184881039378bd75e7f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + 
"border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "388f618924274d21a066f098f4f1e744": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7c95f85a2b1f47a1bd846d110c47bb3c", + "IPY_MODEL_083f9cda8d754c168beee10d2f8955a2", + "IPY_MODEL_62e1a65582f446a78612eaa804e08a7d" + ], + "layout": "IPY_MODEL_487a177d020f4605834878b2fdc7afa3" + } + }, + "39789237703c4a418134243055c9cbf5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3aaecbf540f54a2db9ab0931e3b1fe57": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + 
"justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3c21e4a511b4441192c03b7f1d0976e9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3efd18ea8eaa41918894883da9541bfa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8f5bd719974e41c3a8dd9a5b0d3d71e6", + "placeholder": "​", + "style": "IPY_MODEL_b87c84de30e84b3abf4871461fb9cbd3", + "value": "Loading checkpoint shards: 100%" + } + }, + "41f3b32c2f6b4034ae7a3b9124e28bc7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + 
} + }, + "4471ff62258549fba9514bb67050f965": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9cd5211b5d8b457aa0002f1d17b80028", + "IPY_MODEL_19127c7bb1554ccbac877059f9a82db0", + "IPY_MODEL_f4667818b9d34a09891cd727a429a610" + ], + "layout": "IPY_MODEL_9ed02dc43412471a9ab47f3620ccf3a5" + } + }, + "4540927d98f54466b434ba4c0edf045d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "487a177d020f4605834878b2fdc7afa3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4b1f04ff63d14a118fdd15814dff50e4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_39789237703c4a418134243055c9cbf5", + "placeholder": "​", + "style": "IPY_MODEL_a3a945817f684328b34651fe052393ec", + "value": "Connecting..." 
+ } + }, + "4b27c267393640f28f6eae0875bd2ed9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4c727d40ef0443449afc31724ee79f0c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4d05314858354e729d76094b3b0ce761": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c42acf646f344a88b8c11f81e67f7206", + "IPY_MODEL_7be6f04c284e4326bb4ff3d301e7b3c6", + "IPY_MODEL_ffdbb12a2f2c4d14911685e7683e0ef0" + ], + "layout": "IPY_MODEL_bee3501b2a17427784a717e50a85e7fa" + } + }, + "4d468f96ec924681ad65eb671674b93e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + 
"overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4f1977d7e4824ef1a14b65f0f42bba10": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4fd114abe9f5494ab59858949f5055f1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "500e272208a246089613bf788a165271": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_200df5e79b9244849e589ecb0250a520", + "IPY_MODEL_cc94432d08464affa3e58b560bdad194", + "IPY_MODEL_3036608c71904ce9ae4bb2a9fa8802d9" + ], + "layout": "IPY_MODEL_adacfdcc1b0140efac56918e9ccf064e" + } + }, + "519a7b154022443db6703f04a9142bae": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d02274afd47b462291c745f261209d42", + "max": 27341251, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0f417447a7bd4a33acca96fa37aec877", + "value": 27341251 + } + }, + "56e3768bef5a4b9db4168c5c17f509c2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "590eef89881545aa8bbef9a8bbe7fb00": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "598da69727bd4fb8b1caf465ac736d7a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": 
"1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5bdfd87fc6cd4f9dabef7cfee29c8060": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4d468f96ec924681ad65eb671674b93e", + "placeholder": "​", + "style": "IPY_MODEL_ad7599de524549c48bf2d3124ad4b299", + "value": "Dropping Long Sequences (num_proc=2): 100%" + } + }, + "5ca240f31e6b44e3882c5eb37cd5a309": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "20px" + } + }, + "5ca6be24acb548cea130bd58e9954c7c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"5cea7996f02040b187ece0bb2d6a8d1f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5cfb02ee044b4011a378efa8b54a370f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5dd7d150dbe04f08b165ce7f2c27cd11": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5e18768f7ad6434ba8b8b8a2e853e204": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5e5e15b0569b474c9620083b3ec6af55": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5e746eb25bbe416fb585fa24e79f5177": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + 
"description_width": "" + } + }, + "5eb06edeb58e4930b1affef2a59eae81": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "5f86cd894de94c3280fadc1e2fd0ee13": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a20927bf5f2c41f58c1e31ac858ab36c", + "IPY_MODEL_0a46ad75c198463d843fb35e813642cb", + "IPY_MODEL_09007681cf8d42aeb8c1d2f6a74e470a" + ], + "layout": "IPY_MODEL_ebc80d1a55fa47f4a5ea2756588569ec" + } + }, + "60c1a0d765c14a1d888317e6a507e4ea": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "62c028fdef904dedb9cdeca2b3bda725": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + 
"overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "62e1a65582f446a78612eaa804e08a7d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5e18768f7ad6434ba8b8b8a2e853e204", + "placeholder": "​", + "style": "IPY_MODEL_bb33aec33a6447078c31bfd728942994", + "value": " 728/728 [00:00<00:00, 20.3kB/s]" + } + }, + "62e302ebdad64aada0ffe64ae1c873f3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "63580b6fb30642479fe3000915bf551a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "63b4e563e85c4f03b1b72beda9577bcc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": 
"1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "64f54d4a744a4627a07c3c0120276f3b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0546d04aae644dde846c58a4afb598a6", + "max": 9985, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_897b77a56c09479bb11d7f2a30997e55", + "value": 9985 + } + }, + "65b75b9b8bc143cf997796af68ff6668": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_81c3db71ac704280ad030072655f1537", + "placeholder": "​", + "style": "IPY_MODEL_042e091f75694c47aee761e760e76773", + "value": " 9985/9985 [00:02<00:00, 3977.47 examples/s]" + } + }, + "67da6c4260574869aa24c3cbc1bc1654": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6932489232ec4ab18a160b1e7fbcdfe1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + 
"grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6ebb2ec171414e47a14765505f64bb3c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6f05e9bebf7b40c9835808e77de6c236": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "PasswordModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "PasswordModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "PasswordView", + "continuous_update": true, + "description": "Token:", + "description_tooltip": null, + "disabled": false, + "layout": "IPY_MODEL_2e257c8be2da40b4bb67a9e4ab6811f3", + "placeholder": "​", + "style": "IPY_MODEL_56e3768bef5a4b9db4168c5c17f509c2", + "value": "" + } + }, + "6f3a28b912714c6e931003549664bfa3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5ca240f31e6b44e3882c5eb37cd5a309", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_5eb06edeb58e4930b1affef2a59eae81", + "value": 1 + } + }, + "6f68ed9889f54ad2ae8a3b95ac263a83": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_41f3b32c2f6b4034ae7a3b9124e28bc7", + "placeholder": "​", + "style": "IPY_MODEL_a10d0a76010f4e508c65a9b69ebc5156", + "value": "Tokenizing Prompts (num_proc=2): 100%" + } + }, + "704f2f5a9b1c49d5a75a0025a5dda11b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", 
+ "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "71002199df6b40c9a1ac40df5fb27a1b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "71c8af139cd248b1b51101fd46a93f35": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d0e9dce55cec4c1ca619a0ccf209d924", + "max": 9675, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4c727d40ef0443449afc31724ee79f0c", + "value": 9675 + } + }, + "734185351eb543fa9a00a881dcbb9fe7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "735d4f225b24414294fc1b213c61223c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + 
"min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "742b1030acfd414bbd9d5327b7e3826d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "77304d1a46b3468a98483e02ec0ac4a4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7baeab52d6694c32b1efd1ea1a0a7782": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_93a44a11aa4846fa8efc6c1413ef1627", + "placeholder": "​", + "style": "IPY_MODEL_a55060adc3564407ac81ad7297d34aaa", + "value": "train.jsonl: 100%" + } + }, + "7be6f04c284e4326bb4ff3d301e7b3c6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9503a45960984adc97b58e16c50662e0", + "max": 3963750880, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_da6e93f3e4984780b930fe7a706983ea", + "value": 3963750502 + } + }, + "7c2485c6cdfe463da6fdb35982a1070d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + 
"_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ad1236893754446881e153adc9d5c962", + "IPY_MODEL_daee63fd167e4441a32324b51b00ad2b", + "IPY_MODEL_fe41858c6bd04c58840112b67c19a336" + ], + "layout": "IPY_MODEL_d262c82138024169b9f3aa034ca756fa" + } + }, + "7c95f85a2b1f47a1bd846d110c47bb3c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7fd44cf9ca6e4726bfd7ac21846d6a14", + "placeholder": "​", + "style": "IPY_MODEL_366a343b62fa47d8985a3bd464d99f9e", + "value": "config.json: 100%" + } + }, + "7cd0b85ebd204b7aba908417811ce4e0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7baeab52d6694c32b1efd1ea1a0a7782", + "IPY_MODEL_519a7b154022443db6703f04a9142bae", + "IPY_MODEL_d4183e9715f34d249942b8271cca3bdf" + ], + "layout": "IPY_MODEL_da2347ac94764a3fa2743343cf0d3cd2" + } + }, + "7e5d3774060e4589aa65982da5ea4ef4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7fd44cf9ca6e4726bfd7ac21846d6a14": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + 
"width": null + } + }, + "80366349d81e4dcc892db6cd56e384f3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f8ef805b776145c3bfa9ba8d90972058", + "max": 9985, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_cc587493c33c4f118d1b1170f85be24c", + "value": 9985 + } + }, + "813621384dc748b0ad06775e22761c0b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "81c3db71ac704280ad030072655f1537": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "82177df57a494de8900c14c2f5185175": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_67da6c4260574869aa24c3cbc1bc1654", + "placeholder": "​", + "style": "IPY_MODEL_94b9088614464f60a203de39dbcae853", + "value": " 8/8 [01:47<00:00, 11.64s/it]" + } + }, + "823f1c78f15043e38bbd4dca3932a86a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": 
"1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_03a3c744d716431488163b4358b80f92", + "max": 239, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_a5434ee714f9498d83870544b67c0cb7", + "value": 239 + } + }, + "835bcc28a5564fb9b3d651bc8e32dc46": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8640ac440fbc4644b9a3af7ba3ae7183": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "86816687746246b4a6105e8010384e25": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8640ac440fbc4644b9a3af7ba3ae7183", + "placeholder": "​", + "style": "IPY_MODEL_5cea7996f02040b187ece0bb2d6a8d1f", + "value": "

Copy a token from your Hugging Face\ntokens page and paste it below. Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" + } + }, + "879c8ab5873847a8833bd74123be90a4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ef223e8504b64e3592589880326aaf41", + "placeholder": "​", + "style": "IPY_MODEL_598da69727bd4fb8b1caf465ac736d7a", + "value": " 1.67M/1.67M [00:00<00:00, 19.0MB/s]" + } + }, + "897b77a56c09479bb11d7f2a30997e55": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8bc9d8ba866c442b9118d9630009939c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8c4d4fc5a30f4e7cb3be53fe2adda33d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + 
"overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8f5bd719974e41c3a8dd9a5b0d3d71e6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8f726dbfb45d4528afa33e36a6313267": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9327977822be4b1294f80e876552e305": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_37de928300e34184881039378bd75e7f", + "placeholder": "​", + "style": "IPY_MODEL_0e936d9dbf9c4fdd86bbfe9730dedc47", + "value": " 3.96G/3.96G [00:13<00:00, 273MB/s]" + } + }, + "936d04b5fe1b4c63bf0b080e423d051b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "93a44a11aa4846fa8efc6c1413ef1627": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "94b9088614464f60a203de39dbcae853": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9503a45960984adc97b58e16c50662e0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "95caff42f08a4c2aa14c867b8f37f231": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": 
"1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_de7c37ee83e24f0c889e84d07279c2ec", + "IPY_MODEL_9d4897eefb5f48259ffb2d23e332f752", + "IPY_MODEL_253017b0d0534e54ab44e181f6d7c82d" + ], + "layout": "IPY_MODEL_27beaf06e41b472abdb544a43c720c5a" + } + }, + "977f799afaac4a55b2dc1cffa7d5b63b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "97e36007e1304e1583fd81bfb13f0edd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9858cb74a09748a39e8149baac96702c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9b42e08b3c9548818488268768a118b1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d955dcaa0e944e719f3a06139dd54a03", + "placeholder": "​", + "style": "IPY_MODEL_d3de2662c7964f1ba96e58da382af720", + "value": "merges.txt: 100%" + } + }, + "9cd5211b5d8b457aa0002f1d17b80028": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6932489232ec4ab18a160b1e7fbcdfe1", + "placeholder": "​", + "style": "IPY_MODEL_4540927d98f54466b434ba4c0edf045d", + "value": "model-00007-of-00008.safetensors: 100%" + } + }, + "9d4897eefb5f48259ffb2d23e332f752": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_30a81da86f8043eca301e86a8651201a", + "max": 2776833, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e8b7a81040904c1e89e58978223b1737", + "value": 2776833 + } + }, + "9e333ed3b5014069ac1dd969255dd591": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9ed02dc43412471a9ab47f3620ccf3a5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9f1c9a0695384bdaa6f8b847ef89bee8": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "button_color": null, + "font_weight": "" + } + }, + "9f56a2d9979c4bd8928c644c22c3ecdf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a0a11e929edd4189b79723d618522c33": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a10d0a76010f4e508c65a9b69ebc5156": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a138859f19b74fc0928dc236ab5359db": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9b42e08b3c9548818488268768a118b1", + "IPY_MODEL_12b56912736849fea2ad8124456fdc5c", + "IPY_MODEL_879c8ab5873847a8833bd74123be90a4" + ], + "layout": "IPY_MODEL_20352e5f58d24bb8b1f3940efd14fe4a" + } + }, + "a1959759c5424da9961fb2a308d4dee4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3aaecbf540f54a2db9ab0931e3b1fe57", + "placeholder": "​", + "style": "IPY_MODEL_9e333ed3b5014069ac1dd969255dd591", + "value": " 239/239 [00:00<00:00, 30.9kB/s]" + } + }, + "a20927bf5f2c41f58c1e31ac858ab36c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1811cda0644e4190a9469d1774435d82", + "placeholder": "​", + "style": "IPY_MODEL_35c811d2ae8e43f3b5cecbdd3cfa857f", + "value": "tokenizer.json: 100%" + } + }, + "a3a945817f684328b34651fe052393ec": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a44f630e099e43899f20a77084ae60cd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ed5ca967ad5342929e578ac6aa4dc4c0", + "placeholder": "​", + "style": "IPY_MODEL_af401d117d5047629d3a6e2361757b62", + "value": "model-00001-of-00008.safetensors: 100%" + } + }, + "a4e5789584564049b83df7c6c54a3e08": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a5434ee714f9498d83870544b67c0cb7": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "a55060adc3564407ac81ad7297d34aaa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a6f48410b9964fefba0c3009a77dc838": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a7cf477e80fc43e0ad82c7997b076dce": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a80410b919e442c49aea15acc1ce1a72": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fa1282ccc7544e4f818e2f03ccffe4a5", + "placeholder": "​", + "style": "IPY_MODEL_bbbf575d2a4b4c6ea8389be79b2a6039", + "value": "model.safetensors.index.json: 100%" + } + }, + "ab93eabd7cea4b94b4b7a387f101e8a1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, 
+ "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ac764024cf1c4e08ba7749afd2cd20ac": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ad1236893754446881e153adc9d5c962": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_62e302ebdad64aada0ffe64ae1c873f3", + "placeholder": "​", + "style": "IPY_MODEL_bd1b0dfed6d34d16af33a4a58330f5ec", + "value": "Saving the dataset (1/1 shards): 100%" + } + }, + "ad7599de524549c48bf2d3124ad4b299": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "adacfdcc1b0140efac56918e9ccf064e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "af401d117d5047629d3a6e2361757b62": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b191ac001a2e4962bc9a245fcdf26e6b": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b195f160ca20442fadd8b5aed0ee41af": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b1bea589efa14258a9982071b87938bf": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, 
+ "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b5b65414154544aa8a71b1a39164aad7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b634bb73cfa743d09a5999101b840976": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b82aa8c57f7c422a9a9c90f333ed2a99": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c0991cf63ee6458b96e9a75e7a88b61a", + "IPY_MODEL_71c8af139cd248b1b51101fd46a93f35", + "IPY_MODEL_1d5117195d4b49eb8f1a73b18419f7ce" + ], + "layout": "IPY_MODEL_3c21e4a511b4441192c03b7f1d0976e9" + } + }, + "b8766a88716948cf968f4563531a76d9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": 
"1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2b3a2659b12244bd8548320320016dbf", + "placeholder": "​", + "style": "IPY_MODEL_0cd7efffbb3c4c4b972e63749f61ab97", + "value": "Generating train split: " + } + }, + "b87c84de30e84b3abf4871461fb9cbd3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b8e39e4dddc3497fbc29ae45c66da759": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bb33aec33a6447078c31bfd728942994": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bbbf575d2a4b4c6ea8389be79b2a6039": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bca2c7185b6749fd899c06a2ba4c5e46": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_0f480e3a0b0a45d2a2d2dec3cad923f3", + "placeholder": "​", + "style": "IPY_MODEL_fcb30372e7404c5d8a1ad4df91e6c7b2", + "value": " 1.91G/1.91G [00:05<00:00, 444MB/s]" + } + }, + "bd1b0dfed6d34d16af33a4a58330f5ec": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "be724f04b03942b2a033a7e8898bb4fd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bed8726b8069434687c75452e21f19e5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fa864b41586f4a7aa56aeafd1d84eb75", + "max": 9985, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3225603166b54e7aab766b9964a2f660", + "value": 9985 + } + }, + "bee3501b2a17427784a717e50a85e7fa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, 
+ "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bfcdbba993b74972a9e3e575f86908ff": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bff139df987d4a62abec6456cb27f3d4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c1f9c267ba3f40039cdb5eb3267e8043", + "max": 3963750880, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_33b3b1d0295646edaac7b4822761aeb0", + "value": 3963750502 + } + }, + "c0892a1881de4eb4bfabc6a68f87ae99": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_158c8b85dbf34de6a94b4e35e2fc7d5a", + "placeholder": "​", + "style": "IPY_MODEL_0b4c9753a7cb4354b8e5f187e6e1ad7c", + "value": " 3.96G/3.96G [00:15<00:00, 564MB/s]" + } + }, + "c0991cf63ee6458b96e9a75e7a88b61a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ed28e2e0410d4e0b855467e798e53d66", + "placeholder": "​", + 
"style": "IPY_MODEL_d93f134f802b4b69b575bdaf07dbd27c", + "value": "tokenizer_config.json: 100%" + } + }, + "c12ea43372ac4d57bb9605f1a429b397": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "VBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "VBoxView", + "box_style": "", + "children": [], + "layout": "IPY_MODEL_131065f118274a1586ac38e39ed84ef0" + } + }, + "c1314f241a434c41b45d84dc4d3b30f8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "c1f9c267ba3f40039cdb5eb3267e8043": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c33ced495f70464aa4a3a91922090853": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + 
"top": null, + "visibility": null, + "width": null + } + }, + "c3725c7f79fe415fbd1ea336f0cc9cf1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b191ac001a2e4962bc9a245fcdf26e6b", + "max": 3841788544, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_054c8dffadba48c6b895a6cc62448ecc", + "value": 3841788178 + } + }, + "c3be9109d63c485d9c0ef4f9bc0f9218": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c42acf646f344a88b8c11f81e67f7206": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8bc9d8ba866c442b9118d9630009939c", + "placeholder": "​", + "style": "IPY_MODEL_9f56a2d9979c4bd8928c644c22c3ecdf", + "value": "model-00003-of-00008.safetensors: 100%" + } + }, + "c6164e05a1914ae48083db9ad7f4ef7c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, 
+ "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c65dc74c7d6f4bab8f7dd28455161dd8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "c6e00f5224364822bc4239b176686919": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2a51b36be41745468e4c2d7a21b1c0d2", + "max": 36514, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4fd114abe9f5494ab59858949f5055f1", + "value": 36514 + } + }, + "c73055099c084dca996159e23e162d0b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e40d1c1ac9494b3bade9858324e7ffdf", + "placeholder": "​", + "style": "IPY_MODEL_d65b6b060d9845779299491ac5599c31", + "value": " 9985/9985 [01:04<00:00, 189.08 examples/s]" + } + }, + "c7433acd3c4841e6958ae8f7e87b1808": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "CheckboxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "Add token as git credential?", + "description_tooltip": null, + "disabled": false, + "indent": true, + "layout": "IPY_MODEL_62c028fdef904dedb9cdeca2b3bda725", + "style": "IPY_MODEL_a7cf477e80fc43e0ad82c7997b076dce", + "value": false + } + }, + "c84cc07789be48aebb322c23d355289e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_0077aedc3d174560bce924ee89e9c006", + "placeholder": "​", + "style": "IPY_MODEL_00321cce58884f6f9b3855a21fcd9187", + "value": "Add position_id column (Sample Packing) (num_proc=2): 100%" + } + }, + "ca65e32eb52f48c09a84b33cb18f22cd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "cc587493c33c4f118d1b1170f85be24c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "cc94432d08464affa3e58b560bdad194": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b5b65414154544aa8a71b1a39164aad7", + "max": 3963750816, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_f0a58fbd0fca4340890041f99fa2f8c8", + "value": 3963750438 + } + }, + "ccfcdc95baf646f8aeb3d516742383f2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cdebbc55a1164c018546c2ac6f8c620c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + 
"_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a44f630e099e43899f20a77084ae60cd", + "IPY_MODEL_c3725c7f79fe415fbd1ea336f0cc9cf1", + "IPY_MODEL_0e50870ed0c643e0b6c18cc5d7ddae7f" + ], + "layout": "IPY_MODEL_c33ced495f70464aa4a3a91922090853" + } + }, + "d02274afd47b462291c745f261209d42": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d07c8b97d3314f1c852e44bdd40f61ed": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d0e9dce55cec4c1ca619a0ccf209d924": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": 
null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d1f9b10c130542f094c8fd3d1e23b5e9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d262c82138024169b9f3aa034ca756fa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d3de2662c7964f1ba96e58da382af720": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d4183e9715f34d249942b8271cca3bdf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_63580b6fb30642479fe3000915bf551a", + "placeholder": "​", + "style": "IPY_MODEL_8f726dbfb45d4528afa33e36a6313267", + "value": " 27.3M/27.3M [00:00<00:00, 31.0MB/s]" + } + }, + "d43c6df07ddb466587807d6dbe1ff614": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8c4d4fc5a30f4e7cb3be53fe2adda33d", + "placeholder": "​", + "style": "IPY_MODEL_e90658f4bcb642baa78426012f863152", + "value": "model-00004-of-00008.safetensors: 100%" + } + }, + "d65b6b060d9845779299491ac5599c31": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d6fe74e4255444368f8f90a62157d869": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d93f134f802b4b69b575bdaf07dbd27c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d955dcaa0e944e719f3a06139dd54a03": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + 
"state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "da2347ac94764a3fa2743343cf0d3cd2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "da6e93f3e4984780b930fe7a706983ea": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "daee63fd167e4441a32324b51b00ad2b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d07c8b97d3314f1c852e44bdd40f61ed", + "max": 9985, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ebb69a2c3d0a4299a484698287b3087c", + "value": 9985 + } + }, + 
"dc892a596f6942d7973c616c38f0eebb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c84cc07789be48aebb322c23d355289e", + "IPY_MODEL_bed8726b8069434687c75452e21f19e5", + "IPY_MODEL_16a188a0b06d45f980dcf3933509fe0a" + ], + "layout": "IPY_MODEL_60c1a0d765c14a1d888317e6a507e4ea" + } + }, + "dd0e646fad3f4a89ba23b39d162bd8d9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d43c6df07ddb466587807d6dbe1ff614", + "IPY_MODEL_e0e8b840b8ea4d0d9db09afe99fa287d", + "IPY_MODEL_9327977822be4b1294f80e876552e305" + ], + "layout": "IPY_MODEL_77304d1a46b3468a98483e02ec0ac4a4" + } + }, + "de7c37ee83e24f0c889e84d07279c2ec": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_34cf3df51fbc41cabfdbba153c007f0e", + "placeholder": "​", + "style": "IPY_MODEL_ac764024cf1c4e08ba7749afd2cd20ac", + "value": "vocab.json: 100%" + } + }, + "dfd2a2649b8341ef913207526708aff1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e09f1bcbb9d94c09be53e5e1303642c2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e7d8e4fe58384e93a106de546068c65e", + "max": 8, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0aa8ab56b85f4171a79c3bc210594025", + "value": 8 + } + }, + "e0e8b840b8ea4d0d9db09afe99fa287d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f7434f3e03124a1c938a39af79d7fa59", + "max": 3963750880, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_c1314f241a434c41b45d84dc4d3b30f8", + "value": 3963750502 + } + }, + "e21e180307e5485cbbe908672fd6639a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_2e2b0c1599c341a198f632f46a40c90e", + "IPY_MODEL_bff139df987d4a62abec6456cb27f3d4", + "IPY_MODEL_ebe1cc366d324ad59b264c8b3c431441" + ], + "layout": "IPY_MODEL_114dece49dba437c8572ef94b23c3b1e" + } + }, + "e366ae3fceec4566b9ed303d6c5f90af": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e3fb3fc6afe04b3c9b7ac61809ce78fa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + 
"description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c6164e05a1914ae48083db9ad7f4ef7c", + "placeholder": "​", + "style": "IPY_MODEL_813621384dc748b0ad06775e22761c0b", + "value": " 9985/9985 [00:03<00:00, 3622.89 examples/s]" + } + }, + "e400cbf14bcc446a9d33b210cd93550b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e40d1c1ac9494b3bade9858324e7ffdf": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e575d87a7efe4ec7b1efde489839d4a6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e5a82df528bb4e408797a3b6c2758f4a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e6e969610738449887259063967f82b0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e7d8e4fe58384e93a106de546068c65e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e87ea87fcff247b5bbcc331ba79a8dc2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "e8b7a81040904c1e89e58978223b1737": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + 
"description_width": "" + } + }, + "e90658f4bcb642baa78426012f863152": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "eb1c9535e6a546098b760528b2ea387c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_18357b321ce44d7b8bd9d1c886f69275", + "IPY_MODEL_279937fe03bc4e4eb25b472d7e9df163", + "IPY_MODEL_bca2c7185b6749fd899c06a2ba4c5e46" + ], + "layout": "IPY_MODEL_1f7d30f71bbd4547a9150d21da071055" + } + }, + "ebb69a2c3d0a4299a484698287b3087c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ebc80d1a55fa47f4a5ea2756588569ec": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ebe1cc366d324ad59b264c8b3c431441": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fba7aa824b38467ab3061b226114cdec", + "placeholder": "​", + "style": "IPY_MODEL_f3075dccbd2747b4a7913b66f44f2596", + "value": " 3.96G/3.96G 
[00:13<00:00, 398MB/s]" + } + }, + "ec030fc3c346426f9abc3a89892258d3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dfd2a2649b8341ef913207526708aff1", + "max": 9985, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4f1977d7e4824ef1a14b65f0f42bba10", + "value": 9985 + } + }, + "ec11d1e5ae7b42c883d9b1f38a65356e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_936d04b5fe1b4c63bf0b080e423d051b", + "placeholder": "​", + "style": "IPY_MODEL_f1cef8e8dc2646fb9fd09f3b09081074", + "value": " 36.5k/36.5k [00:00<00:00, 4.32MB/s]" + } + }, + "ed28e2e0410d4e0b855467e798e53d66": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ed5ca967ad5342929e578ac6aa4dc4c0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": 
null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "edc99591b9c747b689b94d0052fec14c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ef0a3c7a6f14460fb4da096928ae249e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_07fb3a2c8315494e97b447e672dfae06", + "IPY_MODEL_ec030fc3c346426f9abc3a89892258d3", + "IPY_MODEL_e3fb3fc6afe04b3c9b7ac61809ce78fa" + ], + "layout": "IPY_MODEL_c3be9109d63c485d9c0ef4f9bc0f9218" + } + }, + "ef223e8504b64e3592589880326aaf41": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"f0a58fbd0fca4340890041f99fa2f8c8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f113ebd8c1c34806bea4dd7ed3035173": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f1cef8e8dc2646fb9fd09f3b09081074": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f3075dccbd2747b4a7913b66f44f2596": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f365820a3d3c42b2948abfe32065de14": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_735d4f225b24414294fc1b213c61223c", + "placeholder": "​", + "style": "IPY_MODEL_5e5e15b0569b474c9620083b3ec6af55", + "value": "generation_config.json: 100%" + } + }, + "f4667818b9d34a09891cd727a429a610": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4b27c267393640f28f6eae0875bd2ed9", + "placeholder": "​", + "style": "IPY_MODEL_9858cb74a09748a39e8149baac96702c", + "value": " 3.96G/3.96G [00:11<00:00, 457MB/s]" + } + }, + "f4a1795dc7514a718f478245f521f0ba": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f60a2bdb6b6b4e0e8c3508580e247132": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_edc99591b9c747b689b94d0052fec14c", + "max": 3963750880, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_35cc989ca3374e7dba0cb166febc4bde", + "value": 3963750502 + } + }, + "f7434f3e03124a1c938a39af79d7fa59": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f8ef805b776145c3bfa9ba8d90972058": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": 
null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fa1282ccc7544e4f818e2f03ccffe4a5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fa864b41586f4a7aa56aeafd1d84eb75": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fba7aa824b38467ab3061b226114cdec": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + 
"align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fcb30372e7404c5d8a1ad4df91e6c7b2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fcbab4d8dced41a18dfccce81e3a45a0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fd4f333f7ece4450b04e1a9af1f9d2f6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d1f9b10c130542f094c8fd3d1e23b5e9", + "placeholder": "​", + "style": "IPY_MODEL_e575d87a7efe4ec7b1efde489839d4a6", + "value": "model-00006-of-00008.safetensors: 100%" + } + }, + "fe18bba7f3fb4c31bf840541f36b3425": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_fd4f333f7ece4450b04e1a9af1f9d2f6", + "IPY_MODEL_f60a2bdb6b6b4e0e8c3508580e247132", + "IPY_MODEL_c0892a1881de4eb4bfabc6a68f87ae99" + ], + "layout": "IPY_MODEL_1bec6297c90242a88672d195bc09d429" + } + }, + "fe41858c6bd04c58840112b67c19a336": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + 
"description_tooltip": null, + "layout": "IPY_MODEL_e5a82df528bb4e408797a3b6c2758f4a", + "placeholder": "​", + "style": "IPY_MODEL_f113ebd8c1c34806bea4dd7ed3035173", + "value": " 9985/9985 [00:00<00:00, 44264.88 examples/s]" + } + }, + "fea1b70fb46745feb5111b3929175b5d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_f365820a3d3c42b2948abfe32065de14", + "IPY_MODEL_823f1c78f15043e38bbd4dca3932a86a", + "IPY_MODEL_a1959759c5424da9961fb2a308d4dee4" + ], + "layout": "IPY_MODEL_34c9c0137b504cd799c6bd6de69507c2" + } + }, + "ff3a94b146a948b6907f5d80c7157f99": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ffdbb12a2f2c4d14911685e7683e0ef0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ab93eabd7cea4b94b4b7a387f101e8a1", + "placeholder": "​", + "style": "IPY_MODEL_704f2f5a9b1c49d5a75a0025a5dda11b", + "value": " 3.96G/3.96G [00:12<00:00, 656MB/s]" + } + } + } + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "# Check so there is a gpu available, a T4(free tier) is enough to run this notebook\n", - "assert (torch.cuda.is_available()==True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install --no-build-isolation axolotl[deepspeed]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Hugging Face login (optional)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from huggingface_hub import notebook_login\n", - "notebook_login()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Example configuration" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import yaml\n", - "\n", - "yaml_string = \"\"\"\n", - "base_model: NousResearch/Meta-Llama-3.1-8B\n", - "\n", - "load_in_8bit: false\n", - "load_in_4bit: true\n", - "strict: false\n", - "\n", - "datasets:\n", - " - path: tatsu-lab/alpaca\n", - " type: alpaca\n", - "dataset_prepared_path: last_run_prepared\n", - "val_set_size: 0.05\n", - "output_dir: ./outputs/lora-out\n", - "\n", - "sequence_len: 2048\n", - "sample_packing: true\n", - "eval_sample_packing: true\n", - "pad_to_sequence_len: true\n", - "\n", - "adapter: qlora\n", - 
"lora_model_dir:\n", - "lora_r: 32\n", - "lora_alpha: 16\n", - "lora_dropout: 0.05\n", - "lora_target_linear: true\n", - "lora_fan_in_fan_out:\n", - "lora_modules_to_save:\n", - " - embed_tokens\n", - " - lm_head\n", - "\n", - "wandb_project:\n", - "wandb_entity:\n", - "wandb_watch:\n", - "wandb_name:\n", - "wandb_log_model:\n", - "\n", - "gradient_accumulation_steps: 2\n", - "micro_batch_size: 1\n", - "num_epochs: 1\n", - "optimizer: paged_adamw_8bit\n", - "lr_scheduler: cosine\n", - "learning_rate: 2e-5\n", - "\n", - "train_on_inputs: false\n", - "group_by_length: false\n", - "bf16: auto\n", - "fp16:\n", - "tf32: false\n", - "\n", - "gradient_checkpointing: true\n", - "early_stopping_patience:\n", - "resume_from_checkpoint:\n", - "logging_steps: 1\n", - "xformers_attention:\n", - "flash_attention: false\n", - "sdp_attention: true\n", - "\n", - "warmup_steps: 1\n", - "max_steps: 25\n", - "evals_per_epoch: 1\n", - "eval_table_size:\n", - "saves_per_epoch: 1\n", - "debug:\n", - "deepspeed:\n", - "weight_decay: 0.0\n", - "fsdp:\n", - "fsdp_config:\n", - "special_tokens:\n", - " pad_token: <|end_of_text|>\n", - "\"\"\"\n", - "\n", - "\n", - "# Convert the YAML string to a Python dictionary\n", - "yaml_dict = yaml.safe_load(yaml_string)\n", - "\n", - "# Specify your file path\n", - "file_path = 'test_axolotl.yaml'\n", - "\n", - "# Write the YAML file\n", - "with open(file_path, 'w') as file:\n", - " yaml.dump(yaml_dict, file)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Above we have a configuration file with base LLM model and datasets specified, among many other things. Axolotl can automatically detect whether the specified datasets are on HuggingFace repo or local machine.\n", - "\n", - "The Axolotl configuration options encompass model and dataset selection, data pre-processing, and training. Let's go through them line by line:\n", - "\n", - "* \"base model\": String value, specifies the underlying pre-trained LLM that will be used for finetuning\n", - "\n", - "Next we have options for model weights quantization. Quantization allows for reduction in occupied memory on GPUs.\n", - "\n", - "* \"load_in_8bit\": Boolean value, whether to quantize the model weights into 8-bit integer.\n", - "\n", - "* \"load_in_4bit\": Boolean value, whether to quantize the model weights into 4-bit integer.\n", - "\n", - "* \"strict\": Boolean value. If false, it allows for overriding established configuration options in the yaml file when executing in command-line interface.\n", - "\n", - "* \"datasets\": a list of dicts that contain path and type of data sets as well as other optional configurations where datasets are concerned. Supports multiple datasets.\n", - "\n", - "* \"val_set_size\": Either a float value less than one or an integer less than the total size of dataset. Sets the size of validation set from the whole dataset. If float, sets the proportion of the dataset assigned for validation. If integer, sets the direct size of validation set.\n", - "\n", - "* \"output_dir\": String value. Path of trained model.\n", - "\n", - "For data preprocessing:\n", - "\n", - "* \"sequence_len\": Integer. Specifies the maximum sequence length of the input. Typically 2048 or less.\n", - "\n", - "* \"pad_to_sequence_len\": Boolean. Padding input to maximum sequence length.\n", - "\n", - "* \"sample_packing\": Boolean. Specifies whether to use multi-packing with block diagonal attention.\n", - "\n", - "* \"special_tokens\": Python dict, optional. 
Allows users to specify the additional special tokens to be ignored by the tokenizer.\n", - "\n", - "For LoRA configuration and its hyperparameters:\n", - "\n", - "* \"adapter\": String. Either \"lora\" or \"qlora\", depending on the user's choice.\n", - "\n", - "* \"lora_model_dir\": String, Optional. Path to a directory that contains a LoRA model, if there is already a trained LoRA model the user would like to use.\n", - "\n", - "* \"lora_r\": Integer. Refers to the rank of the LoRA decomposition matrices. A higher value increases capacity but reduces parameter efficiency. Recommended to be set to 8.\n", - "\n", - "* \"lora_alpha\": Integer. Scales the LoRA weight updates by $\frac{\text{lora_alpha}}{\text{lora_r}}$. Recommended to be fixed at 16.\n", - "\n", - "* \"lora_dropout\": Float between 0 and 1. The dropout probability of a LoRA layer.\n", - "\n", - "* \"lora_target_linear\": Boolean. If true, LoRA will target all linear modules in the transformer architecture.\n", - "\n", - "* \"lora_modules_to_save\": If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.\n", - "\n", - "See [LoRA](https://arxiv.org/abs/2106.09685) for a detailed explanation of the LoRA implementation.\n", - "\n", - "For the training configurations:\n", - "\n", - "* \"gradient_accumulation_steps\": Integer. The number of steps over which to accumulate gradients before an optimizer update. E.g. if 2, the weights are updated every two micro-batches.\n", - "\n", - "* \"micro_batch_size\": Integer. The batch size per GPU for each step, i.e. the per-GPU batch size divided by gradient_accumulation_steps.\n", - "\n", - "* \"num_epochs\": Integer. Number of epochs. One epoch is when training has looped over every batch in the whole dataset once.\n", - "\n", - "* \"optimizer\": The optimizer to use for the training.\n", - "\n", - "* \"learning_rate\": The learning rate.\n", - "\n", - "* \"lr_scheduler\": The learning rate scheduler to use for adjusting the learning rate during training.\n", - "\n", - "* \"train_on_inputs\": Boolean. Whether to include the user's prompt in the training labels; if false, the prompt tokens are masked out of the loss.\n", - "\n", - "* \"group_by_length\": Boolean. Whether to group similarly sized data to minimize padding.\n", - "\n", - "* \"bf16\": Either \"auto\", \"true\", or \"false\". Whether to use the CUDA bf16 floating point format. If set to \"auto\", bf16 will be applied automatically should the GPU support it.\n", - "\n", - "* \"fp16\": Optional. Specifies whether to use CUDA fp16. Automatically set to true if \"bf16\" is set to true. Otherwise false.\n", - "\n", - "* \"tf32\": Boolean. Whether to use CUDA tf32. Will override bf16.\n", - "\n", - "* \"gradient_checkpointing\": Boolean. Whether to use [gradient checkpointing](https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing).\n", - "\n", - "* \"gradient_checkpointing_kwargs\": Python Dict. Fed into the trainer.\n", - "\n", - "* \"logging_steps\": Integer. Log training information every specified number of steps.\n", - "\n", - "* \"flash_attention\": Boolean. Whether to use the [flash attention](https://github.com/Dao-AILab/flash-attention) mechanism.\n", - "\n", - "* \"sdp_attention\": Boolean. Whether to use the Scaled Dot Product attention mechanism (the attention mechanism in the [original implementation](https://arxiv.org/abs/1706.03762) of transformers).\n", - "\n", - "* \"warmup_steps\": Integer. The number of initial training steps over which the learning rate is warmed up from a very low value.\n", - "\n", - "* \"evals_per_epoch\": Integer. 
Number of evaluations to be performed within one training epoch.\n", - "\n", - "* \"saves_per_epoch\": Integer. Number of times the model is saved in one training epoch.\n", - "\n", - "* \"weight_decay\": Positive Float. Sets the \"strength\" of weight decay (i.e. the coefficient of L2 regularization)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The above is only a snippet aiming to familiarize users with the types of streamlined configuration options Axolotl provides. For a full list of configuration options, see [here](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Train the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!accelerate launch -m axolotl.cli.train /content/test_axolotl.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Predict with the trained model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!accelerate launch -m axolotl.cli.inference /content/test_axolotl.yaml \\\n", - " --lora_model_dir=\"./outputs/lora-out\" --gradio" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deeper Dive" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It is also helpful to gain some familiarity with some of the core inner workings of Axolotl." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configuration Normalization" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Axolotl uses a custom Dict class called ```DictDefault```\n", - "to store the configuration specified in the yaml configuration file (into a Python variable named ```cfg```). The definition of this custom Dict can be found in [utils/dict.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/dict.py).\n", - "\n", - "```DictDefault``` is amended such that accessing a missing key returns ```None```. This is important because if some configuration options aren't specified by the user, the ```None``` value allows Axolotl to perform boolean operations to determine the default settings for the missing configurations. For more examples of how this is done, check out [utils/config/__init__.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/config/__init__.py)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Loading Models, Tokenizers, and Trainer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If we inspect [cli.train.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/cli/train.py), we will find that most of the heavy lifting is done by the function ```train()```, which is itself imported from [src/axolotl/train.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/train.py).\n", - "\n", - "```train()``` takes care of loading the appropriate tokenizer and pre-trained model through ```load_model()``` and ```load_tokenizer()``` from [src/axolotl/utils/models.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/models.py) respectively.\n", - "\n", - "```load_tokenizer()``` loads the appropriate tokenizer for the desired model, as well as the chat template.\n", - "\n", - "The ```ModelLoader``` class follows after the tokenizer has been selected. 
It automatically discerns the base model type, loads the desired model, and applies model-appropriate attention mechanism modifications (e.g. flash attention). Depending on which base model the user chooses in the configuration, ```ModelLoader``` will utilize the corresponding \"attention hijacking\" script. For example, if the user specified the base model to be ```NousResearch/Meta-Llama-3.1-8B```, which is of llama type, and set ```flash_attention``` to ```True```, ```ModelLoader``` will load in [llama_attn_hijack_flash.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/monkeypatch/llama_attn_hijack_flash.py). For a list of supported attention hijacking scripts, please refer to the directory [/src/axolotl/monkeypatch/](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/monkeypatch)\n", - "\n", - "Another important operation encompassed in ```train()``` is setting up the trainer, taking into account the user-specified training configurations (e.g. num_epochs, optimizer) through the use of ```setup_trainer()``` from [/src/axolotl/utils/trainer.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/trainer.py), which in turn relies on modules from [/src/axolotl/core/trainer_builder.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/core/trainer_builder.py).\n", - "```trainer_builder.py``` provides a list of trainer options bespoke to the task type (causal language modeling or reinforcement learning ('dpo', 'ipo', 'kto'))." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Monkey patch\n", - "\n", - "The [Monkey patch directory](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/monkeypatch) is where model architecture/optimization patching scripts are stored (these are modifications that are not implemented in the official releases, hence the name monkey patch). It includes attention hijacking, ReLoRA, and Unsloth optimizations."
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.9.6" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml b/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml index 2c0495ced..fc9a75e3f 100644 --- a/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml +++ b/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml @@ -21,7 +21,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: @@ -51,8 +51,10 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml b/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml index de9c956e0..b527edc6f 100644 --- a/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml +++ b/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml @@ -21,7 +21,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: @@ -51,8 +51,10 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/deepseek-v2/fft-fsdp-16b.yaml b/examples/deepseek-v2/fft-fsdp-16b.yaml index 0ed97db36..6e936da16 100644 --- a/examples/deepseek-v2/fft-fsdp-16b.yaml +++ b/examples/deepseek-v2/fft-fsdp-16b.yaml @@ -12,7 +12,7 @@ output_dir: ./outputs/out sequence_len: 2048 sample_packing: true -pad_to_sequence_len: true + wandb_project: wandb_entity: @@ -37,7 +37,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 100 +warmup_ratio: 0.1 evals_per_epoch: 2 saves_per_epoch: 1 weight_decay: 0.0 @@ -55,3 +55,5 @@ fsdp_config: fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/deepseek-v2/qlora-fsdp-2_5.yaml b/examples/deepseek-v2/qlora-fsdp-2_5.yaml index 34dbeaafe..aab5034a0 100644 --- a/examples/deepseek-v2/qlora-fsdp-2_5.yaml +++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml @@ -30,7 +30,7 @@ output_dir: ./outputs/out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + wandb_project: wandb_entity: @@ -61,7 +61,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 100 +warmup_ratio: 0.1 evals_per_epoch: 2 saves_per_epoch: 1 weight_decay: 0.0 @@ -79,3 +79,5 @@ fsdp_config: fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/devstral/README.md b/examples/devstral/README.md new file mode 100644 index 000000000..b53635a8f --- /dev/null +++ b/examples/devstral/README.md @@ -0,0 +1,67 @@ 
+# Finetune Devstral with Axolotl + +Devstral Small is a 24B-parameter open-source model from MistralAI, available on HuggingFace as [Devstral-Small-2505](https://huggingface.co/mistralai/Devstral-Small-2505) and [Devstral-Small-2507](https://huggingface.co/mistralai/Devstral-Small-2507). `Devstral-Small-2507` is the latest version of the model and has [function calling](https://mistralai.github.io/mistral-common/usage/tools/) support. + +This guide shows how to fine-tune it with Axolotl on multi-turn conversations with proper masking. + +The model was fine-tuned on top of [Mistral-Small-3.1](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503) without the vision layer and has a context length of up to 128k tokens. + +Thanks to the team at MistralAI for giving us early access to prepare for this release. + +## Getting started + +1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). + + Here is an example of how to install from pip: + +```bash +# Ensure you have PyTorch installed (PyTorch 2.6.0 minimum) +pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja +pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0' +``` + +2. Run the finetuning example: + +```bash +axolotl train examples/devstral/devstral-small-qlora.yml +``` + +This config uses about 21GB of VRAM. + +Let us know how it goes. Happy finetuning! 🚀 + +### TIPS + +- You can run a full finetuning by removing `adapter: qlora` and `load_in_4bit: true` from the config. +- Read more on how to load your own dataset in the [docs](https://docs.axolotl.ai/docs/dataset_loading.html); see the sketch below. +- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). +- Learn how to use function calling with Axolotl in the [docs](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#using-tool-use). + +## Optimization Guides + +- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) +- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) +- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) +- [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) +- [Liger Kernel](https://docs.axolotl.ai/docs/custom_integrations.html#liger-kernels) + +## Limitations + +At the moment, we only support the `mistral-common` tokenizer for supervised fine-tuning, and only with `type: chat_template`. + +In addition, we do not support overriding tokens yet. + +## Related Resources + +- [MistralAI Devstral Blog](https://mistral.ai/news/devstral) +- [MistralAI Devstral 1.1 Blog](https://mistral.ai/news/devstral-2507) +- [Axolotl Docs](https://docs.axolotl.ai) +- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) +- [Axolotl Website](https://axolotl.ai) +- [Axolotl Discord](https://discord.gg/7m9sfhzaf3) + + +## Future Work + +- Add parity for Preference Tuning, RL, Multi-modal, etc. +- Add parity for other tokenizer configs like overriding tokens. 
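For readers who want to point the devstral example at their own data, a minimal sketch of what the `datasets:` section of `devstral-small-qlora.yml` might look like with a local file in the OpenAI messages format is shown below. The file name `data/my_conversations.jsonl` and its field layout are assumptions for illustration only; the authoritative set of loading options is in the dataset loading docs linked above.

```yaml
# Hypothetical local dataset in OpenAI messages format; each JSONL line holds one
# conversation, e.g.
# {"messages": [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]}
datasets:
  - path: data/my_conversations.jsonl   # assumed local path
    type: chat_template                 # same format the QLoRA example uses
    field_messages: messages            # name of the list-of-turns field in each record
```

With a config like this, Axolotl applies the model's chat template to each conversation and handles the multi-turn masking described above.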
diff --git a/examples/devstral/devstral-small-qlora.yml b/examples/devstral/devstral-small-qlora.yml new file mode 100644 index 000000000..7fe4dd433 --- /dev/null +++ b/examples/devstral/devstral-small-qlora.yml @@ -0,0 +1,66 @@ +base_model: mistralai/Devstral-Small-2507 + +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +# Enable to use mistral-common tokenizer +tokenizer_use_mistral_common: true + +load_in_8bit: false +load_in_4bit: true + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +datasets: + - path: fozziethebeat/alpaca_messages_2k_test + type: chat_template + +dataset_prepared_path: last_run_prepared +val_set_size: 0.1 +output_dir: ./outputs/qlora-out + +adapter: qlora +lora_model_dir: + +sequence_len: 2048 +sample_packing: true + + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0 +lora_target_linear: true + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 1 +optimizer: adamw_torch +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +loss_watchdog_threshold: 5.0 +loss_watchdog_patience: 3 + +warmup_ratio: 0.05 +evals_per_epoch: 4 +saves_per_epoch: 1 + +weight_decay: 0.0 +special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/distributed-parallel/README.md b/examples/distributed-parallel/README.md new file mode 100644 index 000000000..ad7c48d5f --- /dev/null +++ b/examples/distributed-parallel/README.md @@ -0,0 +1,52 @@ +# ND Parallelism Examples + +This directory contains example configurations for training models using ND Parallelism in Axolotl. These examples demonstrate how to compose different parallelism strategies (FSDP, TP, CP, HSDP) for efficient multi-GPU training. + +## Quick Start + +1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). + +2. 
Run the command below: + +```bash +# Train Qwen3 8B with FSDP + TP + CP on a single 8-GPU node +axolotl train examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml + +# Train Llama 3.1 8B with HSDP + TP on 2 nodes (16 GPUs total) +axolotl train examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml +``` + +## Example Configurations + +### Single Node (8 GPUs) + +**Qwen3 8B with FSDP + TP + CP** ([qwen3-8b-fsdp-tp-cp.yaml](./qwen3-8b-fsdp-tp-cp.yaml)) +- Uses all 3 parallelism dimensions on a single node +- Ideal for: when model weights, activations, and/or context are too large to fit on single GPU + +```yaml +dp_shard_size: 2 # FSDP across 2 GPUs +tensor_parallel_size: 2 # TP across 2 GPUs +context_parallel_size: 2 # CP across 2 GPUs +# Total: 2 × 2 × 2 = 8 GPUs +``` + +### Multi-Node + +**Llama 3.1 8B with HSDP + TP** ([llama-3_1-8b-hsdp-tp.yaml](./llama-3_1-8b-hsdp-tp.yaml)) +- FSDP & TP within nodes, DDP across nodes to minimize inter-node communication +- Ideal for: Scaling to multiple nodes while maintaining training efficiency + +```yaml +dp_shard_size: 4 # FSDP within each 4-GPU group +tensor_parallel_size: 2 # TP within each node +dp_replicate_size: 2 # DDP across 2 groups +# Total: (4 × 2) × 2 = 16 GPUs (2 nodes) +``` + +## Learn More + +- [ND Parallelism Documentation](https://docs.axolotl.ai/docs/nd_parallelism.html) +- [Blog: Accelerate ND-Parallel Guide](https://huggingface.co/blog/accelerate-nd-parallel) +- [Multi-GPU Training Guide](https://docs.axolotl.ai/docs/multi-gpu.html) +- [Axolotl Discord](https://discord.gg/7m9sfhzaf3) diff --git a/examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml b/examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml new file mode 100644 index 000000000..f10dc9bd2 --- /dev/null +++ b/examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml @@ -0,0 +1,47 @@ +base_model: meta-llama/Llama-3.1-8B + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +dp_shard_size: 4 +dp_replicate_size: 2 +tensor_parallel_size: 2 +# context_parallel_size: 2 + +dataset_prepared_path: last_run_prepared + +special_tokens: + pad_token: <|end_of_text|> + +fsdp_version: 2 +fsdp_config: + offload_params: false + state_dict_type: FULL_STATE_DICT + auto_wrap_policy: TRANSFORMER_BASED_WRAP + transformer_layer_cls_to_wrap: LlamaDecoderLayer + reshard_after_forward: true + +datasets: + - path: tatsu-lab/alpaca + type: alpaca + +output_dir: ./outputs/ndp-out/ + +sequence_len: 2048 +sample_packing: true +flash_attention: true + +gradient_accumulation_steps: 1 +micro_batch_size: 1 +num_epochs: 2 +optimizer: adamw_torch_fused +lr_scheduler: constant_with_warmup +learning_rate: 2e-6 + +bf16: true +tf32: true + +logging_steps: 1 +saves_per_epoch: 1 + +warmup_ratio: 0.1 diff --git a/examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml b/examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml new file mode 100644 index 000000000..584a33f44 --- /dev/null +++ b/examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml @@ -0,0 +1,46 @@ +base_model: Qwen/Qwen3-8B + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +dp_shard_size: 2 +# dp_replicate_size: 1 +context_parallel_size: 2 +tensor_parallel_size: 2 + +dataset_prepared_path: last_run_prepared + +fsdp_version: 2 +fsdp_config: + offload_params: false + state_dict_type: FULL_STATE_DICT + auto_wrap_policy: TRANSFORMER_BASED_WRAP + transformer_layer_cls_to_wrap: Qwen3DecoderLayer + reshard_after_forward: true + +datasets: + - path: tatsu-lab/alpaca + type: alpaca + +output_dir: 
./outputs/ndp-out/ + +sequence_len: 8192 +sample_packing: true +flash_attention: true + +gradient_accumulation_steps: 1 +micro_batch_size: 1 # must be 1 when using context parallel +num_epochs: 2 +optimizer: adamw_torch_fused +lr_scheduler: constant_with_warmup +learning_rate: 2e-6 + +bf16: true +tf32: true + +logging_steps: 1 +saves_per_epoch: 1 + +warmup_ratio: 0.1 + +special_tokens: diff --git a/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml b/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml new file mode 100644 index 000000000..2473179f0 --- /dev/null +++ b/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml @@ -0,0 +1,73 @@ +base_model: tiiuae/Falcon-H1-1.5B-Deep-Base +# optionally might have model_type or tokenizer_type +model_type: AutoModelForCausalLM +tokenizer_type: AutoTokenizer +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +load_in_8bit: false +load_in_4bit: true + +# huggingface repo +chat_template: falcon_h1 +datasets: + - path: cgato/SlimOrcaDedupCleaned + type: chat_template + field_messages: conversations + message_property_mappings: + role: from + content: value + +val_set_size: 0.0 +output_dir: ./outputs/out + +adapter: qlora +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: + - q_proj + - k_proj + - v_proj + - o_proj + - in_proj + - gate_proj + - up_proj + - down_proj + +sequence_len: 2048 +sample_packing: false +eval_sample_packing: false + + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + + +gradient_accumulation_steps: 4 +micro_batch_size: 1 +num_epochs: 4 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: true + +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: +saves_per_epoch: 1 +weight_decay: 0.0 +special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/falcon-h1/falcon-h1-1b-qlora.yaml b/examples/falcon-h1/falcon-h1-1b-qlora.yaml new file mode 100644 index 000000000..bfb7836ef --- /dev/null +++ b/examples/falcon-h1/falcon-h1-1b-qlora.yaml @@ -0,0 +1,72 @@ +base_model: tiiuae/Falcon-H1-1.5B-Base +# optionally might have model_type or tokenizer_type +model_type: AutoModelForCausalLM +tokenizer_type: AutoTokenizer +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +load_in_8bit: false +load_in_4bit: true + +# huggingface repo +chat_template: falcon_h1 +datasets: + - path: cgato/SlimOrcaDedupCleaned + type: chat_template + field_messages: conversations + message_property_mappings: + role: from + content: value + +val_set_size: 0.0 +output_dir: ./outputs/out + +adapter: qlora +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: + - q_proj + - k_proj + - v_proj + - o_proj + - in_proj + - gate_proj + - up_proj + - down_proj + +sequence_len: 2048 +sample_packing: false +eval_sample_packing: false + + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 1 +num_epochs: 4 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: true + +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: +saves_per_epoch: 
1 +weight_decay: 0.0 +special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/falcon-h1/falcon-h1-34b-qlora.yaml b/examples/falcon-h1/falcon-h1-34b-qlora.yaml new file mode 100644 index 000000000..80a9d45b5 --- /dev/null +++ b/examples/falcon-h1/falcon-h1-34b-qlora.yaml @@ -0,0 +1,73 @@ +base_model: tiiuae/Falcon-H1-34B-Base +# optionally might have model_type or tokenizer_type +model_type: AutoModelForCausalLM +tokenizer_type: AutoTokenizer +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +load_in_8bit: false +load_in_4bit: true + +# huggingface repo +chat_template: falcon_h1 +datasets: + - path: cgato/SlimOrcaDedupCleaned + type: chat_template + field_messages: conversations + message_property_mappings: + role: from + content: value + +val_set_size: 0.0 +output_dir: ./outputs/out + +adapter: qlora +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: + - q_proj + - k_proj + - v_proj + - o_proj + - in_proj + - gate_proj + - up_proj + - down_proj + +sequence_len: 2048 +sample_packing: false +eval_sample_packing: false + + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + + +gradient_accumulation_steps: 4 +micro_batch_size: 1 +num_epochs: 4 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: true + +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: +saves_per_epoch: 1 +weight_decay: 0.0 +special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/falcon-h1/falcon-h1-3b-qlora.yaml b/examples/falcon-h1/falcon-h1-3b-qlora.yaml new file mode 100644 index 000000000..02be8ac5d --- /dev/null +++ b/examples/falcon-h1/falcon-h1-3b-qlora.yaml @@ -0,0 +1,73 @@ +base_model: tiiuae/Falcon-H1-3B-Base +# optionally might have model_type or tokenizer_type +model_type: AutoModelForCausalLM +tokenizer_type: AutoTokenizer +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +load_in_8bit: false +load_in_4bit: true + +# huggingface repo +chat_template: falcon_h1 +datasets: + - path: cgato/SlimOrcaDedupCleaned + type: chat_template + field_messages: conversations + message_property_mappings: + role: from + content: value + +val_set_size: 0.0 +output_dir: ./outputs/out + +adapter: qlora +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: + - q_proj + - k_proj + - v_proj + - o_proj + - in_proj + - gate_proj + - up_proj + - down_proj + +sequence_len: 2048 +sample_packing: false +eval_sample_packing: false + + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + + +gradient_accumulation_steps: 4 +micro_batch_size: 1 +num_epochs: 4 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: true + +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 +weight_decay: 0.0 +special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/falcon-h1/falcon-h1-500m-qlora.yaml b/examples/falcon-h1/falcon-h1-500m-qlora.yaml new file mode 100644 index 
000000000..b112d5d85 --- /dev/null +++ b/examples/falcon-h1/falcon-h1-500m-qlora.yaml @@ -0,0 +1,73 @@ +base_model: tiiuae/Falcon-H1-0.5B-Instruct +# optionally might have model_type or tokenizer_type +model_type: AutoModelForCausalLM +tokenizer_type: AutoTokenizer +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +load_in_8bit: false +load_in_4bit: true + +# huggingface repo +chat_template: falcon_h1 +datasets: + - path: cgato/SlimOrcaDedupCleaned + type: chat_template + field_messages: conversations + message_property_mappings: + role: from + content: value + +val_set_size: 0.0 +output_dir: ./outputs/out + +adapter: qlora +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: + - q_proj + - k_proj + - v_proj + - o_proj + - in_proj + - gate_proj + - up_proj + - down_proj + +sequence_len: 2048 +sample_packing: false +eval_sample_packing: false + + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + + +gradient_accumulation_steps: 4 +micro_batch_size: 1 +num_epochs: 4 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: true + +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: +saves_per_epoch: 1 +weight_decay: 0.0 +special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/falcon-h1/falcon-h1-7b-qlora.yaml b/examples/falcon-h1/falcon-h1-7b-qlora.yaml new file mode 100644 index 000000000..c5505873d --- /dev/null +++ b/examples/falcon-h1/falcon-h1-7b-qlora.yaml @@ -0,0 +1,73 @@ +base_model: tiiuae/Falcon-H1-7B-Base +# optionally might have model_type or tokenizer_type +model_type: AutoModelForCausalLM +tokenizer_type: AutoTokenizer +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +load_in_8bit: false +load_in_4bit: true + +# huggingface repo +chat_template: falcon_h1 +datasets: + - path: cgato/SlimOrcaDedupCleaned + type: chat_template + field_messages: conversations + message_property_mappings: + role: from + content: value + +val_set_size: 0.0 +output_dir: ./outputs/out + +adapter: qlora +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: + - q_proj + - k_proj + - v_proj + - o_proj + - in_proj + - gate_proj + - up_proj + - down_proj + +sequence_len: 2048 +sample_packing: false +eval_sample_packing: false + + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + + +gradient_accumulation_steps: 4 +micro_batch_size: 1 +num_epochs: 4 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: true + +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 +weight_decay: 0.0 +special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/gemma2/qlora.yml b/examples/gemma2/qlora.yml index cb96a32c1..8a295a1f8 100644 --- a/examples/gemma2/qlora.yml +++ b/examples/gemma2/qlora.yml @@ -31,7 +31,7 @@ lora_target_linear: true sequence_len: 2048 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: @@ -60,3 +60,5 @@ evals_per_epoch: saves_per_epoch: 1 
weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/gemma2/reward-model.yaml b/examples/gemma2/reward-model.yaml index ce01a4572..67b1228b2 100644 --- a/examples/gemma2/reward-model.yaml +++ b/examples/gemma2/reward-model.yaml @@ -18,7 +18,7 @@ remove_unused_columns: false sequence_len: 2048 sample_packing: false eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: @@ -50,3 +50,5 @@ evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/gemma3/gemma-3-1b-qlora.yml b/examples/gemma3/gemma-3-1b-qlora.yml index 44310558c..115717db7 100644 --- a/examples/gemma3/gemma-3-1b-qlora.yml +++ b/examples/gemma3/gemma-3-1b-qlora.yml @@ -13,6 +13,8 @@ load_in_4bit: true # huggingface repo chat_template: gemma3 +eot_tokens: + - datasets: - path: cgato/SlimOrcaDedupCleaned type: chat_template @@ -33,7 +35,7 @@ lora_target_linear: true sequence_len: 2048 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: @@ -64,3 +66,5 @@ evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/gemma3/gemma-3-4b-qlora.yml b/examples/gemma3/gemma-3-4b-qlora.yml index 29f8cc1e1..44ba9c879 100644 --- a/examples/gemma3/gemma-3-4b-qlora.yml +++ b/examples/gemma3/gemma-3-4b-qlora.yml @@ -6,6 +6,8 @@ load_in_4bit: true ddp_find_unused_parameters: true chat_template: gemma3 +eot_tokens: + - datasets: - path: cgato/SlimOrcaDedupCleaned type: chat_template @@ -23,12 +25,12 @@ lora_model_dir: sequence_len: 2048 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 -lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' +lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: @@ -58,3 +60,5 @@ warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/gemma3/gemma-3-4b-vision-qlora.yml b/examples/gemma3/gemma-3-4b-vision-qlora.yml index 3fd9eb5f0..e9e606b69 100644 --- a/examples/gemma3/gemma-3-4b-vision-qlora.yml +++ b/examples/gemma3/gemma-3-4b-vision-qlora.yml @@ -12,6 +12,8 @@ sample_packing: false ddp_find_unused_parameters: true chat_template: gemma3 +eot_tokens: + - datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template @@ -30,7 +32,7 @@ pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 -lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' +lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: @@ -60,3 +62,5 @@ warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/gemma3n/README.md b/examples/gemma3n/README.md new file mode 100644 index 000000000..8c4e02a1d --- /dev/null +++ b/examples/gemma3n/README.md @@ -0,0 +1,62 @@ +# Finetune Gemma-3n with 
Axolotl + +Gemma-3n is a family of multimodal models from Google, available on [HuggingFace](https://huggingface.co/collections/google/gemma-3n-685065323f5984ef315c93f4). This guide shows how to fine-tune it with Axolotl. + +## Getting started + +1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). + + Here is an example of how to install from pip: + +```bash +# Ensure you have PyTorch installed (PyTorch 2.6.0 min) +pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja +pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0' +``` + +2. In addition to Axolotl's requirements, Gemma-3n requires: + +```bash +pip3 install timm==1.0.17 + +# for loading audio data +pip3 install librosa==0.11.0 +``` + +3. Run the finetuning example: + +```bash +# text only +axolotl train examples/gemma3n/gemma-3n-e2b-qlora.yml + +# text + vision +axolotl train examples/gemma3n/gemma-3n-e2b-vision-qlora.yml + +# text + vision + audio +axolotl train examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml +``` + +Let us know how it goes. Happy finetuning! 🚀 + +WARNING: The loss and grad norm will be much higher than normal. We suspect this is inherent to the model for now. If anyone would like to submit a fix for this, we are happy to take a look. + +### TIPS + +- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` lines from the config (see the sketch after this list). +- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). +- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). +- The multimodal dataset format follows the OpenAI multi-content Messages format as seen [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).
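For example, to turn `examples/gemma3n/gemma-3n-e2b-qlora.yml` into a full finetune, a minimal sketch of the changes is shown below. This is only a sketch: the `unfrozen_parameters` block is optional and mirrors the commented-out section already present in that config.

```yaml
# Full-finetune sketch, relative to gemma-3n-e2b-qlora.yml
# 1. Remove the QLoRA-specific keys:
#      adapter: qlora
#      load_in_4bit: true
#      lora_r, lora_alpha, lora_dropout, lora_target_modules
# 2. Optionally restrict training to the language-model layers only:
unfrozen_parameters:
  - model.language_model.*
  - lm_head
  - embed_tokens
```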
+ +## Optimization Guides + +- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) +- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) +- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) + +## Related Resources + +- [Gemma 3n Blog](https://ai.google.dev/gemma/docs/gemma-3n) +- [Axolotl Docs](https://docs.axolotl.ai) +- [Axolotl Website](https://axolotl.ai) +- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) +- [Axolotl Discord](https://discord.gg/7m9sfhzaf3) diff --git a/examples/gemma3n/gemma-3n-e2b-qlora.yml b/examples/gemma3n/gemma-3n-e2b-qlora.yml new file mode 100644 index 000000000..ad7ab5726 --- /dev/null +++ b/examples/gemma3n/gemma-3n-e2b-qlora.yml @@ -0,0 +1,74 @@ +base_model: google/gemma-3n-E2B-it + +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin +cut_cross_entropy: true + +load_in_8bit: false +load_in_4bit: true + +# for use with fft to only train on language model layers +# unfrozen_parameters: + # - model.language_model.* + # - lm_head + # - embed_tokens + + +chat_template: gemma3n +eot_tokens: + - +datasets: + - path: cgato/SlimOrcaDedupCleaned + type: chat_template + split: train[:1%] + field_messages: conversations + message_property_mappings: + role: from + content: value + +val_set_size: 0.0 +output_dir: ./outputs/out + +adapter: qlora +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +# lora_target_linear: # Does not work with gemma3n currently +lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj' + +sequence_len: 2048 +sample_packing: true +eval_sample_packing: true +pad_to_sequence_len: true + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 1 +micro_batch_size: 1 +num_epochs: 4 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: true + +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +resume_from_checkpoint: +logging_steps: 1 +# flash_attention: true # Any attention impl does not work with gemma3n now + +warmup_ratio: 0.1 +evals_per_epoch: +saves_per_epoch: 1 +weight_decay: 0.0 +special_tokens: diff --git a/examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml b/examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml new file mode 100644 index 000000000..d72d7fbc0 --- /dev/null +++ b/examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml @@ -0,0 +1,78 @@ +base_model: google/gemma-3n-E2B-it +processor_type: AutoProcessor + +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin +cut_cross_entropy: true + +# for use with fft to only train on language model layers +# unfrozen_parameters: + # - model.language_model.* + # - lm_head + # - embed_tokens + +load_in_4bit: true + +# these 3 lines are needed for now to handle vision chat templates w images +skip_prepare_dataset: true +remove_unused_columns: false +sample_packing: false + +# gemma3 doesn't seem to play nice with ddp +ddp_find_unused_parameters: true + +chat_template: gemma3n +eot_tokens: + - + +# sample dataset below requires downloading audio/image in advance +# wget https://huggingface.co/datasets/Nanobit/text-vision-audio-2k-test/resolve/main/African_elephant.jpg +# wget 
https://huggingface.co/datasets/Nanobit/text-vision-audio-2k-test/resolve/main/En-us-African_elephant.oga +datasets: + - path: Nanobit/text-vision-audio-2k-test + type: chat_template +dataset_prepared_path: +val_set_size: 0.01 +output_dir: ./outputs/out + +adapter: qlora +lora_model_dir: + +sequence_len: 2048 +pad_to_sequence_len: false + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj' + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 1 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: true +fp16: +tf32: true + +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +logging_steps: 1 +# flash_attention: true # Any attention impl does not work with gemma3n now + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 +weight_decay: 0.0 diff --git a/examples/gemma3n/gemma-3n-e2b-vision-qlora.yml b/examples/gemma3n/gemma-3n-e2b-vision-qlora.yml new file mode 100644 index 000000000..c87eca663 --- /dev/null +++ b/examples/gemma3n/gemma-3n-e2b-vision-qlora.yml @@ -0,0 +1,75 @@ +base_model: google/gemma-3n-E2B-it +processor_type: AutoProcessor + +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin +cut_cross_entropy: true + +# for use with fft to only train on language model layers +# unfrozen_parameters: + # - model.language_model.* + # - lm_head + # - embed_tokens + +load_in_4bit: true + +# these 3 lines are needed for now to handle vision chat templates w images +skip_prepare_dataset: true +remove_unused_columns: false +sample_packing: false + +# gemma3 doesn't seem to play nice with ddp +ddp_find_unused_parameters: true + +chat_template: gemma3n +eot_tokens: + - +datasets: + - path: HuggingFaceH4/llava-instruct-mix-vsft + type: chat_template + split: train[:1%] +dataset_prepared_path: +val_set_size: 0.01 +output_dir: ./outputs/out + +adapter: qlora +lora_model_dir: + +sequence_len: 2048 +pad_to_sequence_len: false + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj' + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 1 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: true +fp16: +tf32: true + +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +logging_steps: 1 +# flash_attention: true # Any attention impl does not work with gemma3n now + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 +weight_decay: 0.0 diff --git a/examples/glm4/qlora-32b.yaml b/examples/glm4/qlora-32b.yaml index 86d9b43f8..832abde05 100644 --- a/examples/glm4/qlora-32b.yaml +++ b/examples/glm4/qlora-32b.yaml @@ -17,7 +17,7 @@ lora_model_dir: sequence_len: 2048 sample_packing: true eval_sample_packing: true -pad_to_sequence_len: true + lora_r: 16 lora_alpha: 32 @@ -55,8 +55,10 @@ flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git 
a/examples/gpt-oss/README.md b/examples/gpt-oss/README.md new file mode 100644 index 000000000..6dadb8230 --- /dev/null +++ b/examples/gpt-oss/README.md @@ -0,0 +1,74 @@ +# Finetune OpenAI's GPT-OSS with Axolotl + +[GPT-OSS](https://huggingface.co/collections/openai/gpt-oss-68911959590a1634ba11c7a4) is a family of open-weight MoE models trained by OpenAI, released in August 2025. There are two variants: 20B and 120B. + +This guide shows how to fine-tune these models with Axolotl on multi-turn conversations with proper masking. + +## Getting started + +1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). + + Here is an example of how to install from pip: + +```bash +# Ensure you have PyTorch installed (PyTorch 2.6.0 min) +pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja +pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0' +``` + +2. Choose one of the following configs for training the 20B model (for the 120B model, see [below](#training-120b)): + +```bash +# LoRA SFT linear layers (1x48GB @ ~44GiB) +axolotl train examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml + +# FFT SFT with offloading (2x24GB @ ~21GiB/GPU) +axolotl train examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml + +# FFT SFT (8x48GB @ ~36GiB/GPU or 4x80GB @ ~46GiB/GPU) +axolotl train examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml +``` + +Note: Memory usage figures are taken from `device_mem_reserved(gib)` in the training logs. + +### Training 120B + +On 8xH100s: + +```bash +# FFT SFT with offloading (8x80GB @ ~49GiB/GPU) +axolotl train examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml +``` + +### Tool use + +GPT-OSS has strong built-in tool-calling abilities, and Axolotl supports tool-calling datasets for supervised fine-tuning. + +Here is an example dataset config: +```yaml +datasets: + - path: Nanobit/text-tools-2k-test + type: chat_template +``` + +See [Nanobit/text-tools-2k-test](https://huggingface.co/datasets/Nanobit/text-tools-2k-test) for the sample dataset. + +Refer to [our docs](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#using-tool-use) for more info. + +### TIPS + +- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). +- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template) (a sample row is sketched after this list).
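For reference, each training row in this format is just an OpenAI-style message list. Below is a minimal, hypothetical example row; it assumes the default `messages` field with `role`/`content` keys described in the chat_template docs linked above, and datasets that use different keys can be remapped with `field_messages` and `message_property_mappings`.

```json
{
  "messages": [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize the plot of Hamlet in one sentence."},
    {"role": "assistant", "content": "Prince Hamlet feigns madness while plotting revenge against his uncle Claudius, and the Danish court collapses in tragedy."},
    {"role": "user", "content": "Now in five words."},
    {"role": "assistant", "content": "Danish prince avenges murdered father."}
  ]
}
```

With multi-turn rows like this, the chat_template handling masks the non-assistant turns by default, which is the "proper masking" mentioned at the top of this guide.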
+ +## Optimization Guides + +- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) +- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) + +## Related Resources + +- [GPT-OSS Blog](https://openai.com/index/introducing-gpt-oss/) +- [Axolotl Docs](https://docs.axolotl.ai) +- [Axolotl Website](https://axolotl.ai) +- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) +- [Axolotl Discord](https://discord.gg/7m9sfhzaf3) diff --git a/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml b/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml new file mode 100644 index 000000000..4a9d51fdf --- /dev/null +++ b/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml @@ -0,0 +1,67 @@ +# the original mxfp4 quantized model is not supported with FSDP cpu_ram_efficient_loading +# FSDP cpu_ram_efficient_loading is used to reduce the initial CPU memory usage when loading the model +base_model: axolotl-ai-co/gpt-oss-120b-dequantized + +use_kernels: false + +dp_shard_size: 16 # requires 2x8xH100 nodes + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding + +datasets: + - path: HuggingFaceH4/Multilingual-Thinking + type: chat_template + field_thinking: thinking + template_thinking_key: thinking + +dataset_prepared_path: last_run_prepared +val_set_size: 0 +output_dir: ./outputs/gpt-oss-out/ + +sequence_len: 4096 +sample_packing: true +pad_to_sequence_len: true + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 2 +micro_batch_size: 1 +num_epochs: 1 + +optimizer: adamw_torch_fused # 8bit optimizers do not work with FSDP2 offload +lr_scheduler: constant_with_warmup +learning_rate: 2e-5 + +bf16: true +tf32: true + +flash_attention: true +attn_implementation: kernels-community/vllm-flash-attn3 + +gradient_checkpointing: true +activation_offloading: true + +logging_steps: 1 +saves_per_epoch: 1 + +warmup_ratio: 0.03 + +special_tokens: +eot_tokens: + - "<|end|>" + +fsdp_version: 2 +fsdp_config: + offload_params: true + state_dict_type: SHARDED_STATE_DICT + auto_wrap_policy: TRANSFORMER_BASED_WRAP + transformer_layer_cls_to_wrap: GptOssDecoderLayer + reshard_after_forward: true + cpu_ram_efficient_loading: true diff --git a/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml b/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml new file mode 100644 index 000000000..440f0c509 --- /dev/null +++ b/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml @@ -0,0 +1,58 @@ +base_model: openai/gpt-oss-20b +use_kernels: false +model_quantization_config: Mxfp4Config +model_quantization_config_kwargs: + dequantize: true + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding + +datasets: + - path: HuggingFaceH4/Multilingual-Thinking + type: chat_template + field_thinking: thinking + template_thinking_key: thinking + +dataset_prepared_path: last_run_prepared +val_set_size: 0 +output_dir: ./outputs/gpt-oss-out/ + +sequence_len: 4096 +sample_packing: true + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 2 +micro_batch_size: 1 +num_epochs: 1 + +optimizer: adamw_torch_8bit +lr_scheduler: constant_with_warmup +learning_rate: 2e-5 + +bf16: true +tf32: true + +flash_attention: true +attn_implementation: kernels-community/vllm-flash-attn3 + 
+gradient_checkpointing: true +activation_offloading: true + +logging_steps: 1 +saves_per_epoch: 1 + +warmup_ratio: 0.03 + +special_tokens: +eot_tokens: + - "<|end|>" + +# choose the zero3 configuration that best fits your system capabilities +deepspeed: deepspeed_configs/zero3_bf16.json diff --git a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml new file mode 100644 index 000000000..a6ba83433 --- /dev/null +++ b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml @@ -0,0 +1,68 @@ +base_model: openai/gpt-oss-20b +use_kernels: true +model_quantization_config: Mxfp4Config +model_quantization_config_kwargs: + dequantize: true + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding + +datasets: + - path: HuggingFaceH4/Multilingual-Thinking + type: chat_template + field_thinking: thinking + template_thinking_key: thinking + +dataset_prepared_path: last_run_prepared +val_set_size: 0 +output_dir: ./outputs/gpt-oss-out/ + +sequence_len: 4096 +sample_packing: true +pad_to_sequence_len: true + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 2 +micro_batch_size: 1 +num_epochs: 1 + +optimizer: adamw_torch_fused # 8bit optimizers do not work with FSDP2 offload +lr_scheduler: constant_with_warmup +learning_rate: 2e-5 + +bf16: true +tf32: true + +flash_attention: true +attn_implementation: kernels-community/vllm-flash-attn3 + +gradient_checkpointing: true +activation_offloading: true + +logging_steps: 1 +saves_per_epoch: 1 + +warmup_ratio: 0.03 + +special_tokens: +eot_tokens: + - "<|end|>" + +fsdp_version: 2 +fsdp_config: + offload_params: true + state_dict_type: SHARDED_STATE_DICT + auto_wrap_policy: TRANSFORMER_BASED_WRAP + transformer_layer_cls_to_wrap: GptOssDecoderLayer + reshard_after_forward: true + # cpu_ram_efficient_loading: true + +# cpu_ram_efficient_loading cannot be used with MXFP4 model quantization. 
+# It can only be used with a dequantized model like `axolotl-ai-co/gpt-oss-120b-dequantized` diff --git a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml new file mode 100644 index 000000000..aa658c863 --- /dev/null +++ b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml @@ -0,0 +1,64 @@ +base_model: openai/gpt-oss-20b +use_kernels: false +model_quantization_config: Mxfp4Config +model_quantization_config_kwargs: + dequantize: true + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding + +datasets: + - path: HuggingFaceH4/Multilingual-Thinking + type: chat_template + field_thinking: thinking + template_thinking_key: thinking + +dataset_prepared_path: last_run_prepared +val_set_size: 0 +output_dir: ./outputs/gpt-oss-out/ + +sequence_len: 4096 +sample_packing: true + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 2 +micro_batch_size: 1 +num_epochs: 1 + +optimizer: adamw_torch_8bit +lr_scheduler: constant_with_warmup +learning_rate: 2e-5 + +bf16: true +tf32: true + +flash_attention: true +attn_implementation: kernels-community/vllm-flash-attn3 + +gradient_checkpointing: true +activation_offloading: true + +logging_steps: 1 +saves_per_epoch: 1 + +warmup_ratio: 0.03 + +special_tokens: +eot_tokens: + - "<|end|>" + +fsdp_version: 2 +fsdp_config: + offload_params: false + state_dict_type: SHARDED_STATE_DICT + auto_wrap_policy: TRANSFORMER_BASED_WRAP + transformer_layer_cls_to_wrap: GptOssDecoderLayer + reshard_after_forward: true +# cpu_ram_efficient_loading: true diff --git a/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml b/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml new file mode 100644 index 000000000..c4e1a982d --- /dev/null +++ b/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml @@ -0,0 +1,67 @@ +base_model: openai/gpt-oss-20b +use_kernels: true +model_quantization_config: Mxfp4Config +model_quantization_config_kwargs: + dequantize: true + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +experimental_skip_move_to_device: true # prevent OOM by not putting model to GPU before sharding + +datasets: + - path: HuggingFaceH4/Multilingual-Thinking + type: chat_template + field_thinking: thinking + template_thinking_key: thinking + +dataset_prepared_path: last_run_prepared +val_set_size: 0 +output_dir: ./outputs/gpt-oss-out/ + +sequence_len: 4096 +sample_packing: true + +adapter: lora +lora_r: 8 +lora_alpha: 16 +lora_dropout: 0.0 # dropout not supported when using LoRA over expert parameters +lora_target_linear: true + +# TODO: not supported for now, see peft#2710 +#lora_target_parameters: # target the experts in the last two layers +# - "22._checkpoint_wrapped_module.mlp.experts.gate_up_proj" +# - "22._checkpoint_wrapped_module.mlp.experts.down_proj" +# - "23._checkpoint_wrapped_module.mlp.experts.gate_up_proj" +# - "23._checkpoint_wrapped_module.mlp.experts.down_proj" + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 8 +micro_batch_size: 1 +num_epochs: 1 + +optimizer: adamw_torch_8bit +lr_scheduler: constant_with_warmup +learning_rate: 2e-4 + +bf16: true +tf32: true + +flash_attention: true +attn_implementation: kernels-community/vllm-flash-attn3 + +gradient_checkpointing: true +activation_offloading: true + +logging_steps: 1 +saves_per_epoch: 1 +warmup_ratio: 0.1 + 
+special_tokens: +eot_tokens: + - "<|end|>" diff --git a/examples/jamba/qlora.yaml b/examples/jamba/qlora.yaml index 2cb0eea41..538ed3a10 100644 --- a/examples/jamba/qlora.yaml +++ b/examples/jamba/qlora.yaml @@ -49,8 +49,10 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/jamba/qlora_deepspeed.yaml b/examples/jamba/qlora_deepspeed.yaml index d13ce6483..b288635e7 100644 --- a/examples/jamba/qlora_deepspeed.yaml +++ b/examples/jamba/qlora_deepspeed.yaml @@ -48,10 +48,12 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: saves_per_epoch: 1 deepspeed: deepspeed_configs/zero2.json weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/jamba/qlora_fsdp_large.yaml b/examples/jamba/qlora_fsdp_large.yaml index 6badaba19..150e5e2ec 100644 --- a/examples/jamba/qlora_fsdp_large.yaml +++ b/examples/jamba/qlora_fsdp_large.yaml @@ -23,7 +23,7 @@ save_safetensors: true adapter: qlora sequence_len: 2048 sample_packing: true -pad_to_sequence_len: true + lora_r: 16 lora_alpha: 16 @@ -47,7 +47,7 @@ gradient_checkpointing_kwargs: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 @@ -64,3 +64,5 @@ fsdp_config: fsdp_transformer_layer_cls_to_wrap: JambaAttentionDecoderLayer,JambaMambaDecoderLayer fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/lfm2/README.md b/examples/lfm2/README.md new file mode 100644 index 000000000..eb9ca911f --- /dev/null +++ b/examples/lfm2/README.md @@ -0,0 +1,7 @@ +# Liquid Foundation Models 2 + +LFM2 support in transformers exists in the main branch, but is not yet included in the transformers release. 
+ +```bash +pip install --upgrade --no-deps --force-reinstall git+https://github.com/huggingface/transformers.git +``` diff --git a/examples/lfm2/lfm2-350m-fft.yaml b/examples/lfm2/lfm2-350m-fft.yaml new file mode 100644 index 000000000..16a0a028e --- /dev/null +++ b/examples/lfm2/lfm2-350m-fft.yaml @@ -0,0 +1,50 @@ +base_model: LiquidAI/LFM2-350M + +chunked_cross_entropy: true + +chat_template: tokenizer_default +eot_tokens: + - "<|im_end|>" +datasets: + - path: mlabonne/FineTome-100k + type: chat_template + split: train[:20%] + field_messages: conversations + message_field_role: from + message_field_content: value +dataset_prepared_path: last_run_prepared +val_set_size: 0.05 +output_dir: ./outputs/out + +sequence_len: 4096 +sample_packing: true + + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 2 +micro_batch_size: 4 +num_epochs: 1 +optimizer: adamw_torch_fused +lr_scheduler: cosine +learning_rate: 5e-5 + +bf16: true +tf32: true + +gradient_checkpointing: false +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 2 +saves_per_epoch: 1 + +weight_decay: 0.0 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-2/fft_optimized.yml b/examples/llama-2/fft_optimized.yml index 86b1b6a21..ea119348e 100644 --- a/examples/llama-2/fft_optimized.yml +++ b/examples/llama-2/fft_optimized.yml @@ -14,7 +14,7 @@ output_dir: ./outputs/out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora_model_dir: @@ -45,13 +45,14 @@ logging_steps: 1 flash_attention: true flash_attn_cross_entropy: false flash_attn_rms_norm: true -flash_attn_fuse_qkv: false flash_attn_fuse_mlp: true -warmup_steps: 100 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 deepspeed: #deepspeed_configs/zero2.json # multi-gpu only weight_decay: 0.1 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-2/gptq-lora.yml b/examples/llama-2/gptq-lora.yml index 0f1b34016..de1caaa05 100644 --- a/examples/llama-2/gptq-lora.yml +++ b/examples/llama-2/gptq-lora.yml @@ -56,7 +56,7 @@ logging_steps: 1 flash_attention: sdp_attention: flash_optimum: -warmup_steps: 100 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 @@ -64,3 +64,5 @@ special_tokens: bos_token: "" eos_token: "" unk_token: "" + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-2/lisa.yml b/examples/llama-2/lisa.yml index a76a792ae..d21c01a49 100644 --- a/examples/llama-2/lisa.yml +++ b/examples/llama-2/lisa.yml @@ -14,7 +14,7 @@ output_dir: ./outputs/lisa-out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora_model_dir: @@ -49,10 +49,9 @@ logging_steps: 1 flash_attention: true flash_attn_cross_entropy: false flash_attn_rms_norm: true -flash_attn_fuse_qkv: false flash_attn_fuse_mlp: true -warmup_steps: 100 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 @@ -60,3 +59,5 @@ special_tokens: bos_token: "" eos_token: "" unk_token: "" + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-2/loftq.yml b/examples/llama-2/loftq.yml index 22dbf2d99..619e5bcce 100644 --- a/examples/llama-2/loftq.yml +++ b/examples/llama-2/loftq.yml @@ -14,7 +14,7 @@ output_dir: 
./outputs/lora-out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora lora_model_dir: @@ -47,8 +47,10 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-2/lora.yml b/examples/llama-2/lora.yml index 679aed3a9..0a677f11a 100644 --- a/examples/llama-2/lora.yml +++ b/examples/llama-2/lora.yml @@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora lora_model_dir: @@ -47,8 +47,10 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-2/qlora-fsdp.yml b/examples/llama-2/qlora-fsdp.yml index a42eabd4b..54f4b86b4 100644 --- a/examples/llama-2/qlora-fsdp.yml +++ b/examples/llama-2/qlora-fsdp.yml @@ -20,7 +20,7 @@ lora_model_dir: sequence_len: 512 sample_packing: false -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 @@ -50,7 +50,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 @@ -67,3 +67,5 @@ fsdp_config: fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer fsdp_state_dict_type: FULL_STATE_DICT special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-2/qlora.yml b/examples/llama-2/qlora.yml index de65928bc..327d88c15 100644 --- a/examples/llama-2/qlora.yml +++ b/examples/llama-2/qlora.yml @@ -20,7 +20,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 @@ -48,8 +48,10 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-2/relora.yml b/examples/llama-2/relora.yml index e0a5f7068..fabdf0e0f 100644 --- a/examples/llama-2/relora.yml +++ b/examples/llama-2/relora.yml @@ -18,16 +18,19 @@ lora_model_dir: sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + lora_r: 8 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true -relora_steps: 150 -relora_warmup_steps: 10 +relora: true +relora_prune_ratio: 0.9 relora_cpu_offload: false +jagged_restart_steps: 150 +jagged_restart_warmup_steps: 10 +jagged_restart_anneal_steps: false wandb_project: wandb_entity: @@ -50,7 +53,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 @@ -58,3 +61,5 @@ special_tokens: bos_token: "" eos_token: "" unk_token: "" + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3-vision/lora-11b.yaml b/examples/llama-3-vision/lora-11b.yaml index f4883e903..adbb61643 100644 --- a/examples/llama-3-vision/lora-11b.yaml +++ b/examples/llama-3-vision/lora-11b.yaml @@ -15,8 +15,7 @@ datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: 
chat_template split: train[:1%] - field_messages: messages -dataset_prepared_path: last_run_prepared +dataset_prepared_path: val_set_size: 0.0 output_dir: ./outputs/out @@ -29,7 +28,7 @@ pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 -lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' +lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: @@ -50,10 +49,12 @@ tf32: true gradient_checkpointing: true logging_steps: 1 -flash_attention: true -eager_attention: +# flash_attention: true # use for text-only mode +sdp_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3/3b-fp8-fsdp2.yaml b/examples/llama-3/3b-fp8-fsdp2.yaml new file mode 100644 index 000000000..bea698c0e --- /dev/null +++ b/examples/llama-3/3b-fp8-fsdp2.yaml @@ -0,0 +1,76 @@ +base_model: meta-llama/Llama-3.2-3B +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +load_in_8bit: false +load_in_4bit: false +strict: false + +plugins: + - axolotl.integrations.liger.LigerPlugin + +liger_rope: true +liger_rms_norm: true +liger_glu_activation: true +liger_layer_norm: true +liger_fused_linear_cross_entropy: true + +datasets: + - path: yahma/alpaca-cleaned + type: alpaca + +output_dir: ./outputs/fp8_out/ + +sample_packing: true +pad_to_sequence_len: true +sequence_len: 512 + +flex_attention: true +flex_attn_compile_kwargs: + dynamic: false + mode: max-autotune-no-cudagraphs + +torch_compile: true + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 1 +micro_batch_size: 16 +num_epochs: 1 +optimizer: adamw_torch_fused + +cosine_constant_lr_ratio: 0 +cosine_min_lr_ratio: 1.0 +learning_rate: 2e-5 +save_only_model: true + +fp8: true +fp8_enable_fsdp_float8_all_gather: true + +resume_from_checkpoint: +logging_steps: 1 + +evals_per_epoch: 1 +saves_per_epoch: 1 + +warmup_steps: 10 +weight_decay: 0.0 + +fsdp_version: 2 +fsdp_config: + offload_params: false + auto_wrap_policy: TRANSFORMER_BASED_WRAP + transformer_layer_cls_to_wrap: LlamaDecoderLayer + state_dict_type: FULL_STATE_DICT + sharding_strategy: FULL_SHARD + reshard_after_forward: true + activation_checkpointing: false + +special_tokens: + pad_token: <|end_of_text|> + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3/3b-qat-fsdp2.yaml b/examples/llama-3/3b-qat-fsdp2.yaml new file mode 100644 index 000000000..35e3461e2 --- /dev/null +++ b/examples/llama-3/3b-qat-fsdp2.yaml @@ -0,0 +1,81 @@ +base_model: meta-llama/Llama-3.2-3B +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +load_in_8bit: false +load_in_4bit: false +strict: false + +plugins: + - axolotl.integrations.liger.LigerPlugin + +liger_rope: true +liger_rms_norm: true +liger_glu_activation: true +liger_layer_norm: true +liger_fused_linear_cross_entropy: true + +datasets: + - path: yahma/alpaca-cleaned + type: alpaca + +output_dir: ./outputs/qat_out/ + +sample_packing: true + +sequence_len: 512 + +flex_attention: true +flex_attn_compile_kwargs: + dynamic: false + mode: max-autotune-no-cudagraphs + +qat: + activation_dtype: int8 + weight_dtype: int4 + group_size: 32 + 
+wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 1 +micro_batch_size: 16 +num_epochs: 1 +optimizer: adamw_torch_fused + +cosine_constant_lr_ratio: 0 +cosine_min_lr_ratio: 1.0 +learning_rate: 2e-5 +save_only_model: true +bf16: true + +resume_from_checkpoint: +logging_steps: 1 + +evals_per_epoch: 1 +saves_per_epoch: 1 + +warmup_ratio: 0.1 +weight_decay: 0.0 +fsdp: + - full_shard + - auto_wrap + +fsdp_config: + fsdp_version: 2 + fsdp_offload_params: false + fsdp_cpu_ram_efficient_loading: true + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer + fsdp_state_dict_type: FULL_STATE_DICT + fsdp_sharding_strategy: FULL_SHARD + fsdp_reshard_after_forward: true + fsdp_activation_checkpointing: true + +special_tokens: + pad_token: <|end_of_text|> + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3/fft-8b-liger-fsdp.yaml b/examples/llama-3/fft-8b-liger-fsdp.yaml index eccfa6d8c..a655b97a9 100644 --- a/examples/llama-3/fft-8b-liger-fsdp.yaml +++ b/examples/llama-3/fft-8b-liger-fsdp.yaml @@ -26,7 +26,7 @@ output_dir: ./outputs/out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + wandb_project: wandb_entity: @@ -51,7 +51,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 100 +warmup_ratio: 0.1 evals_per_epoch: 2 saves_per_epoch: 1 weight_decay: 0.0 @@ -72,3 +72,5 @@ fsdp_config: special_tokens: pad_token: <|finetune_right_pad_id|> eos_token: <|eot_id|> + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3/fft-8b.yaml b/examples/llama-3/fft-8b.yaml index fdae3e6c4..c72ec6662 100644 --- a/examples/llama-3/fft-8b.yaml +++ b/examples/llama-3/fft-8b.yaml @@ -11,7 +11,7 @@ output_dir: ./outputs/out sequence_len: 8192 sample_packing: true -pad_to_sequence_len: true + wandb_project: wandb_entity: @@ -36,9 +36,11 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 100 +warmup_ratio: 0.1 evals_per_epoch: 2 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: <|end_of_text|> + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3/instruct-dpo-lora-8b.yml b/examples/llama-3/instruct-dpo-lora-8b.yml index 13082294f..cf823353b 100644 --- a/examples/llama-3/instruct-dpo-lora-8b.yml +++ b/examples/llama-3/instruct-dpo-lora-8b.yml @@ -5,6 +5,10 @@ tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name +special_tokens: + pad_token: <|finetune_right_pad_id|> + eos_token: <|eot_id|> + load_in_8bit: true load_in_4bit: false @@ -33,7 +37,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: @@ -63,7 +67,9 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3/instruct-lora-8b.yml b/examples/llama-3/instruct-lora-8b.yml index acab862f6..69e17b9cf 100644 --- a/examples/llama-3/instruct-lora-8b.yml +++ b/examples/llama-3/instruct-lora-8b.yml @@ -28,7 +28,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: false 
-pad_to_sequence_len: true + adapter: lora lora_model_dir: @@ -58,9 +58,11 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: <|end_of_text|> + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3/lora-1b-deduplicate-dpo.yml b/examples/llama-3/lora-1b-deduplicate-dpo.yml index 10e9747cb..2897636f4 100644 --- a/examples/llama-3/lora-1b-deduplicate-dpo.yml +++ b/examples/llama-3/lora-1b-deduplicate-dpo.yml @@ -49,7 +49,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: @@ -79,7 +79,9 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3/lora-1b-deduplicate-sft.yml b/examples/llama-3/lora-1b-deduplicate-sft.yml index 630ec92f6..c5190d892 100644 --- a/examples/llama-3/lora-1b-deduplicate-sft.yml +++ b/examples/llama-3/lora-1b-deduplicate-sft.yml @@ -22,7 +22,7 @@ dataset_exact_deduplication: true sequence_len: 4096 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: @@ -55,9 +55,11 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: <|end_of_text|> + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3/lora-1b-kernels.yml b/examples/llama-3/lora-1b-kernels.yml index a2d07ca49..0bcf46b17 100644 --- a/examples/llama-3/lora-1b-kernels.yml +++ b/examples/llama-3/lora-1b-kernels.yml @@ -14,7 +14,7 @@ lora_model_dir: sequence_len: 2048 sample_packing: true -pad_to_sequence_len: true + lora_r: 16 lora_alpha: 32 @@ -59,9 +59,11 @@ flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: "<|end_of_text|>" + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3/lora-1b-ray.yml b/examples/llama-3/lora-1b-ray.yml index bb23164eb..46c83348e 100644 --- a/examples/llama-3/lora-1b-ray.yml +++ b/examples/llama-3/lora-1b-ray.yml @@ -15,7 +15,7 @@ lora_model_dir: sequence_len: 2048 sample_packing: true eval_sample_packing: true -pad_to_sequence_len: true + lora_r: 16 lora_alpha: 32 @@ -53,7 +53,7 @@ flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 @@ -64,3 +64,5 @@ special_tokens: use_ray: true ray_num_workers: 4 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3/lora-1b-sample-packing-sequentially.yml b/examples/llama-3/lora-1b-sample-packing-sequentially.yml index 769dd32e6..dba78597b 100644 --- a/examples/llama-3/lora-1b-sample-packing-sequentially.yml +++ b/examples/llama-3/lora-1b-sample-packing-sequentially.yml @@ -24,7 +24,7 @@ sample_packing: true sample_packing_sequentially: true curriculum_sampling: true eval_sample_packing: false 
-pad_to_sequence_len: true + adapter: lora lora_model_dir: @@ -57,9 +57,11 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: <|end_of_text|> + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3/lora-1b.yml b/examples/llama-3/lora-1b.yml index c31a9f39a..2ae2f0056 100644 --- a/examples/llama-3/lora-1b.yml +++ b/examples/llama-3/lora-1b.yml @@ -5,7 +5,7 @@ base_model: NousResearch/Llama-3.2-1B datasets: - path: teknium/GPT4-LLM-Cleaned type: alpaca -dataset_prepared_path: last_run_prepared + val_set_size: 0.1 output_dir: ./outputs/lora-out @@ -15,7 +15,7 @@ lora_model_dir: sequence_len: 2048 sample_packing: true eval_sample_packing: true -pad_to_sequence_len: true + lora_r: 16 lora_alpha: 32 @@ -38,6 +38,7 @@ wandb_log_model: gradient_accumulation_steps: 2 micro_batch_size: 2 num_epochs: 1 + optimizer: adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 @@ -53,9 +54,11 @@ flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: "<|end_of_text|>" + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3/lora-8b.yml b/examples/llama-3/lora-8b.yml index ad50cd38a..d72b6527d 100644 --- a/examples/llama-3/lora-8b.yml +++ b/examples/llama-3/lora-8b.yml @@ -18,7 +18,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: @@ -51,9 +51,11 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: <|end_of_text|> + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3/qlora-1b-kto.yaml b/examples/llama-3/qlora-1b-kto.yaml index 89a51ea68..a6a84e7b1 100644 --- a/examples/llama-3/qlora-1b-kto.yaml +++ b/examples/llama-3/qlora-1b-kto.yaml @@ -55,9 +55,11 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 20 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: "<|end_of_text|>" + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3/qlora-1b.yml b/examples/llama-3/qlora-1b.yml index 5c8fe6628..1e4f97438 100644 --- a/examples/llama-3/qlora-1b.yml +++ b/examples/llama-3/qlora-1b.yml @@ -18,7 +18,7 @@ lora_model_dir: sequence_len: 2048 sample_packing: true eval_sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 @@ -56,9 +56,11 @@ flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: "<|end_of_text|>" + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3/qlora-fsdp-405b.yaml b/examples/llama-3/qlora-fsdp-405b.yaml index 2b7d51925..8ddb84d65 100644 --- a/examples/llama-3/qlora-fsdp-405b.yaml +++ b/examples/llama-3/qlora-fsdp-405b.yaml @@ -18,7 +18,7 @@ adapter: qlora sequence_len: 2048 sample_packing: true 
-pad_to_sequence_len: true + lora_r: 16 lora_alpha: 16 @@ -41,7 +41,7 @@ gradient_checkpointing_kwargs: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 @@ -60,3 +60,5 @@ fsdp_config: fsdp_sharding_strategy: FULL_SHARD special_tokens: pad_token: <|finetune_right_pad_id|> + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3/qlora-fsdp-70b.yaml b/examples/llama-3/qlora-fsdp-70b.yaml index 412b6721c..c052bc19d 100644 --- a/examples/llama-3/qlora-fsdp-70b.yaml +++ b/examples/llama-3/qlora-fsdp-70b.yaml @@ -20,7 +20,7 @@ lora_model_dir: sequence_len: 512 sample_packing: false -pad_to_sequence_len: true + lora_r: 8 lora_alpha: 16 @@ -50,7 +50,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 @@ -69,3 +69,5 @@ fsdp_config: fsdp_sharding_strategy: FULL_SHARD special_tokens: pad_token: <|end_of_text|> + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3/qlora.yml b/examples/llama-3/qlora.yml index 4cc9fc3db..a8f47a0e2 100644 --- a/examples/llama-3/qlora.yml +++ b/examples/llama-3/qlora.yml @@ -20,7 +20,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 @@ -48,9 +48,11 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: "<|end_of_text|>" + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-3/sparse-finetuning.yaml b/examples/llama-3/sparse-finetuning.yaml index 1bbb88028..348756b70 100644 --- a/examples/llama-3/sparse-finetuning.yaml +++ b/examples/llama-3/sparse-finetuning.yaml @@ -16,7 +16,7 @@ output_dir: ./outputs/out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + eval_sample_packing: false wandb_project: @@ -47,7 +47,7 @@ logging_steps: 1 xformers_attention: flash_attention: true -warmup_steps: 100 +warmup_ratio: 0.1 evals_per_epoch: 2 eval_table_size: saves_per_epoch: 1 @@ -75,3 +75,5 @@ llmcompressor: ] start: 0 save_compressed: true + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml b/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml index 2be94f4ef..b20f79758 100644 --- a/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml +++ b/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml @@ -47,7 +47,7 @@ output_dir: ./outputs/out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + gradient_accumulation_steps: 1 micro_batch_size: 1 @@ -66,7 +66,7 @@ gradient_checkpointing: offload gradient_checkpointing_kwargs: use_reentrant: false -warmup_steps: 20 +warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 @@ -84,5 +84,7 @@ fsdp_config: fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD special_tokens: - pad_token: <|finetune_right_pad_id|> + pad_token: <|finetune_right_pad|> eos_token: <|eot|> + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml 
b/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml index eeae872a6..40449009c 100644 --- a/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml +++ b/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml @@ -48,7 +48,7 @@ output_dir: ./outputs/out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + wandb_project: wandb_entity: @@ -69,7 +69,7 @@ tf32: true logging_steps: 1 flash_attention: true -warmup_steps: 100 +warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 @@ -88,5 +88,7 @@ fsdp_config: fsdp_sharding_strategy: FULL_SHARD fsdp_activation_checkpointing: true special_tokens: - pad_token: <|finetune_right_pad_id|> + pad_token: <|finetune_right_pad|> eos_token: <|eot|> + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml b/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml index 17ad70634..abdc51378 100644 --- a/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml +++ b/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml @@ -51,7 +51,7 @@ output_dir: ./outputs/out sequence_len: 4096 # up to 8k will work on a single H100 sample_packing: true -pad_to_sequence_len: true + wandb_project: wandb_entity: @@ -76,10 +76,12 @@ gradient_checkpointing: offload gradient_checkpointing_kwargs: use_reentrant: false -warmup_steps: 20 +warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: - pad_token: <|finetune_right_pad_id|> + pad_token: <|finetune_right_pad|> eos_token: <|eot|> + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml b/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml index eff708e4d..9975949bb 100644 --- a/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml +++ b/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml @@ -65,7 +65,7 @@ tf32: true logging_steps: 1 flash_attention: true -warmup_steps: 100 +warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 @@ -84,5 +84,7 @@ fsdp_config: fsdp_sharding_strategy: FULL_SHARD fsdp_activation_checkpointing: true special_tokens: - pad_token: <|finetune_right_pad_id|> + pad_token: <|finetune_right_pad|> eos_token: <|eot|> + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml b/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml index 9a411883e..02c04c691 100644 --- a/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml +++ b/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml @@ -46,7 +46,7 @@ output_dir: ./outputs/out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + gradient_accumulation_steps: 1 micro_batch_size: 2 @@ -64,7 +64,7 @@ flex_attn_compile_kwargs: dynamic: false mode: max-autotune-no-cudagraphs -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 @@ -74,7 +74,7 @@ fsdp: fsdp_config: fsdp_version: 2 fsdp_offload_params: false - fsdp_cpu_ram_efficient_loading: true + # fsdp_cpu_ram_efficient_loading: true # does not work with load_in_8bit/4bit fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer fsdp_state_dict_type: SHARDED_STATE_DICT @@ -82,5 +82,7 @@ fsdp_config: fsdp_reshard_after_forward: true fsdp_activation_checkpointing: true special_tokens: - 
pad_token: <|finetune_right_pad_id|> + pad_token: <|finetune_right_pad|> eos_token: <|eot|> + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-4/scout-qlora-single-h100-flex.yaml b/examples/llama-4/scout-qlora-single-h100-flex.yaml index 20352f81e..33a691189 100644 --- a/examples/llama-4/scout-qlora-single-h100-flex.yaml +++ b/examples/llama-4/scout-qlora-single-h100-flex.yaml @@ -51,7 +51,7 @@ output_dir: ./outputs/out sequence_len: 4096 # up to 8k will work on a single H100 sample_packing: true -pad_to_sequence_len: true + gradient_accumulation_steps: 1 micro_batch_size: 1 @@ -74,11 +74,13 @@ gradient_checkpointing_kwargs: use_reentrant: false logging_steps: 1 -warmup_steps: 20 +warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: - pad_token: <|finetune_right_pad_id|> + pad_token: <|finetune_right_pad|> eos_token: <|eot|> + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml b/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml index 9fbd34107..ac7e05659 100644 --- a/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml +++ b/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml @@ -67,7 +67,7 @@ flex_attn_compile_kwargs: dynamic: false mode: max-autotune-no-cudagraphs -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 @@ -85,5 +85,7 @@ fsdp_config: fsdp_reshard_after_forward: true fsdp_activation_checkpointing: true special_tokens: - pad_token: <|finetune_right_pad_id|> + pad_token: <|finetune_right_pad|> eos_token: <|eot|> + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/llava/lora-7b.yaml b/examples/llava/lora-7b.yaml index 54edd04dc..77ef7474d 100644 --- a/examples/llava/lora-7b.yaml +++ b/examples/llava/lora-7b.yaml @@ -11,8 +11,7 @@ datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] - field_messages: messages -dataset_prepared_path: last_run_prepared +dataset_prepared_path: val_set_size: 0.0 output_dir: ./outputs/out @@ -25,7 +24,7 @@ pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 -lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' +lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: @@ -53,3 +52,5 @@ warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/magistral/README.md b/examples/magistral/README.md new file mode 100644 index 000000000..48ce712da --- /dev/null +++ b/examples/magistral/README.md @@ -0,0 +1,87 @@ +# Finetune Magistral Small with Axolotl + +Magistral Small is a 24B parameter opensource model from MistralAI found on HuggingFace at [2506](https://huggingface.co/mistralai/Magistral-Small-2506) and [2507](https://huggingface.co/mistralai/Magistral-Small-2507) (see [Thinking](#thinking)). This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking. + +MistralAI has also released a proprietary medium-sized version called Magistral Medium. + +Thanks to the team at MistralAI for giving us early access to prepare for this release. + +## Getting started + +1. 
Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). + + Here is an example of how to install from pip: + +```bash +# Ensure you have PyTorch installed (PyTorch 2.6.0 min) +pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja +pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0' +``` + +2. Run the finetuning example: + +```bash +axolotl train examples/magistral/magistral-small-qlora.yaml +``` + +This config uses about 24GB VRAM. + +Let us know how it goes. Happy finetuning! 🚀 + +### Thinking + +MistralAI has released their [2507](https://huggingface.co/mistralai/Magistral-Small-2507) model with thinking capabilities. The model requires the multi-content dataset format with support for an extra `thinking` content block within system and assistant messages. + +Example format: + +```json +{ + "messages": [ + {"role": "system", "content": [{ "type": "text", "text": "{SYSTEM_PROMPT}"}]}, + {"role": "user", "content": [{ "type": "text", "text": "..."}]}, + {"role": "assistant", "content": [{ "type": "thinking", "thinking": "..."}, { "type": "text", "text": "..." }]} + ] +} +``` + +Example config: `./magistral-small-think-qlora.yaml`. + +The `thinking` section also supports an optional arg `closed: bool` (`True` by default) which controls whether the closing `[/THINK]` tag is added; an illustrative sample is sketched at the end of this README. + +Limitations: +- You cannot mix `content: str` with `content: list[dict]`, as `datasets.load_dataset` may complain about mixed types for the `content` key. +- This mode does not work with custom `train_detail` and `training` at the moment. + +### TIPS + +- We recommend using the same (or a similar) system prompt that the model was tuned with. You can find it in the model repo in the file `SYSTEM_PROMPT.txt`. +- For inference, the official MistralAI team recommends `top_p: 0.95` and `temperature: 0.7` with `max_tokens: 40960`. +- You can run a full finetuning by removing `adapter: qlora` and `load_in_4bit: true` from the config. +- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). +- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). + +## Optimization Guides + +- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) +- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) +- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) + +## Limitations + +We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only. + +In addition, we do not support overriding tokens yet. + +## Related Resources + +- [MistralAI Magistral Blog](https://mistral.ai/news/magistral/) +- [Axolotl Docs](https://docs.axolotl.ai) +- [Axolotl Website](https://axolotl.ai) +- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) +- [Axolotl Discord](https://discord.gg/7m9sfhzaf3) + + +## Future Work + +- Add parity to Preference Tuning, RL, Multi-modal, etc. +- Add parity to other tokenizer configs like overriding tokens. 
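To make the optional `closed` flag concrete, here is a small illustrative sample that leaves the assistant's thinking block open. This is a sketch, not an excerpt from the MistralAI docs: the message and content keys come from the JSON example in the Thinking section, but the exact placement of `closed` on the thinking block is an assumption to verify against your own data. It is written as YAML only so the flag can be annotated inline; actual dataset files are typically JSON/JSONL as shown earlier.

```yaml
# Hypothetical sample (assumption: `closed` sits on the thinking content block).
messages:
  - role: user
    content:
      - type: text
        text: "How many r's are in 'strawberry'?"
  - role: assistant
    content:
      - type: thinking
        thinking: "Spell it out and count the r's..."
        closed: false   # assumed usage: skip the closing [/THINK] tag for this sample
      - type: text
        text: "There are three."
```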
diff --git a/examples/magistral/magistral-small-fsdp-qlora.yaml b/examples/magistral/magistral-small-fsdp-qlora.yaml new file mode 100644 index 000000000..d46c49fe0 --- /dev/null +++ b/examples/magistral/magistral-small-fsdp-qlora.yaml @@ -0,0 +1,76 @@ +base_model: mistralai/Magistral-Small-2506 + +# Enable to use mistral-common tokenizer +tokenizer_use_mistral_common: true + +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +load_in_8bit: false +load_in_4bit: true + +datasets: + - path: fozziethebeat/alpaca_messages_2k_test + type: chat_template + +dataset_prepared_path: last_run_prepared +val_set_size: 0.1 +output_dir: ./outputs/lora-out + +adapter: qlora +lora_model_dir: + +sequence_len: 2048 +sample_packing: true +eval_sample_packing: false + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_target_modules: + - gate_proj + - down_proj + - up_proj + - q_proj + - v_proj + - k_proj + - o_proj + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 1 +optimizer: adamw_torch_fused +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: false + +gradient_checkpointing: +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 + +fsdp: + - full_shard + - auto_wrap +fsdp_config: + fsdp_state_dict_type: FULL_STATE_DICT + fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer + fsdp_activation_checkpointing: true + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/magistral/magistral-small-qlora.yaml b/examples/magistral/magistral-small-qlora.yaml new file mode 100644 index 000000000..188924d39 --- /dev/null +++ b/examples/magistral/magistral-small-qlora.yaml @@ -0,0 +1,67 @@ +base_model: mistralai/Magistral-Small-2506 + +# Enable to use mistral-common tokenizer +tokenizer_use_mistral_common: true + +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +load_in_8bit: false +load_in_4bit: true + +datasets: + - path: fozziethebeat/alpaca_messages_2k_test + type: chat_template + +dataset_prepared_path: last_run_prepared +val_set_size: 0.1 +output_dir: ./outputs/lora-out + +adapter: qlora +lora_model_dir: + +sequence_len: 2048 +sample_packing: true + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_target_modules: + - gate_proj + - down_proj + - up_proj + - q_proj + - v_proj + - k_proj + - o_proj + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 1 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/magistral/magistral-small-think-qlora.yaml b/examples/magistral/magistral-small-think-qlora.yaml new file mode 100644 index 000000000..b715b3156 --- /dev/null +++ b/examples/magistral/magistral-small-think-qlora.yaml @@ -0,0 +1,67 @@ +base_model: 
mistralai/Magistral-Small-2507 + +# Enable to use mistral-common tokenizer +tokenizer_use_mistral_common: true + +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +load_in_8bit: false +load_in_4bit: true + +datasets: + - path: Nanobit/text-think-2k-test + type: chat_template + +dataset_prepared_path: last_run_prepared +val_set_size: 0 +output_dir: ./outputs/lora-out + +adapter: qlora +lora_model_dir: + +sequence_len: 2048 +sample_packing: true + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_target_modules: + - gate_proj + - down_proj + - up_proj + - q_proj + - v_proj + - k_proj + - o_proj + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 1 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/mamba/config.yml b/examples/mamba/config.yml index 3d4583932..e6b335804 100644 --- a/examples/mamba/config.yml +++ b/examples/mamba/config.yml @@ -41,10 +41,12 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: tokens: save_safetensors: False + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/mistral/bigstral-ds-zero3.yaml b/examples/mistral/bigstral-ds-zero3.yaml index f626a92a1..a8dc36216 100644 --- a/examples/mistral/bigstral-ds-zero3.yaml +++ b/examples/mistral/bigstral-ds-zero3.yaml @@ -27,7 +27,7 @@ output_dir: ./outputs/out sequence_len: 2048 sample_packing: true -pad_to_sequence_len: true + gradient_accumulation_steps: 1 micro_batch_size: 1 @@ -53,3 +53,5 @@ special_tokens: eos_token: "<|im_end|>" tokens: - "<|im_start|>" + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/mistral/config.yml b/examples/mistral/config.yml index 15edffb44..e74162537 100644 --- a/examples/mistral/config.yml +++ b/examples/mistral/config.yml @@ -14,7 +14,7 @@ output_dir: ./outputs/out sequence_len: 8192 sample_packing: true -pad_to_sequence_len: true + eval_sample_packing: false wandb_project: @@ -38,8 +38,10 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/mistral/lora-mps.yml b/examples/mistral/lora-mps.yml index e6f46affb..07ce191dc 100644 --- a/examples/mistral/lora-mps.yml +++ b/examples/mistral/lora-mps.yml @@ -18,7 +18,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 @@ -59,8 +59,10 @@ sdp_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git 
a/examples/mistral/lora.yml b/examples/mistral/lora.yml index 9af4274fd..757287f19 100644 --- a/examples/mistral/lora.yml +++ b/examples/mistral/lora.yml @@ -20,7 +20,7 @@ lora_model_dir: sequence_len: 8192 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 @@ -59,8 +59,10 @@ flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/mistral/mistral-dpo-qlora.yml b/examples/mistral/mistral-dpo-qlora.yml index af707973f..8fea14a0f 100644 --- a/examples/mistral/mistral-dpo-qlora.yml +++ b/examples/mistral/mistral-dpo-qlora.yml @@ -31,7 +31,7 @@ output_dir: ./outputs/dpo-qlora sequence_len: 2048 sample_packing: false -pad_to_sequence_len: true + adapter: qlora lora_model_dir: @@ -73,10 +73,12 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: false -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: bos_token: "<|im_start|>" eos_token: "<|im_end|>" + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/mistral/mistral-qlora-fsdp.yml b/examples/mistral/mistral-qlora-fsdp.yml index e234b19a2..8e1f03d24 100644 --- a/examples/mistral/mistral-qlora-fsdp.yml +++ b/examples/mistral/mistral-qlora-fsdp.yml @@ -56,7 +56,7 @@ flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 @@ -74,3 +74,5 @@ fsdp_config: fsdp_state_dict_type: FULL_STATE_DICT fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/mistral/mistral-qlora-orpo.yml b/examples/mistral/mistral-qlora-orpo.yml index 6c0212b7c..850d286f3 100644 --- a/examples/mistral/mistral-qlora-orpo.yml +++ b/examples/mistral/mistral-qlora-orpo.yml @@ -25,7 +25,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: false -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 @@ -64,8 +64,10 @@ flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/mistral/mistral-small-3.1-24B-lora.yml b/examples/mistral/mistral-small-3.1-24B-lora.yml index 198b3f373..3e477645e 100644 --- a/examples/mistral/mistral-small-3.1-24B-lora.yml +++ b/examples/mistral/mistral-small-3.1-24B-lora.yml @@ -27,7 +27,7 @@ pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 -lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' +lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: @@ -48,11 +48,13 @@ tf32: true gradient_checkpointing: true logging_steps: 1 -flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet. -eager_attention: +# flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet. 
+sdp_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/mistral/mixtral-8x22b-qlora-fsdp.yml b/examples/mistral/mixtral-8x22b-qlora-fsdp.yml index af6ba5a76..dc7bd9c37 100644 --- a/examples/mistral/mixtral-8x22b-qlora-fsdp.yml +++ b/examples/mistral/mixtral-8x22b-qlora-fsdp.yml @@ -54,7 +54,7 @@ flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 @@ -72,3 +72,5 @@ fsdp_config: fsdp_state_dict_type: FULL_STATE_DICT fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/mistral/mixtral-qlora-fsdp.yml b/examples/mistral/mixtral-qlora-fsdp.yml index b1843a138..5151e1292 100644 --- a/examples/mistral/mixtral-qlora-fsdp.yml +++ b/examples/mistral/mixtral-qlora-fsdp.yml @@ -56,7 +56,7 @@ flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 @@ -77,3 +77,5 @@ fsdp_config: fsdp_forward_prefetch: false fsdp_backward_prefetch: BACKWARD_PRE special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/mistral/mixtral.yml b/examples/mistral/mixtral.yml index 4c256420c..d1981a699 100644 --- a/examples/mistral/mixtral.yml +++ b/examples/mistral/mixtral.yml @@ -34,7 +34,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 @@ -74,10 +74,12 @@ flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 deepspeed: deepspeed_configs/zero2.json weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/mistral/mixtral_22.yml b/examples/mistral/mixtral_22.yml index 25e1d7155..0b606b7d7 100644 --- a/examples/mistral/mixtral_22.yml +++ b/examples/mistral/mixtral_22.yml @@ -25,7 +25,7 @@ output_dir: ./outputs/out sequence_len: 8000 sample_packing: true -pad_to_sequence_len: true + gradient_accumulation_steps: 1 micro_batch_size: 1 @@ -51,3 +51,5 @@ special_tokens: eos_token: "<|im_end|>" tokens: - "<|im_start|>" + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/mistral/qlora.yml b/examples/mistral/qlora.yml index 607e33701..2a7495e95 100644 --- a/examples/mistral/qlora.yml +++ b/examples/mistral/qlora.yml @@ -20,7 +20,7 @@ lora_model_dir: sequence_len: 8192 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 @@ -59,8 +59,10 @@ flash_attention: true loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/orpheus/finetune.yml b/examples/orpheus/finetune.yml index 9bcbbeee0..f4bc8054e 100644 --- a/examples/orpheus/finetune.yml +++ b/examples/orpheus/finetune.yml @@ -18,7 +18,7 @@ output_dir: ./outputs/out sequence_len: 8192 sample_packing: true -pad_to_sequence_len: true + wandb_project: 
wandb_entity: @@ -43,10 +43,12 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 20 +warmup_ratio: 0.1 evals_per_epoch: 5 saves_per_epoch: 5 weight_decay: 0.05 special_tokens: pad_token: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/phi/lora-3.5.yaml b/examples/phi/lora-3.5.yaml index ad4ce9cd4..a6fa15d98 100644 --- a/examples/phi/lora-3.5.yaml +++ b/examples/phi/lora-3.5.yaml @@ -28,7 +28,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: @@ -59,7 +59,9 @@ gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 4 weight_decay: 0.0 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/phi/phi-ft.yml b/examples/phi/phi-ft.yml index 1562a7353..717a45929 100644 --- a/examples/phi/phi-ft.yml +++ b/examples/phi/phi-ft.yml @@ -15,7 +15,7 @@ output_dir: ./outputs/phi-sft-out sequence_len: 2048 sample_packing: true -pad_to_sequence_len: true + adapter: lora_model_dir: @@ -50,10 +50,12 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 100 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 resize_token_embeddings_to_32x: true special_tokens: pad_token: "<|endoftext|>" + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/phi/phi-qlora.yml b/examples/phi/phi-qlora.yml index 4cd53db97..0fe1abea5 100644 --- a/examples/phi/phi-qlora.yml +++ b/examples/phi/phi-qlora.yml @@ -18,7 +18,7 @@ output_dir: ./outputs/phi-sft-out sequence_len: 2048 sample_packing: true -pad_to_sequence_len: true + adapter: qlora lora_model_dir: @@ -53,10 +53,12 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 100 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 resize_token_embeddings_to_32x: true special_tokens: pad_token: "<|endoftext|>" + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/phi/phi2-ft.yml b/examples/phi/phi2-ft.yml index ca733cc71..e470c0d24 100644 --- a/examples/phi/phi2-ft.yml +++ b/examples/phi/phi2-ft.yml @@ -15,7 +15,7 @@ output_dir: ./outputs/phi-sft-out sequence_len: 2048 sample_packing: true -pad_to_sequence_len: true + adapter: lora_model_dir: @@ -50,10 +50,12 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 100 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 resize_token_embeddings_to_32x: true special_tokens: pad_token: "<|endoftext|>" + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/phi/phi3-ft-fsdp.yml b/examples/phi/phi3-ft-fsdp.yml index d0d14fea6..1793737b5 100644 --- a/examples/phi/phi3-ft-fsdp.yml +++ b/examples/phi/phi3-ft-fsdp.yml @@ -15,7 +15,7 @@ output_dir: ./phi-sft-out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + trust_remote_code: true adapter: @@ -51,7 +51,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 100 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.1 @@ -71,3 +71,5 @@ fsdp_config: resize_token_embeddings_to_32x: true special_tokens: pad_token: "<|endoftext|>" + +# save_first_step: true # 
uncomment this to validate checkpoint saving works with your config diff --git a/examples/phi/phi3-ft.yml b/examples/phi/phi3-ft.yml index 17c48da6f..0b204963c 100644 --- a/examples/phi/phi3-ft.yml +++ b/examples/phi/phi3-ft.yml @@ -18,7 +18,7 @@ output_dir: ./out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora lora_model_dir: @@ -59,3 +59,5 @@ warmup_ratio: 0.2 debug: true weight_decay: 0.1 resize_token_embeddings_to_32x: true + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/pixtral/lora-12b.yml b/examples/pixtral/lora-12b.yml index dec8e4b5e..fea2a60ff 100644 --- a/examples/pixtral/lora-12b.yml +++ b/examples/pixtral/lora-12b.yml @@ -11,8 +11,7 @@ datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] - field_messages: messages -dataset_prepared_path: last_run_prepared +dataset_prepared_path: val_set_size: 0.0 output_dir: ./outputs/out @@ -25,7 +24,7 @@ pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 -lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' +lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: @@ -46,8 +45,8 @@ tf32: true gradient_checkpointing: true logging_steps: 1 -flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet -eager_attention: +# flash_attention: # PixtralVisionModel does not support Flash Attention 2.0 yet +sdp_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 @@ -55,3 +54,5 @@ saves_per_epoch: 1 weight_decay: 0.0 special_tokens: pad_token: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/qwen2-vl/lora-7b.yaml b/examples/qwen2-vl/lora-7b.yaml index 55773bc3d..8ea608199 100644 --- a/examples/qwen2-vl/lora-7b.yaml +++ b/examples/qwen2-vl/lora-7b.yaml @@ -25,7 +25,7 @@ pad_to_sequence_len: false lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 -lora_target_modules: 'model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' +lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' wandb_project: wandb_entity: @@ -53,3 +53,5 @@ warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 weight_decay: 0.0 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/qwen2/dpo.yaml b/examples/qwen2/dpo.yaml index bd896c2b3..3e87766d6 100644 --- a/examples/qwen2/dpo.yaml +++ b/examples/qwen2/dpo.yaml @@ -27,7 +27,7 @@ output_dir: ./outputs/dpo-out sequence_len: 2048 sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: @@ -50,7 +50,9 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/qwen2/prm.yaml b/examples/qwen2/prm.yaml index 4afa24f3c..a709a598d 100644 --- a/examples/qwen2/prm.yaml +++ b/examples/qwen2/prm.yaml @@ -22,7 +22,7 @@ remove_unused_columns: false sequence_len: 2048 sample_packing: false eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: @@ -55,3 +55,5 @@ eval_steps: 100 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: + +# save_first_step: 
true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/qwen2/qlora-fsdp.yaml b/examples/qwen2/qlora-fsdp.yaml index ed2670ab6..337619b61 100644 --- a/examples/qwen2/qlora-fsdp.yaml +++ b/examples/qwen2/qlora-fsdp.yaml @@ -17,7 +17,7 @@ output_dir: ./outputs/out sequence_len: 2048 sample_packing: true eval_sample_packing: true -pad_to_sequence_len: true + adapter: qlora lora_model_dir: @@ -49,7 +49,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 @@ -67,3 +67,5 @@ fsdp_config: fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/qwen2/reward-model.yaml b/examples/qwen2/reward-model.yaml index 822407a1f..08b8b4552 100644 --- a/examples/qwen2/reward-model.yaml +++ b/examples/qwen2/reward-model.yaml @@ -18,7 +18,7 @@ remove_unused_columns: false sequence_len: 2048 sample_packing: false eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: @@ -26,7 +26,6 @@ wandb_watch: wandb_name: wandb_log_model: - gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 4 @@ -50,3 +49,5 @@ evals_per_epoch: saves_per_epoch: 1 weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/qwen2_5-vl/lora-7b.yaml b/examples/qwen2_5-vl/lora-7b.yaml new file mode 100644 index 000000000..13a97dec3 --- /dev/null +++ b/examples/qwen2_5-vl/lora-7b.yaml @@ -0,0 +1,57 @@ +base_model: Qwen/Qwen2.5-VL-7B-Instruct +processor_type: AutoProcessor + +# these 3 lines are needed for now to handle vision chat templates w images +skip_prepare_dataset: true +remove_unused_columns: false +sample_packing: false + +chat_template: qwen2_vl +datasets: + - path: HuggingFaceH4/llava-instruct-mix-vsft + type: chat_template + split: train[:1%] + field_messages: messages +dataset_prepared_path: last_run_prepared +val_set_size: 0.0 +output_dir: ./outputs/out + +adapter: lora +lora_model_dir: + +sequence_len: 8192 +pad_to_sequence_len: false + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 1 +num_epochs: 1 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: true +fp16: +tf32: true + +gradient_checkpointing: true +logging_steps: 1 +flash_attention: true +eager_attention: + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 +weight_decay: 0.0 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/qwen3/32b-qlora.yaml b/examples/qwen3/32b-qlora.yaml index 45a4395ac..f4a4f2816 100644 --- a/examples/qwen3/32b-qlora.yaml +++ b/examples/qwen3/32b-qlora.yaml @@ -22,7 +22,7 @@ dataset_prepared_path: last_run_prepared sequence_len: 2048 sample_packing: true eval_sample_packing: true -pad_to_sequence_len: true + load_in_4bit: true adapter: qlora @@ -62,8 +62,10 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 special_tokens: + +# save_first_step: true # uncomment this to 
validate checkpoint saving works with your config diff --git a/examples/qwen3/8b-qat-fsdp2.yml b/examples/qwen3/8b-qat-fsdp2.yml new file mode 100644 index 000000000..cfbe5a4b7 --- /dev/null +++ b/examples/qwen3/8b-qat-fsdp2.yml @@ -0,0 +1,80 @@ +base_model: Qwen/Qwen3-8B +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +load_in_8bit: false +load_in_4bit: false +strict: false + +plugins: + - axolotl.integrations.liger.LigerPlugin + +liger_rope: true +liger_rms_norm: true +liger_glu_activation: true +liger_layer_norm: true +liger_fused_linear_cross_entropy: true + +datasets: + - path: tatsu-lab/alpaca + type: alpaca + +output_dir: ./outputs/qat_out/ + +sequence_len: 2048 +sample_packing: true +flex_attention: true + + +flex_attn_compile_kwargs: + dynamic: false + mode: max-autotune-no-cudagraphs + +qat: + activation_dtype: int8 + weight_dtype: int4 + group_size: 256 + fake_quant_after_n_steps: 1000 + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 1 +micro_batch_size: 2 +max_steps: 2000 +optimizer: adamw_torch_fused +lr_scheduler: cosine +learning_rate: 2e-5 + +bf16: true +tf32: true + +resume_from_checkpoint: +logging_steps: 1 + +evals_per_epoch: 1 +saves_per_epoch: 1 + +warmup_ratio: 0.1 +weight_decay: 0.0 +fsdp: + - full_shard + - auto_wrap + +fsdp_config: + fsdp_version: 2 + fsdp_offload_params: false + fsdp_cpu_ram_efficient_loading: true + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer + fsdp_state_dict_type: FULL_STATE_DICT + fsdp_sharding_strategy: FULL_SHARD + fsdp_reshard_after_forward: true + fsdp_activation_checkpointing: true + +special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/qwen3/qlora-fsdp.yaml b/examples/qwen3/qlora-fsdp.yaml index dc3377b4f..e4d584dc7 100644 --- a/examples/qwen3/qlora-fsdp.yaml +++ b/examples/qwen3/qlora-fsdp.yaml @@ -16,7 +16,7 @@ output_dir: ./outputs/out sequence_len: 2048 sample_packing: true eval_sample_packing: true -pad_to_sequence_len: true + adapter: qlora lora_model_dir: @@ -48,7 +48,7 @@ resume_from_checkpoint: logging_steps: 1 flash_attention: true -warmup_steps: 10 +warmup_ratio: 0.1 evals_per_epoch: 4 saves_per_epoch: 1 weight_decay: 0.0 @@ -66,3 +66,5 @@ fsdp_config: fsdp_state_dict_type: FULL_STATE_DICT fsdp_sharding_strategy: FULL_SHARD special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/slurm/README.md b/examples/slurm/README.md new file mode 100644 index 000000000..4c116b713 --- /dev/null +++ b/examples/slurm/README.md @@ -0,0 +1,66 @@ +# SLURM Multi-Node Training + +This directory contains an example SLURM script for running Axolotl training jobs across multiple nodes in a SLURM cluster. + +## Prerequisites + +- Access to a SLURM cluster with GPU nodes +- Axolotl installed on all nodes (see [installation docs](https://docs.axolotl.ai/docs/installation.html)) + +## Usage + +### Standard SLURM Clusters + +1. Copy [`axolotl.slurm`](./axolotl.slurm) to your working directory. +2. Place your Axolotl config file (`train.yaml`) in the same directory. +3. Set the appropriate environment variables for the job: + ```bash + export HF_TOKEN="your-huggingface-token" + + # metric tracking + # export WANDB_API_KEY="your-wandb-api-key" + # ... + ``` +4. 
Submit the job: + ```bash + sbatch --export=ALL,NUM_NODES=2,NUM_TRAINERS=8,PRIMARY_ADDR=,PRIMARY_PORT=29400 axolotl.slurm + ``` + + Where: + - `NUM_NODES`: Number of nodes to use + - `NUM_TRAINERS`: GPUs per node (typically 8) + - `PRIMARY_ADDR`: Hostname/IP of the master node + - `PRIMARY_PORT`: Port for distributed training (default: 29400) + +5. (Optional) Run other slurm commands: + ```bash + # check job info + scontrol show job axolotl-cli + + # check job queue + squeue + + # check cluster status + sinfo + ``` + +### RunPod Instant Clusters + +Axolotl works with RunPod Instant Clusters. This feature provides managed SLURM clusters with zero configuration. + +1. **Deploy a SLURM Cluster**: + - Go to [RunPod Instant Clusters](https://console.runpod.io/cluster) + - Click "Create a Cluster" + - Choose your GPU type, node count, and region + - Choose an [Axolotl cloud docker image](https://docs.axolotl.ai/docs/docker.html#cloud) + - Deploy the cluster + +2. **Connect to the Controller Node**: Find the controller node in the RunPod console and connect via SSH + +3. **Follow the instructions in [Standard SLURM Clusters](#standard-slurm-clusters)** + +## Additional Resources + +- [Axolotl Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) +- [SLURM Documentation](https://slurm.schedmd.com/documentation.html) +- [RunPod SLURM Clusters Guide](https://docs.runpod.io/instant-clusters/slurm-clusters) diff --git a/examples/slurm/axolotl.slurm b/examples/slurm/axolotl.slurm new file mode 100644 index 000000000..741d68ced --- /dev/null +++ b/examples/slurm/axolotl.slurm @@ -0,0 +1,20 @@ +#!/bin/bash +# Prior to running this script, export your HF_TOKEN and WANDB_API_KEY to your environment; i.e. +# export HF_TOKEN="..." +# export WANDB_API_KEY="..." +# + +# ---------- SBATCH commands ---------- # +#SBATCH --job-name=axolotl-slurm-multinode +#SBATCH --ntasks-per-node=1 +#SBATCH --nodes=$NUM_NODES +#SBATCH --gpus-per-task=8 +#SBATCH --cpus-per-task=128 + +export TORCH_DIST_INIT_BARRIER=0 + +srun axolotl preprocess train.yaml + +srun axolotl train train.yaml --launcher torchrun -- \ + --nproc_per_node=$NUM_TRAINERS --nnodes=$NUM_NODES \ + --rdzv_id axolotl-cli --rdzv_backend c10d --rdzv_endpoint "${PRIMARY_ADDR}:${PRIMARY_PORT}" --rdzv-conf="join_timeout=1800" diff --git a/examples/voxtral/README.md b/examples/voxtral/README.md new file mode 100644 index 000000000..f31e9cfd0 --- /dev/null +++ b/examples/voxtral/README.md @@ -0,0 +1,73 @@ +# Finetune Voxtral with Axolotl + +Voxtral is a [3B](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507)/[24B](https://huggingface.co/mistralai/Voxtral-Small-24B-2507) parameter opensource model from MistralAI found on HuggingFace. This guide shows how to fine-tune it with Axolotl. + +Thanks to the team at MistralAI for giving us early access to prepare for this release. + +## Getting started + +1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). + + Here is an example of how to install from pip: + +```bash +# Ensure you have Pytorch installed (Pytorch 2.6.0 min) +pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja +pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0' +``` + +2. Please install the below. + +```bash +# audio +pip3 install librosa==0.11.0 +pip3 install 'mistral_common[audio]==1.8.3' +``` + +3. 
Run the finetuning example: + +```bash +# text only +axolotl train examples/voxtral/voxtral-mini-qlora.yml + +# text + audio +axolotl train examples/voxtral/voxtral-mini-audio-qlora.yml +``` + +These configs use about 4.8 GB VRAM. + +Let us know how it goes. Happy finetuning! 🚀 + +### TIPS + +- For inference, the official MistralAI team recommends `temperature: 0.2` and `top_p: 0.95` for audio understanding and `temperature: 0.0` for transcription. +- You can run a full finetuning by removing `adapter: qlora` and `load_in_4bit: true` from the config (a minimal sketch of this delta is included at the end of this README). +- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). +- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). +- The multimodal dataset format follows the OpenAI multi-content Messages format as seen [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format). + + +## Optimization Guides + +- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) +- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) +- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) + +## Limitations + +We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only. + +In addition, we do not support overriding tokens yet. + +## Related Resources + +- [MistralAI Magistral Blog](https://mistral.ai/news/magistral/) +- [Axolotl Docs](https://docs.axolotl.ai) +- [Axolotl Website](https://axolotl.ai) +- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) +- [Axolotl Discord](https://discord.gg/7m9sfhzaf3) + +## Future Work + +- Add parity to Preference Tuning, RL, etc. +- Add parity to other tokenizer configs like overriding tokens. 
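As a follow-up to the full-finetuning tip in the TIPS section, here is a minimal sketch of the config delta. It assumes you start from `examples/voxtral/voxtral-mini-qlora.yml` and change nothing else; it is not an official recipe, and a full finetune needs considerably more VRAM than the QLoRA setup.

```yaml
# Sketch: switch voxtral-mini-qlora.yml from QLoRA to a full finetune by
# dropping the adapter/quantization keys. Everything else stays as shipped.
base_model: mistralai/Voxtral-Mini-3B-2507
tokenizer_use_mistral_common: true

# adapter: qlora        # removed for full finetuning
# load_in_4bit: true    # removed for full finetuning

# Optionally limit training to the language-model layers, mirroring the
# commented-out hint in voxtral-mini-audio-qlora.yml:
# unfrozen_parameters:
#   - language_model.model.*
#   - lm_head
#   - embed_tokens
```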
diff --git a/examples/voxtral/voxtral-mini-audio-qlora.yml b/examples/voxtral/voxtral-mini-audio-qlora.yml new file mode 100644 index 000000000..8fe6adbff --- /dev/null +++ b/examples/voxtral/voxtral-mini-audio-qlora.yml @@ -0,0 +1,78 @@ +base_model: mistralai/Voxtral-Mini-3B-2507 +processor_type: AutoProcessor + +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +# Enable to use mistral-common tokenizer +tokenizer_use_mistral_common: true + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +# for use with fft to only train on language model layers +# unfrozen_parameters: + # - language_model.model.* + # - lm_head + # - embed_tokens + +load_in_4bit: true + +# these 3 lines are needed for now to handle vision chat templates w images +skip_prepare_dataset: true +remove_unused_columns: false +sample_packing: false + +# gemma3 doesn't seem to play nice with ddp +ddp_find_unused_parameters: true + +eot_tokens: + - + +# sample dataset below requires downloading audio/image in advance +# wget https://huggingface.co/datasets/Nanobit/text-audio-2k-test/resolve/main/En-us-African_elephant.oga +datasets: + - path: NanoBit/text-audio-2k-test + type: chat_template +dataset_prepared_path: +val_set_size: 0.01 +output_dir: ./outputs/out + +adapter: qlora +lora_model_dir: + +sequence_len: 2048 +pad_to_sequence_len: false + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj' + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 1 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: true +fp16: +tf32: true + +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 +weight_decay: 0.0 diff --git a/examples/voxtral/voxtral-mini-qlora.yml b/examples/voxtral/voxtral-mini-qlora.yml new file mode 100644 index 000000000..bdbc5f867 --- /dev/null +++ b/examples/voxtral/voxtral-mini-qlora.yml @@ -0,0 +1,73 @@ +base_model: mistralai/Voxtral-Mini-3B-2507 + +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +# Enable to use mistral-common tokenizer +tokenizer_use_mistral_common: true + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +load_in_8bit: false +load_in_4bit: true + +# for use with fft to only train on language model layers +# unfrozen_parameters: + # - language_model.model.* + # - lm_head + # - embed_tokens + +eot_tokens: + - +datasets: + - path: cgato/SlimOrcaDedupCleaned + type: chat_template + split: train[:1%] + field_messages: conversations + message_property_mappings: + role: from + content: value + +val_set_size: 0.0 +output_dir: ./outputs/out + +adapter: qlora +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj' + +sequence_len: 2048 +sample_packing: true +eval_sample_packing: true +pad_to_sequence_len: true + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 1 +micro_batch_size: 1 +num_epochs: 4 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: true + +gradient_checkpointing: true 
+gradient_checkpointing_kwargs: + use_reentrant: false +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: +saves_per_epoch: 1 +weight_decay: 0.0 +special_tokens: diff --git a/favicon.jpg b/favicon.jpg index 43c690244..4ec358746 100644 Binary files a/favicon.jpg and b/favicon.jpg differ diff --git a/requirements.txt b/requirements.txt index 4ae82dd49..370bf5a5e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,33 +1,35 @@ --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ # START section of dependencies that don't install on Darwin/MacOS -bitsandbytes==0.45.4 -triton>=3.0.0 +bitsandbytes==0.46.1 +# triton 3.4.0 is not compatible with CCE +triton>=3.0.0,<3.4.0 mamba-ssm==1.2.0.post1 xformers>=0.0.23.post1 autoawq==0.2.7.post3 -liger-kernel==0.5.9 +liger-kernel==0.6.1 # END section packaging==23.2 -huggingface_hub==0.31.0 -peft==0.15.2 -transformers==4.51.3 +huggingface_hub>=0.33.0 +peft==0.17.0 +transformers==4.55.0 tokenizers>=0.21.1 -accelerate==1.6.0 -datasets==3.5.1 -deepspeed>=0.15.4 -trl==0.17.0 -hf_xet==1.1.0 -hqq==0.2.5 +accelerate==1.10.0 +datasets==4.0.0 +deepspeed>=0.17.0 +trl==0.21.0 +hf_xet==1.1.5 +kernels==0.9.0 +trackio optimum==1.16.2 hf_transfer sentencepiece -gradio==5.23.3 +gradio==5.41.1 -modal==0.70.5 +modal==1.0.2 pydantic==2.10.6 addict fire @@ -63,8 +65,10 @@ langdetect==1.0.9 immutabledict==4.2.0 antlr4-python3-runtime==4.13.2 -torchao==0.9.0 +torchao==0.12.0 schedulefree==1.4.1 axolotl-contribs-lgpl==0.0.6 -axolotl-contribs-mit==0.0.3 +axolotl-contribs-mit==0.0.5 + +mistral-common==1.8.3 diff --git a/scripts/cloud-entrypoint.sh b/scripts/cloud-entrypoint.sh index 2d3e29181..c98e7c0d0 100755 --- a/scripts/cloud-entrypoint.sh +++ b/scripts/cloud-entrypoint.sh @@ -44,8 +44,13 @@ add_keys_to_authorized() { chmod 700 -R ~/.ssh } +# Set SSH port +if [ ! -z "$SSH_PORT" ]; then + sed -i "s/#Port 22/Port $SSH_PORT/" /etc/ssh/sshd_config +fi + if [[ $PUBLIC_KEY ]]; then - # runpod + # runpod, prime intellect add_keys_to_authorized "$PUBLIC_KEY" # Start the SSH service in the background service ssh start @@ -76,5 +81,13 @@ if [ ! -L "/workspace/axolotl/outputs" ]; then ln -sf /workspace/data/axolotl-artifacts /workspace/axolotl/outputs fi +# start the runpod slurm init +SLURM_INIT="${SLURM_INIT:-/slurm-init.sh}" + +if [[ -f "$SLURM_INIT" ]]; then + echo "[entrypoint] running $SLURM_INIT..." 
+ bash "$SLURM_INIT" +fi + + # Execute the passed arguments (CMD) exec "$@" diff --git a/scripts/cutcrossentropy_install.py b/scripts/cutcrossentropy_install.py index bc6213dd9..b2bb0fcf8 100644 --- a/scripts/cutcrossentropy_install.py +++ b/scripts/cutcrossentropy_install.py @@ -9,6 +9,8 @@ except ImportError as exc: raise ImportError("Install torch via `pip install torch`") from exc from packaging.version import Version as V +USE_UV = "--uv" in sys.argv[1:] + v = V(torch.__version__) # no cut-cross-entropy support for torch < 2.4.0 @@ -23,7 +25,9 @@ if cce_spec: if not importlib.util.find_spec("cut_cross_entropy.transformers"): UNINSTALL_PREFIX = "pip uninstall -y cut-cross-entropy && " +UV_PREFIX = "uv " if USE_UV else "" + print( UNINSTALL_PREFIX - + 'pip install "cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@bad6f7b49c75fdec69471abb71b4cddd0f0c6438"' + + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0ee9ee8"' ) diff --git a/scripts/motd b/scripts/motd index bc123c312..275a4fcba 100644 --- a/scripts/motd +++ b/scripts/motd @@ -11,7 +11,9 @@ =@# @# #@= #@ =#@@@@#= +#@@= +#@@@@#= .##@@+ @@ @@@@ @@@@@@@@@@@@@@@@ -Welcome to the axolotl cloud image! If the you've mounted a disk to /workspace and the axolotl directory ie empty, run the following commands: +Welcome to the axolotl cloud image! If you've mounted a disk to /workspace and the axolotl directory is empty, run the following commands: + +Need help with your post-training workloads? Reach out to us at contact@axolotl.ai for assistance. ``` cd /workspace diff --git a/scripts/unsloth_install.py b/scripts/unsloth_install.py index bffab4670..acbd05e90 100644 --- a/scripts/unsloth_install.py +++ b/scripts/unsloth_install.py @@ -1,11 +1,15 @@ # noqa # pylint: skip-file +import sys + try: import torch except ImportError: raise ImportError("Install torch via `pip install torch`") from packaging.version import Version as V +use_uv = "--uv" in sys.argv[1:] + v = V(torch.__version__) cuda = str(torch.version.cuda) try: @@ -31,6 +35,7 @@ elif v < V("2.6.0"): else: raise RuntimeError(f"Torch = {v} too new!") x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "") +uv_prefix = "uv " if use_uv else "" print( - f'pip install unsloth-zoo==2024.12.1 && pip install --no-deps "unsloth[{x}]==2024.12.4"' + f'{uv_prefix}pip install unsloth-zoo==2024.12.1 && {uv_prefix}pip install --no-deps "unsloth[{x}]==2024.12.4"' ) diff --git a/setup.py b/setup.py index 97e7f5ff5..de6f19e56 100644 --- a/setup.py +++ b/setup.py @@ -66,14 +66,19 @@ def parse_requirements(extras_require_map): if (major, minor) >= (2, 7): _install_requires.pop(_install_requires.index(xformers_version)) - # _install_requires.append("xformers==0.0.29.post3") # xformers seems to be hard pinned to 2.6.0 - extras_require_map["vllm"] = ["vllm==0.8.5.post1"] + if patch == 0: + _install_requires.append("xformers==0.0.30") + # vllm 0.9.x is incompatible with latest transformers + extras_require_map.pop("vllm") + else: + _install_requires.append("xformers==0.0.31") + extras_require_map["vllm"] = ["vllm>=0.10.0"] elif (major, minor) >= (2, 6): _install_requires.pop(_install_requires.index(xformers_version)) - _install_requires.append( - "xformers==0.0.29.post2" - ) # vllm needs post2 w torch 2.6 - extras_require_map["vllm"] = ["vllm==0.8.5.post1"] + _install_requires.append("xformers==0.0.29.post3") + # since we only support 2.6.0+cu126 + 
_dependency_links.append("https://download.pytorch.org/whl/cu126") + extras_require_map.pop("vllm") elif (major, minor) >= (2, 5): _install_requires.pop(_install_requires.index(xformers_version)) if patch == 0: @@ -81,7 +86,9 @@ def parse_requirements(extras_require_map): else: _install_requires.append("xformers>=0.0.28.post3") _install_requires.pop(_install_requires.index(autoawq_version)) + extras_require_map.pop("vllm") elif (major, minor) >= (2, 4): + extras_require_map.pop("vllm") if patch == 0: _install_requires.pop(_install_requires.index(xformers_version)) _install_requires.append("xformers>=0.0.27") @@ -111,14 +118,14 @@ def get_package_version(): extras_require = { - "flash-attn": ["flash-attn==2.7.4.post1"], + "flash-attn": ["flash-attn==2.8.2"], "ring-flash-attn": [ - "flash-attn==2.7.4.post1", - "ring-flash-attn>=0.1.4", + "flash-attn==2.8.2", + "ring-flash-attn>=0.1.7", "yunchang==0.6.0", ], "deepspeed": [ - "deepspeed==0.15.4", + "deepspeed==0.17.2", "deepspeed-kernels", ], "mamba-ssm": [ @@ -148,13 +155,12 @@ extras_require = { "ray[train]", ], "vllm": [ - "vllm==0.7.2", + "vllm==0.10.0", ], "llmcompressor": [ "llmcompressor==0.5.1", ], } - install_requires, dependency_links, extras_require_build = parse_requirements( extras_require ) diff --git a/src/axolotl/__init__.py b/src/axolotl/__init__.py index 63f28adda..e08d43cc3 100644 --- a/src/axolotl/__init__.py +++ b/src/axolotl/__init__.py @@ -4,4 +4,4 @@ import pkgutil __path__ = pkgutil.extend_path(__path__, __name__) # Make this a namespace package -__version__ = "0.10.0.dev0" +__version__ = "0.13.0.dev" diff --git a/src/axolotl/cli/args.py b/src/axolotl/cli/args.py index 088e337e4..31d854d41 100644 --- a/src/axolotl/cli/args.py +++ b/src/axolotl/cli/args.py @@ -28,11 +28,8 @@ class TrainerCliArgs: debug: bool = field(default=False) debug_text_only: bool = field(default=False) debug_num_examples: int = field(default=0) - merge_lora: bool = field(default=False) prompter: Optional[str] = field(default=None) shard: bool = field(default=False) - main_process_port: Optional[int] = field(default=None) - num_processes: Optional[int] = field(default=None) @dataclass @@ -89,6 +86,26 @@ class VllmServeCliArgs: }, ) + enable_reasoning: Optional[bool] = field( + default=None, + ) + + reasoning_parser: Optional[str] = field( + default=None, + ) + + +@dataclass +class QuantizeCliArgs: + """Dataclass with CLI arguments for `axolotl quantize` command.""" + + base_model: Optional[str] = field(default=None) + weight_dtype: Optional[str] = field(default=None) + activation_dtype: Optional[str] = field(default=None) + quantize_embedding: Optional[bool] = field(default=None) + group_size: Optional[int] = field(default=None) + output_dir: Optional[str] = field(default=None) + @dataclass class EvaluateCliArgs: diff --git a/src/axolotl/cli/checks.py b/src/axolotl/cli/checks.py index 47348240e..a743e74dc 100644 --- a/src/axolotl/cli/checks.py +++ b/src/axolotl/cli/checks.py @@ -1,14 +1,16 @@ """Various checks for Axolotl CLI.""" -import logging import os from pathlib import Path from accelerate.commands.config import config_args from huggingface_hub import HfApi from huggingface_hub.utils import LocalTokenNotFoundError +from requests import HTTPError -LOG = logging.getLogger(__name__) +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) def check_accelerate_default_config() -> None: @@ -45,3 +47,8 @@ def check_user_token() -> bool: "Error verifying HuggingFace token. 
Remember to log in using `huggingface-cli login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets." ) return False + except HTTPError: + LOG.warning( + "Error accessing HuggingFace. This may be due to a network issue or rate limiting." + ) + return False diff --git a/src/axolotl/cli/cloud/__init__.py b/src/axolotl/cli/cloud/__init__.py index 5d6900d3e..bf12ab8cb 100644 --- a/src/axolotl/cli/cloud/__init__.py +++ b/src/axolotl/cli/cloud/__init__.py @@ -3,16 +3,15 @@ launch axolotl in supported cloud platforms """ from pathlib import Path -from typing import Union +from typing import Literal import yaml -from axolotl.cli.art import print_axolotl_text_art from axolotl.cli.cloud.modal_ import ModalCloud from axolotl.utils.dict import DictDefault -def load_cloud_cfg(cloud_config: Union[Path, str]) -> DictDefault: +def load_cloud_cfg(cloud_config: Path | str) -> DictDefault: """Load and validate cloud configuration.""" # Load cloud configuration. with open(cloud_config, encoding="utf-8") as file: @@ -21,10 +20,9 @@ def load_cloud_cfg(cloud_config: Union[Path, str]) -> DictDefault: def do_cli_preprocess( - cloud_config: Union[Path, str], - config: Union[Path, str], + cloud_config: Path | str, + config: Path | str, ) -> None: - print_axolotl_text_art() cloud_cfg = load_cloud_cfg(cloud_config) cloud = ModalCloud(cloud_cfg) with open(config, "r", encoding="utf-8") as file: @@ -33,13 +31,13 @@ def do_cli_preprocess( def do_cli_train( - cloud_config: Union[Path, str], - config: Union[Path, str], - accelerate: bool = True, + cloud_config: Path | str, + config: Path | str, + launcher: Literal["accelerate", "torchrun", "python"] = "accelerate", + launcher_args: list[str] | None = None, cwd=None, **kwargs, ) -> None: - print_axolotl_text_art() cloud_cfg = load_cloud_cfg(cloud_config) cloud = ModalCloud(cloud_cfg) with open(config, "r", encoding="utf-8") as file: @@ -47,14 +45,19 @@ def do_cli_train( local_dirs = {} if cwd and not Path(cwd).joinpath("src", "axolotl").exists(): local_dirs = {"/workspace/mounts": cwd} - cloud.train(config_yaml, accelerate=accelerate, local_dirs=local_dirs, **kwargs) + cloud.train( + config_yaml, + launcher=launcher, + launcher_args=launcher_args, + local_dirs=local_dirs, + **kwargs, + ) def do_cli_lm_eval( - cloud_config: Union[Path, str], - config: Union[Path, str], + cloud_config: Path | str, + config: Path | str, ) -> None: - print_axolotl_text_art() cloud_cfg = load_cloud_cfg(cloud_config) cloud = ModalCloud(cloud_cfg) with open(config, "r", encoding="utf-8") as file: diff --git a/src/axolotl/cli/cloud/base.py b/src/axolotl/cli/cloud/base.py index eba8be49a..c498e8691 100644 --- a/src/axolotl/cli/cloud/base.py +++ b/src/axolotl/cli/cloud/base.py @@ -3,6 +3,7 @@ base class for cloud platforms from cli """ from abc import ABC, abstractmethod +from typing import Literal class Cloud(ABC): @@ -15,5 +16,12 @@ class Cloud(ABC): pass @abstractmethod - def train(self, config_yaml: str, accelerate: bool = True) -> str: + def train( + self, + config_yaml: str, + launcher: Literal["accelerate", "torchrun", "python"] = "accelerate", + launcher_args: list[str] | None = None, + local_dirs: dict[str, str] | None = None, + **kwargs, + ): pass diff --git a/src/axolotl/cli/cloud/modal_.py b/src/axolotl/cli/cloud/modal_.py index ef59ed3d4..240c6d894 100644 --- a/src/axolotl/cli/cloud/modal_.py +++ b/src/axolotl/cli/cloud/modal_.py @@ -8,7 +8,7 @@ import os import subprocess # nosec B404 from pathlib import Path from random 
import randint -from typing import Optional +from typing import Literal import modal @@ -82,7 +82,7 @@ class ModalCloud(Cloud): return res def get_image(self): - docker_tag = "main-py3.11-cu124-2.5.1" + docker_tag = "main-py3.11-cu124-2.6.0" if self.config.docker_tag: docker_tag = self.config.docker_tag docker_image = f"axolotlai/axolotl:{docker_tag}" @@ -230,8 +230,9 @@ class ModalCloud(Cloud): def train( self, config_yaml: str, - accelerate: bool = True, - local_dirs: Optional[dict[str, str]] = None, + launcher: Literal["accelerate", "torchrun", "python"] = "accelerate", + launcher_args: list[str] | None = None, + local_dirs: dict[str, str] | None = None, **kwargs, ): modal_fn = self.get_train_env(local_dirs)(_train) @@ -239,7 +240,8 @@ class ModalCloud(Cloud): with self.app.run(detach=True): modal_fn.remote( config_yaml, - accelerate=accelerate, + launcher=launcher, + launcher_args=launcher_args, volumes={k: v[0] for k, v in self.volumes.items()}, **kwargs, ) @@ -270,20 +272,35 @@ def _preprocess(config_yaml: str, volumes=None): ) -def _train(config_yaml: str, accelerate: bool = True, volumes=None, **kwargs): +def _train( + config_yaml: str, + launcher: Literal["accelerate", "torchrun", "python"] = "accelerate", + launcher_args: list[str] | None = None, + volumes=None, + **kwargs, # pylint: disable=unused-argument +): Path("/workspace/mounts").mkdir(parents=True, exist_ok=True) with open("/workspace/mounts/config.yaml", "w", encoding="utf-8") as f_out: f_out.write(config_yaml) run_folder = "/workspace/mounts" - if accelerate: - accelerate_args = "--accelerate" + + launcher_args = launcher_args or [] + + # Build the base command + if launcher == "accelerate": + launcher_arg = "--launcher accelerate" + elif launcher == "torchrun": + launcher_arg = "--launcher torchrun" else: - accelerate_args = "--no-accelerate" - num_processes_args = "" - if num_processes := kwargs.pop("num_processes", None): - num_processes_args = f"--num-processes {num_processes}" + launcher_arg = "--launcher python" + + # Build launcher args string + launcher_args_str = "" + if launcher_args: + launcher_args_str = "-- " + " ".join(launcher_args) + run_cmd( - f"axolotl train {accelerate_args} {num_processes_args} /workspace/mounts/config.yaml", + f"axolotl train {launcher_arg} /workspace/mounts/config.yaml {launcher_args_str}".strip(), run_folder, volumes, ) diff --git a/src/axolotl/cli/config.py b/src/axolotl/cli/config.py index 8f1fe7185..0f1245aed 100644 --- a/src/axolotl/cli/config.py +++ b/src/axolotl/cli/config.py @@ -1,7 +1,6 @@ """Configuration loading and processing.""" import json -import logging import os import tempfile from pathlib import Path @@ -22,11 +21,14 @@ from axolotl.utils.config import ( validate_config, ) from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger from axolotl.utils.mlflow_ import setup_mlflow_env_vars from axolotl.utils.trainer import prepare_opinionated_env, prepare_optim_env from axolotl.utils.wandb_ import setup_wandb_env_vars -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) + +API_KEY_FIELDS = {"comet_api_key"} def check_remote_config(config: Union[str, Path]) -> Union[str, Path]: @@ -119,12 +121,12 @@ def choose_config(path: Path) -> str: ) if len(yaml_files) == 1: - print(f"Using default YAML file '{yaml_files[0]}'") + LOG.info(f"Using default YAML file '{yaml_files[0]}'") return str(yaml_files[0]) - print("Choose a YAML file:") + LOG.info("Choose a YAML file:") for idx, file in enumerate(yaml_files): - print(f"{idx + 1}. 
{file}") + LOG.info(f"{idx + 1}. {file}") chosen_file = None while chosen_file is None: @@ -133,9 +135,9 @@ def choose_config(path: Path) -> str: if 1 <= choice <= len(yaml_files): chosen_file = str(yaml_files[choice - 1]) else: - print("Invalid choice. Please choose a number from the list.") + LOG.info("Invalid choice. Please choose a number from the list.") except ValueError: - print("Invalid input. Please enter a number.") + LOG.info("Invalid input. Please enter a number.") return chosen_file @@ -151,6 +153,8 @@ def prepare_plugins(cfg: DictDefault): plugin_manager = PluginManager.get_instance() for plugin_name in cfg["plugins"]: plugin_manager.register(plugin_name) + for plugin in plugin_manager.plugins.values(): + plugin.register(cfg) def plugin_set_cfg(cfg: DictDefault): @@ -195,14 +199,13 @@ def load_cfg( # If there are any options passed in the cli, if it is something that seems valid # from the yaml, then overwrite the value cfg_keys = cfg.keys() - for k, _ in kwargs.items(): - # if not strict, allow writing to cfg even if it's not in the yml already - if k in cfg_keys or not cfg.strict: - # handle booleans - if isinstance(cfg[k], bool): - cfg[k] = bool(kwargs[k]) + for key, value in kwargs.items(): + # If not strict, allow writing to cfg even if it's not in the yml already + if key in cfg_keys or not cfg.strict: + if isinstance(cfg[key], bool): + cfg[key] = bool(value) else: - cfg[k] = kwargs[k] + cfg[key] = value try: device_props = torch.cuda.get_device_properties("cuda") @@ -233,4 +236,15 @@ def load_cfg( setup_comet_env_vars(cfg) plugin_set_cfg(cfg) + cfg_to_log = { + k: "[REDACTED]" if k in API_KEY_FIELDS else v + for k, v in cfg.items() + if v is not None + } + + LOG.info( + "config:\n%s", + json.dumps(cfg_to_log, indent=2, default=str, sort_keys=True), + ) + return cfg diff --git a/src/axolotl/cli/delinearize_llama4.py b/src/axolotl/cli/delinearize_llama4.py index c92bae930..90227fccd 100644 --- a/src/axolotl/cli/delinearize_llama4.py +++ b/src/axolotl/cli/delinearize_llama4.py @@ -9,7 +9,6 @@ from typing import Generator, Union import fire import torch from accelerate import init_empty_weights -from dotenv import load_dotenv from transformers import AutoProcessor @@ -152,5 +151,4 @@ def do_cli(model: Union[Path, str], output: Union[Path, str]) -> None: if __name__ == "__main__": - load_dotenv() fire.Fire(do_cli) diff --git a/src/axolotl/cli/evaluate.py b/src/axolotl/cli/evaluate.py index e52da66b7..9dd3b0083 100644 --- a/src/axolotl/cli/evaluate.py +++ b/src/axolotl/cli/evaluate.py @@ -1,24 +1,21 @@ """CLI to run evaluation on a model.""" -import logging import os from pathlib import Path from typing import Union import fire -from dotenv import load_dotenv from transformers.hf_argparser import HfArgumentParser from axolotl.cli.args import TrainerCliArgs -from axolotl.cli.art import print_axolotl_text_art from axolotl.cli.checks import check_accelerate_default_config, check_user_token from axolotl.cli.config import load_cfg from axolotl.common.datasets import load_datasets, load_preference_datasets from axolotl.evaluate import evaluate -from axolotl.utils import patch_optimized_env from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) def do_evaluate(cfg: DictDefault, cli_args: TrainerCliArgs) -> None: @@ -31,11 +28,7 @@ def do_evaluate(cfg: DictDefault, cli_args: TrainerCliArgs) -> None: cfg: Dictionary mapping `axolotl` config keys to values. cli_args: CLI arguments. 
""" - # Enable expandable segments for cuda allocation to improve VRAM usage - patch_optimized_env() - # pylint: disable=duplicate-code - print_axolotl_text_art() check_accelerate_default_config() if int(os.getenv("LOCAL_RANK", "0")) == 0: check_user_token() @@ -66,5 +59,4 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs) -> None: if __name__ == "__main__": - load_dotenv() fire.Fire(do_cli) diff --git a/src/axolotl/cli/inference.py b/src/axolotl/cli/inference.py index a4906bbf3..83b567b64 100644 --- a/src/axolotl/cli/inference.py +++ b/src/axolotl/cli/inference.py @@ -1,7 +1,6 @@ """CLI to run inference on a trained model.""" import importlib -import logging import sys from pathlib import Path from threading import Thread @@ -10,11 +9,9 @@ from typing import Union import fire import torch import transformers -from dotenv import load_dotenv from transformers import GenerationConfig, TextIteratorStreamer, TextStreamer from axolotl.cli.args import InferenceCliArgs -from axolotl.cli.art import print_axolotl_text_art from axolotl.cli.config import load_cfg from axolotl.cli.utils import load_model_and_tokenizer from axolotl.utils.chat_templates import ( @@ -22,8 +19,9 @@ from axolotl.utils.chat_templates import ( get_chat_template_from_config, ) from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) def get_multi_line_input() -> str: @@ -255,7 +253,6 @@ def do_cli( kwargs: Additional keyword arguments to override config file values. """ # pylint: disable=duplicate-code - print_axolotl_text_art() parsed_cfg = load_cfg(config, inference=True, rl=None, **kwargs) parsed_cfg.sample_packing = False parser = transformers.HfArgumentParser(InferenceCliArgs) @@ -270,5 +267,4 @@ def do_cli( if __name__ == "__main__": - load_dotenv() fire.Fire(do_cli) diff --git a/src/axolotl/cli/main.py b/src/axolotl/cli/main.py index 601add709..e63392802 100644 --- a/src/axolotl/cli/main.py +++ b/src/axolotl/cli/main.py @@ -2,41 +2,51 @@ # pylint: disable=redefined-outer-name -import logging import os import subprocess # nosec B404 -import tempfile -from pathlib import Path -from typing import Optional +from typing import Literal, Optional import click -import yaml from dotenv import load_dotenv import axolotl from axolotl.cli.args import ( EvaluateCliArgs, PreprocessCliArgs, + QuantizeCliArgs, TrainerCliArgs, VllmServeCliArgs, ) -from axolotl.cli.sweeps import generate_sweep_configs +from axolotl.cli.art import print_axolotl_text_art from axolotl.cli.utils import ( add_options_from_config, add_options_from_dataclass, build_command, fetch_from_github, filter_none_kwargs, + generate_config_files, + launch_training, ) from axolotl.integrations.lm_eval.cli import lm_eval from axolotl.utils import patch_optimized_env +from axolotl.utils.logging import get_logger from axolotl.utils.schemas.config import AxolotlInputConfig +LOG = get_logger(__name__) + +LAUNCHER_COMMAND_MAPPING = { + "accelerate": ["accelerate", "launch"], + "torchrun": ["torchrun"], +} + @click.group() @click.version_option(version=axolotl.__version__, prog_name="axolotl") def cli(): """Axolotl CLI - Train and fine-tune large language models""" + print_axolotl_text_art() + load_dotenv() + patch_optimized_env() @cli.command() @@ -45,7 +55,7 @@ def cli(): @add_options_from_dataclass(PreprocessCliArgs) @add_options_from_config(AxolotlInputConfig) @filter_none_kwargs -def preprocess(config: str, cloud: Optional[str] = None, **kwargs) -> None: +def 
preprocess(config: str, cloud: Optional[str] = None, **kwargs): """ Preprocess datasets before training. @@ -55,7 +65,6 @@ def preprocess(config: str, cloud: Optional[str] = None, **kwargs) -> None: kwargs: Additional keyword arguments which correspond to CLI args or `axolotl` config options. """ - patch_optimized_env() if cloud: from axolotl.cli.cloud import do_cli_preprocess @@ -67,12 +76,15 @@ def preprocess(config: str, cloud: Optional[str] = None, **kwargs) -> None: do_cli(config=config, **kwargs) -@cli.command() +@cli.command( + context_settings={"ignore_unknown_options": True, "allow_extra_args": True} +) @click.argument("config", type=click.Path(exists=True, path_type=str)) @click.option( - "--accelerate/--no-accelerate", - default=True, - help="Use accelerate launch for multi-GPU training", + "--launcher", + type=click.Choice(["accelerate", "torchrun", "python"]), + default="accelerate", + help="Launcher to use for multi-GPU training", ) @click.option("--cloud", default=None, type=click.Path(exists=True, path_type=str)) @click.option( @@ -83,126 +95,82 @@ def preprocess(config: str, cloud: Optional[str] = None, **kwargs) -> None: @add_options_from_dataclass(TrainerCliArgs) @add_options_from_config(AxolotlInputConfig) @filter_none_kwargs +@click.pass_context def train( + ctx: click.Context, config: str, - accelerate: bool, - cloud: Optional[str] = None, - sweep: Optional[str] = None, + launcher: Literal["accelerate", "torchrun", "python"] = "accelerate", + cloud: str | None = None, + sweep: str | None = None, **kwargs, -) -> None: +): """ Train or fine-tune a model. Args: + ctx: Click context for extra args. config: Path to `axolotl` config YAML file. - accelerate: Whether to use `accelerate` launcher. + launcher: Launcher to use for multi-GPU training ("accelerate", "torchrun", or "python"). cloud: Path to a cloud accelerator configuration file sweep: Path to YAML config for sweeping hyperparameters. kwargs: Additional keyword arguments which correspond to CLI args or `axolotl` config options. 
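For illustration only (option names as defined above; the flags after `--` are standard accelerate/torchrun flags rather than axolotl options), arguments following `--` are passed through to the selected launcher:

    axolotl train config.yaml --launcher accelerate -- --num_processes 4
    axolotl train config.yaml --launcher torchrun -- --nproc_per_node 2

With the command-building helpers shown later in this diff, these would expand to roughly `accelerate launch --num_processes 4 -m axolotl.cli.train config.yaml` and `torchrun --nproc_per_node 2 -m axolotl.cli.train config.yaml`. For torchrun, if an `--rdzv_endpoint` is passed through, the helpers below default `--rdzv_backend` to c10d and generate an `--rdzv_id` when one is not supplied.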
""" - # Enable expandable segments for cuda allocation to improve VRAM usage - patch_optimized_env() + # Extract launcher args from extra args (after --) + launcher_args = ctx.args if ctx.args else [] - if "use_ray" in kwargs and kwargs["use_ray"]: - accelerate = False - if sweep: - # load the sweep configuration yaml file - with open(sweep, "r", encoding="utf-8") as fin: - sweep_config: dict[str, list] = yaml.safe_load(fin) - with open(config, "r", encoding="utf-8") as fin: - base_config: dict[str, list] = yaml.safe_load(fin) + # Handle Ray launcher override + _launcher = None if kwargs.get("use_ray") else launcher - # generate all possible configurations - permutations = generate_sweep_configs(base_config, sweep_config) - - def iter_configs(): - for perm in permutations: - # open temp directory for temporary configurations - with tempfile.TemporaryDirectory() as temp_dir: - with open( - Path(temp_dir) / "config.yaml", "w", encoding="utf-8" - ) as fout: - yaml.dump(perm, fout) - yield str(Path(temp_dir) / "config.yaml") - - else: - - def iter_configs(): - yield config - - for cfg_file in iter_configs(): - # handle errors from subprocess so we can continue rest of sweeps + # Process each configuration + for cfg_file, is_group in generate_config_files(config, sweep): try: - if accelerate: - if cloud: - from axolotl.cli.cloud import do_cli_train - - cwd = os.getcwd() - do_cli_train( - cloud_config=cloud, - config=config, - accelerate=True, - cwd=cwd, - **kwargs, - ) - else: - accelerate_args = [] - if "main_process_port" in kwargs: - main_process_port = kwargs.pop("main_process_port", None) - accelerate_args.append("--main_process_port") - accelerate_args.append(str(main_process_port)) - if "num_processes" in kwargs: - num_processes = kwargs.pop("num_processes", None) - accelerate_args.append("--num_processes") - accelerate_args.append(str(num_processes)) - - base_cmd = ["accelerate", "launch"] - base_cmd.extend(accelerate_args) - base_cmd.extend(["-m", "axolotl.cli.train"]) - if cfg_file: - base_cmd.append(cfg_file) - cmd = build_command(base_cmd, kwargs) - subprocess.run(cmd, check=True) # nosec B603 - else: - if cloud: - from axolotl.cli.cloud import do_cli_train - - do_cli_train( - cloud_config=cloud, config=config, accelerate=False, **kwargs - ) - else: - from axolotl.cli.train import do_cli - - do_cli(config=cfg_file, **kwargs) + use_exec = is_group is not True + launch_training(cfg_file, _launcher, cloud, kwargs, launcher_args, use_exec) except subprocess.CalledProcessError as exc: - logging.error(f"Failed to train/fine-tune config '{cfg_file}': {exc}") + LOG.error(f"Failed to train/fine-tune config '{cfg_file}': {exc}") if not sweep: raise exc + finally: + # Only delete temp files, not the original config + if cfg_file != config: + os.unlink(cfg_file) -@cli.command() +@cli.command( + context_settings={"ignore_unknown_options": True, "allow_extra_args": True} +) @click.argument("config", type=click.Path(exists=True, path_type=str)) @click.option( - "--accelerate/--no-accelerate", - default=True, - help="Use accelerate launch for multi-GPU training", + "--launcher", + type=click.Choice(["accelerate", "torchrun", "python"]), + default="accelerate", + help="Launcher to use for multi-GPU evaluation", ) @add_options_from_dataclass(EvaluateCliArgs) @add_options_from_config(AxolotlInputConfig) @filter_none_kwargs -def evaluate(config: str, accelerate: bool, **kwargs) -> None: +@click.pass_context +def evaluate(ctx: click.Context, config: str, launcher: str, **kwargs): """ Evaluate a model. 
Args: + ctx: Click context for extra args. config: Path to `axolotl` config YAML file. - accelerate: Whether to use `accelerate` launcher. + launcher: Launcher to use for multi-GPU evaluation ("accelerate", "torchrun", or "python"). kwargs: Additional keyword arguments which correspond to CLI args or `axolotl` config options. """ - if accelerate: - base_cmd = ["accelerate", "launch", "-m", "axolotl.cli.evaluate"] + # Extract launcher args from extra args (after --) + launcher_args = ctx.args if ctx.args else [] + + if launcher in LAUNCHER_COMMAND_MAPPING: + base_cmd = ( + LAUNCHER_COMMAND_MAPPING[launcher] + + launcher_args + + ["-m", "axolotl.cli.evaluate"] + ) if config: base_cmd.append(config) cmd = build_command(base_cmd, kwargs) @@ -213,30 +181,42 @@ def evaluate(config: str, accelerate: bool, **kwargs) -> None: do_cli(config=config, **kwargs) -@cli.command() +@cli.command( + context_settings={"ignore_unknown_options": True, "allow_extra_args": True} +) @click.argument("config", type=click.Path(exists=True, path_type=str)) @click.option( - "--accelerate/--no-accelerate", - default=False, - help="Use accelerate launch for multi-GPU inference", + "--launcher", + type=click.Choice(["accelerate", "torchrun", "python"]), + default="accelerate", + help="Launcher to use for multi-GPU inference", ) @click.option("--gradio", is_flag=True, help="Launch Gradio interface") @add_options_from_dataclass(TrainerCliArgs) @add_options_from_config(AxolotlInputConfig) @filter_none_kwargs -def inference(config: str, accelerate: bool, gradio: bool, **kwargs) -> None: +@click.pass_context +def inference(ctx: click.Context, config: str, launcher: str, gradio: bool, **kwargs): """ Run inference with a trained model. Args: + ctx: Click context for extra args. config: Path to `axolotl` config YAML file. - accelerate: Whether to use `accelerate` launcher. + launcher: Launcher to use for multi-GPU inference ("accelerate", "torchrun", or "python"). gradio: Whether to use Gradio browser interface or command line for inference. kwargs: Additional keyword arguments which correspond to CLI args or `axolotl` config options. """ - if accelerate: - base_cmd = ["accelerate", "launch", "-m", "axolotl.cli.inference"] + # Extract launcher args from extra args (after --) + launcher_args = ctx.args if ctx.args else [] + + if launcher in LAUNCHER_COMMAND_MAPPING: + base_cmd = ( + LAUNCHER_COMMAND_MAPPING[launcher] + + launcher_args + + ["-m", "axolotl.cli.inference"] + ) if config: base_cmd.append(config) if gradio: @@ -249,33 +229,42 @@ def inference(config: str, accelerate: bool, gradio: bool, **kwargs) -> None: do_cli(config=config, gradio=gradio, **kwargs) -@cli.command() +@cli.command( + context_settings={"ignore_unknown_options": True, "allow_extra_args": True} +) @click.argument("config", type=click.Path(exists=True, path_type=str)) @click.option( - "--accelerate/--no-accelerate", - default=True, - help="Use accelerate launch for weight merging", + "--launcher", + type=click.Choice(["accelerate", "torchrun", "python"]), + default="accelerate", + help="Launcher to use for weight merging", ) @add_options_from_dataclass(TrainerCliArgs) @add_options_from_config(AxolotlInputConfig) @filter_none_kwargs -def merge_sharded_fsdp_weights(config: str, accelerate: bool, **kwargs) -> None: +@click.pass_context +def merge_sharded_fsdp_weights( + ctx: click.Context, config: str, launcher: str, **kwargs +): """ Merge sharded FSDP model weights. Args: + ctx: Click context for extra args. config: Path to `axolotl` config YAML file. 
- accelerate: Whether to use `accelerate` launcher. + launcher: Launcher to use for weight merging ("accelerate", "torchrun", or "python"). kwargs: Additional keyword arguments which correspond to CLI args or `axolotl` config options. """ - if accelerate: - base_cmd = [ - "accelerate", - "launch", - "-m", - "axolotl.cli.merge_sharded_fsdp_weights", - ] + # Extract launcher args from extra args (after --) + launcher_args = ctx.args if ctx.args else [] + + if launcher in LAUNCHER_COMMAND_MAPPING: + base_cmd = ( + LAUNCHER_COMMAND_MAPPING[launcher] + + launcher_args + + ["-m", "axolotl.cli.merge_sharded_fsdp_weights"] + ) if config: base_cmd.append(config) cmd = build_command(base_cmd, kwargs) @@ -291,7 +280,7 @@ def merge_sharded_fsdp_weights(config: str, accelerate: bool, **kwargs) -> None: @add_options_from_dataclass(TrainerCliArgs) @add_options_from_config(AxolotlInputConfig) @filter_none_kwargs -def merge_lora(config: str, **kwargs) -> None: +def merge_lora(config: str, **kwargs): """ Merge trained LoRA adapters into a base model. @@ -308,7 +297,7 @@ def merge_lora(config: str, **kwargs) -> None: @cli.command() @click.argument("directory", type=click.Choice(["examples", "deepspeed_configs"])) @click.option("--dest", help="Destination directory") -def fetch(directory: str, dest: Optional[str]) -> None: +def fetch(directory: str, dest: Optional[str]): """ Fetch example configs or other resources. @@ -333,10 +322,20 @@ def vllm_serve(config: str, **cli_args: VllmServeCliArgs): do_vllm_serve(config, cli_args) +@cli.command() +@click.argument("config", type=click.Path(exists=True, path_type=str)) +@add_options_from_dataclass(QuantizeCliArgs) +@filter_none_kwargs +def quantize(config: str, **cli_args: QuantizeCliArgs): + from axolotl.cli.quantize import do_quantize + + do_quantize(config, cli_args) + + @cli.command() @click.argument("model", type=click.Path(exists=True, path_type=str)) @click.argument("output", type=click.Path(exists=False, path_type=str)) -def delinearize_llama4(model: str, output: str) -> None: +def delinearize_llama4(model: str, output: str): from axolotl.cli.delinearize_llama4 import do_cli as do_delinearize_llama4 do_delinearize_llama4(model, output) @@ -350,5 +349,4 @@ def main(): if __name__ == "__main__": - load_dotenv() main() diff --git a/src/axolotl/cli/merge_lora.py b/src/axolotl/cli/merge_lora.py index 5c8802dd1..31fad1b29 100644 --- a/src/axolotl/cli/merge_lora.py +++ b/src/axolotl/cli/merge_lora.py @@ -1,20 +1,16 @@ """CLI to merge a trained LoRA into a base model.""" -import logging from pathlib import Path from typing import Union import fire -import transformers -from dotenv import load_dotenv -from axolotl.cli.args import TrainerCliArgs -from axolotl.cli.art import print_axolotl_text_art from axolotl.cli.config import load_cfg from axolotl.cli.utils import load_model_and_tokenizer from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) def do_merge_lora(*, cfg: DictDefault) -> None: @@ -25,8 +21,6 @@ def do_merge_lora(*, cfg: DictDefault) -> None: Args: cfg: Dictionary mapping `axolotl` config keys to values. """ - print_axolotl_text_art() - model, tokenizer, processor = load_model_and_tokenizer(cfg=cfg) safe_serialization = cfg.save_safetensors is True @@ -68,12 +62,6 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs) -> None: Raises: ValueError: If target directory for LoRA merged model does not exist. 
""" - # pylint: disable=duplicate-code - parser = transformers.HfArgumentParser(TrainerCliArgs) - parsed_cli_args, _ = parser.parse_args_into_dataclasses( - return_remaining_strings=True - ) - parsed_cli_args.merge_lora = True parsed_cfg = load_cfg( config, @@ -81,7 +69,7 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs) -> None: load_in_8bit=False, load_in_4bit=False, flash_attention=False, - sequence_parallel_degree=None, + context_parallel_size=None, deepspeed=None, fsdp=None, fsdp_config=None, @@ -99,5 +87,4 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs) -> None: if __name__ == "__main__": - load_dotenv() fire.Fire(do_cli) diff --git a/src/axolotl/cli/merge_sharded_fsdp_weights.py b/src/axolotl/cli/merge_sharded_fsdp_weights.py index d4b36d92c..c08d30ec8 100644 --- a/src/axolotl/cli/merge_sharded_fsdp_weights.py +++ b/src/axolotl/cli/merge_sharded_fsdp_weights.py @@ -1,7 +1,6 @@ """CLI to merge sharded FSDP model checkpoints into a single combined checkpoint.""" import json -import logging import os import shutil from pathlib import Path @@ -11,7 +10,6 @@ import fire import torch import torch.distributed.checkpoint as dist_cp import torch.distributed.checkpoint.format_utils as dist_cp_format_utils -import transformers from accelerate.utils import ( SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, @@ -19,16 +17,14 @@ from accelerate.utils import ( WEIGHTS_NAME, is_torch_version, ) -from dotenv import load_dotenv from huggingface_hub import split_torch_state_dict_into_shards from safetensors.torch import save_file as safe_save_file from torch.distributed.checkpoint.format_utils import _EmptyStateDictLoadPlanner -from axolotl.cli.args import TrainerCliArgs -from axolotl.cli.art import print_axolotl_text_art from axolotl.cli.config import load_cfg +from axolotl.utils.logging import get_logger -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) class BFloat16CastPlanner(_EmptyStateDictLoadPlanner): @@ -196,12 +192,6 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs): kwargs: Additional keyword arguments to override config file values. 
""" # pylint: disable=duplicate-code - print_axolotl_text_art() - parser = transformers.HfArgumentParser(TrainerCliArgs) - parsed_cli_args, _ = parser.parse_args_into_dataclasses( - return_remaining_strings=True - ) - parsed_cli_args.merge_lora = True parsed_cfg = load_cfg(config, **kwargs) fsdp_dir = Path(parsed_cfg.output_dir) / "pytorch_model_fsdp_0" @@ -213,5 +203,4 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs): if __name__ == "__main__": - load_dotenv() fire.Fire(do_cli) diff --git a/src/axolotl/cli/preprocess.py b/src/axolotl/cli/preprocess.py index 2a4dcd288..5d692c315 100644 --- a/src/axolotl/cli/preprocess.py +++ b/src/axolotl/cli/preprocess.py @@ -1,6 +1,6 @@ """CLI to run preprocessing of a dataset.""" -import logging +import os import warnings from pathlib import Path from typing import Union @@ -9,20 +9,19 @@ import fire import transformers from accelerate import init_empty_weights from colorama import Fore -from dotenv import load_dotenv from transformers import AutoModelForCausalLM from axolotl.cli.args import PreprocessCliArgs -from axolotl.cli.art import print_axolotl_text_art from axolotl.cli.checks import check_accelerate_default_config, check_user_token from axolotl.cli.config import load_cfg from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH from axolotl.common.datasets import load_datasets, load_preference_datasets from axolotl.integrations.base import PluginManager from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger from axolotl.utils.trainer import disable_datasets_caching -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None: @@ -33,10 +32,16 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None: cfg: Dictionary mapping `axolotl` config keys to values. cli_args: Preprocessing-specific CLI arguments. """ - print_axolotl_text_art() check_accelerate_default_config() check_user_token() + for key in ["skip_prepare_dataset", "pretraining_dataset"]: + if cfg.get(key): + LOG.error( + f"You have set `{key}:`. `preprocess` is not needed. Run the `axolotl train` CLI directly instead." + ) + return + if not cfg.dataset_prepared_path: msg = ( Fore.RED @@ -91,6 +96,7 @@ def do_cli( kwargs: Additional keyword arguments to override config file values. 
""" # pylint: disable=duplicate-code + os.environ["AXOLOTL_IS_PREPROCESS"] = "1" parsed_cfg = load_cfg(config, **kwargs) parsed_cfg.is_preprocess = True parser = transformers.HfArgumentParser(PreprocessCliArgs) @@ -102,5 +108,4 @@ def do_cli( if __name__ == "__main__": - load_dotenv() fire.Fire(do_cli) diff --git a/src/axolotl/cli/quantize.py b/src/axolotl/cli/quantize.py new file mode 100644 index 000000000..0782976fe --- /dev/null +++ b/src/axolotl/cli/quantize.py @@ -0,0 +1,88 @@ +""" +CLI to post-training quantize a model using torchao +""" + +from pathlib import Path +from typing import Union + +from transformers import AutoModelForCausalLM + +from axolotl.cli.config import load_cfg +from axolotl.loaders import load_tokenizer +from axolotl.utils.logging import get_logger +from axolotl.utils.quantization import TorchIntDType, quantize_model_for_ptq + +LOG = get_logger(__name__) + + +def do_quantize( + config: Union[Path, str], + cli_args: dict, +): + """ + Quantizes a model's model's weights + + Args: + config (Union[Path, str]): The path to the config file + cli_args (dict): Additional command-line arguments + """ + + cfg = load_cfg(config) + + if cfg.qat and cfg.quantization: + raise ValueError( + "QAT and quantization cannot be used together. Please specify only one of qat or quantization in your config file." + ) + + if cfg.qat: + quantize_cfg = cfg.qat + elif cfg.quantization: + quantize_cfg = cfg.quantization + else: + raise ValueError( + "No quantization configuration found. Please specify either qat or quantization in your config file." + ) + + model_path = cli_args.get("model_path") or cfg.output_dir + if weight_dtype := cli_args.get("weight_dtype"): + weight_dtype = TorchIntDType[weight_dtype] + else: + weight_dtype = quantize_cfg.weight_dtype + if activation_dtype := cli_args.get("activation_dtype"): + activation_dtype = TorchIntDType[activation_dtype] + else: + activation_dtype = quantize_cfg.activation_dtype + group_size = cli_args.get("group_size") or quantize_cfg.group_size + quantize_embedding = ( + cli_args.get("quantize_embedding") or quantize_cfg.quantize_embedding + ) + output_dir = cli_args.get("output_dir") or cfg.output_dir + + LOG.info(f"Loading model from {model_path}...") + tokenizer = load_tokenizer(cfg) + model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto") + + LOG.info( + f"Quantizing model with configuration: \n" + f"\tweight_dtype: {weight_dtype}\n" + f"\tactivation_dtype: {activation_dtype}\n" + f"\tgroup_size: {group_size}\n" + f"\tquantize_embedding: {quantize_embedding}" + ) + + quantize_model_for_ptq( + model, weight_dtype, group_size, activation_dtype, quantize_embedding + ) + + LOG.info(f"Saving quantized model to: {str(Path(output_dir) / 'quantized')}...") + model.save_pretrained( + str(Path(output_dir) / "quantized"), + safe_serialization=False, + progressbar=True, + ) + tokenizer.save_pretrained( + str(Path(output_dir) / "quantized"), + safe_serialization=False, + progressbar=True, + ) + LOG.info(f"Quantized model saved to: {str(Path(output_dir) / 'quantized')}...") diff --git a/src/axolotl/cli/train.py b/src/axolotl/cli/train.py index 777d84885..7f0b0bdd2 100644 --- a/src/axolotl/cli/train.py +++ b/src/axolotl/cli/train.py @@ -1,29 +1,23 @@ """CLI to run training on a model.""" import gc -import logging import os from pathlib import Path from typing import Union import fire from accelerate import Accelerator -from dotenv import load_dotenv from transformers.hf_argparser import HfArgumentParser from axolotl.cli.args import 
TrainerCliArgs -from axolotl.cli.art import print_axolotl_text_art from axolotl.cli.checks import check_accelerate_default_config, check_user_token from axolotl.cli.config import load_cfg from axolotl.common.datasets import load_datasets, load_preference_datasets from axolotl.integrations.base import PluginManager from axolotl.train import train -from axolotl.utils import patch_optimized_env from axolotl.utils.config import normalize_config, resolve_dtype from axolotl.utils.dict import DictDefault -LOG = logging.getLogger(__name__) - def do_train(cfg: DictDefault, cli_args: TrainerCliArgs): """ @@ -35,10 +29,6 @@ def do_train(cfg: DictDefault, cli_args: TrainerCliArgs): cfg: Dictionary mapping `axolotl` config keys to values. cli_args: Training-specific CLI arguments. """ - # Enable expandable segments for cuda allocation to improve VRAM usage - patch_optimized_env() - - print_axolotl_text_art() check_accelerate_default_config() if int(os.getenv("LOCAL_RANK", "0")) == 0: check_user_token() @@ -114,11 +104,17 @@ def ray_train_func(kwargs: dict): # initialize accelerator before model instantiation Accelerator(gradient_accumulation_steps=cfg.gradient_accumulation_steps) + # Register plugins in Ray workers + if cfg.get("plugins"): + from axolotl.cli.config import plugin_set_cfg, prepare_plugins + + prepare_plugins(cfg) + plugin_set_cfg(cfg) + kwargs["cfg"] = cfg do_train(**kwargs) if __name__ == "__main__": - load_dotenv() fire.Fire(do_cli) diff --git a/src/axolotl/cli/utils.py b/src/axolotl/cli/utils.py deleted file mode 100644 index e681589f3..000000000 --- a/src/axolotl/cli/utils.py +++ /dev/null @@ -1,330 +0,0 @@ -"""Utility methods for axolotl CLI.""" - -import concurrent.futures -import dataclasses -import hashlib -import json -import logging -from functools import wraps -from pathlib import Path -from types import NoneType -from typing import Any, Callable, Type, Union, get_args, get_origin - -import click -import requests -from pydantic import BaseModel -from transformers import ( - PreTrainedModel, - PreTrainedTokenizer, - PreTrainedTokenizerFast, - ProcessorMixin, -) - -from axolotl.loaders import load_processor, load_tokenizer -from axolotl.loaders.model import ModelLoader -from axolotl.utils.dict import DictDefault - -LOG = logging.getLogger(__name__) - - -def strip_optional_type(field_type: type | str | None): - """ - Extracts the non-`None` type from an `Optional` / `Union` type. - - Args: - field_type: Type of field for Axolotl CLI command. - - Returns: - If the input type is `Union[T, None]` or `Optional[T]`, returns `T`. Otherwise - returns the input type unchanged. - """ - if get_origin(field_type) is Union and type(None) in get_args(field_type): - field_type = next( - t for t in get_args(field_type) if not isinstance(t, NoneType) - ) - - return field_type - - -def filter_none_kwargs(func: Callable) -> Callable: - """ - Wraps function to remove `None`-valued `kwargs`. - - Args: - func: Function to wrap. - - Returns: - Wrapped function. - """ - - @wraps(func) - def wrapper(*args, **kwargs) -> Callable: - """Filters out `None`-valued `kwargs`.""" - filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None} - - return func(*args, **filtered_kwargs) - - return wrapper - - -def add_options_from_dataclass(config_class: Type[Any]) -> Callable: - """ - Create Click options from the fields of a dataclass. - - Args: - config_class: Dataclass with fields to parse from the CLI. - - Returns: - Function decorator for Axolotl CLI command. 
- """ - - def decorator(function: Callable) -> Callable: - # Process dataclass fields in reverse order for correct option ordering - for field in reversed(dataclasses.fields(config_class)): - field_type = strip_optional_type(field.type) - - if field_type == bool: - field_name = field.name.replace("_", "-") - option_name = f"--{field_name}/--no-{field_name}" - function = click.option( - option_name, - default=field.default, - help=field.metadata.get("description"), - )(function) - else: - option_name = f"--{field.name.replace('_', '-')}" - function = click.option( - option_name, - type=field_type, - default=field.default, - help=field.metadata.get("description"), - )(function) - - return function - - return decorator - - -def add_options_from_config(config_class: Type[BaseModel]) -> Callable: - """ - Create Click options from the fields of a Pydantic model. - - Args: - config_class: PyDantic model with fields to parse from the CLI - - Returns: - Function decorator for Axolotl CLI command. - """ - - def decorator(function: Callable) -> Callable: - # Process model fields in reverse order for correct option ordering - for name, field in reversed(config_class.model_fields.items()): - field_type = strip_optional_type(field.annotation) - - if field_type == bool: - field_name = name.replace("_", "-") - option_name = f"--{field_name}/--no-{field_name}" - function = click.option( - option_name, default=None, help=field.description - )(function) - else: - option_name = f"--{name.replace('_', '-')}" - function = click.option( - option_name, default=None, help=field.description - )(function) - - return function - - return decorator - - -def build_command(base_cmd: list[str], options: dict[str, Any]) -> list[str]: - """ - Build command list from base command and options. - - Args: - base_cmd: Command without options. - options: Options to parse and append to base command. - - Returns: - List of strings giving shell command. - """ - cmd = base_cmd.copy() - - for key, value in options.items(): - if value is None: - continue - - key = key.replace("_", "-") - - if isinstance(value, bool): - if value: - cmd.append(f"--{key}") - else: - cmd.extend([f"--{key}", str(value)]) - - return cmd - - -def download_file( - file_info: tuple, raw_base_url: str, dest_path: Path, dir_prefix: str -) -> tuple[str, str]: - """ - Download a single file and return its processing status. - - Args: - file_info: Tuple of (file_path, remote_sha). - raw_base_url: Base URL for raw GitHub content. - dest_path: Local destination directory. - dir_prefix: Directory prefix to filter files. - - Returns: - Tuple of (file_path, status) where status is 'new', 'updated', or 'unchanged'. 
- """ - file_path, remote_sha = file_info - raw_url = f"{raw_base_url}/{file_path}" - dest_file = dest_path / file_path.split(dir_prefix)[-1] - - # Check if file exists and needs updating - if dest_file.exists(): - with open(dest_file, "rb") as file: - content = file.read() - # Calculate git blob SHA - blob = b"blob " + str(len(content)).encode() + b"\0" + content - local_sha = hashlib.sha1(blob, usedforsecurity=False).hexdigest() - - if local_sha == remote_sha: - print(f"Skipping {file_path} (unchanged)") - return file_path, "unchanged" - - print(f"Updating {file_path}") - status = "new" - else: - print(f"Downloading {file_path}") - status = "new" - - # Create directories if needed - dest_file.parent.mkdir(parents=True, exist_ok=True) - - # Download and save file - try: - response = requests.get(raw_url, timeout=30) - response.raise_for_status() - - with open(dest_file, "wb") as file: - file.write(response.content) - - return file_path, status - except (requests.RequestException, IOError) as request_error: - print(f"Error downloading {file_path}: {str(request_error)}") - return file_path, "error" - - -def fetch_from_github( - dir_prefix: str, dest_dir: str | None = None, max_workers: int = 5 -) -> None: - """ - Sync files from a specific directory in the GitHub repository. - Only downloads files that don't exist locally or have changed. - - Args: - dir_prefix: Directory prefix to filter files (e.g., 'examples/', - 'deepspeed_configs/'). - dest_dir: Local destination directory. - max_workers: Maximum number of concurrent downloads. - """ - api_url = "https://api.github.com/repos/axolotl-ai-cloud/axolotl/git/trees/main?recursive=1" - raw_base_url = "https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main" - - # Get repository tree with timeout - response = requests.get(api_url, timeout=30) - response.raise_for_status() - tree = json.loads(response.text) - - # Filter for files and get their SHA - files = { - item["path"]: item["sha"] - for item in tree["tree"] - if item["type"] == "blob" and item["path"].startswith(dir_prefix) - } - - if not files: - raise click.ClickException(f"No files found in {dir_prefix}") - - # Default destination directory is the last part of dir_prefix - default_dest = Path(dir_prefix.rstrip("/")) - dest_path = Path(dest_dir) if dest_dir else default_dest - - # Keep track of processed files for summary - files_processed: dict[str, list[str]] = { - "new": [], - "updated": [], - "unchanged": [], - "error": [], - } - - # Process files in parallel using ThreadPoolExecutor - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - future_to_file = { - executor.submit( - download_file, - (file_path, remote_sha), - raw_base_url, - dest_path, - dir_prefix, - ): file_path - for file_path, remote_sha in files.items() - } - - # Process completed tasks as they finish - for future in concurrent.futures.as_completed(future_to_file): - file_path = future_to_file[future] - try: - file_path, status = future.result() - files_processed[status].append(file_path) - except (requests.RequestException, IOError) as request_error: - print(f"Error processing {file_path}: {str(request_error)}") - files_processed["error"].append(file_path) - - # Log summary - LOG.info("\nSync Summary:") - LOG.info(f"New files: {len(files_processed['new'])}") - LOG.info(f"Updated files: {len(files_processed['updated'])}") - LOG.info(f"Unchanged files: {len(files_processed['unchanged'])}") - if files_processed["error"]: - LOG.info(f"Failed files: {len(files_processed['error'])}") 
- - -def load_model_and_tokenizer( - *, - cfg: DictDefault, - inference: bool = False, -) -> tuple[ - PreTrainedModel, - PreTrainedTokenizer | PreTrainedTokenizerFast | Any, - ProcessorMixin | None, -]: - """ - Helper function for loading a model, tokenizer, and processor specified in the given `axolotl` - config. - - Args: - cfg: Dictionary mapping `axolotl` config keys to values. - inference: Boolean denoting inference mode. - - Returns: - Tuple of (PreTrainedModel, PreTrainedTokenizer, ProcessorMixin). - """ - LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}") - tokenizer = load_tokenizer(cfg) - - LOG.info("loading model...") - model_loader = ModelLoader(cfg, tokenizer, inference=inference) - model, _ = model_loader.load() - - processor = None - if cfg.is_multimodal: - LOG.info("loading processor...") - processor = load_processor(cfg, tokenizer) - - return model, tokenizer, processor diff --git a/src/axolotl/cli/utils/__init__.py b/src/axolotl/cli/utils/__init__.py new file mode 100644 index 000000000..583130339 --- /dev/null +++ b/src/axolotl/cli/utils/__init__.py @@ -0,0 +1,23 @@ +"""Init for axolotl.cli.utils module.""" + +from .args import ( + add_options_from_config, + add_options_from_dataclass, + filter_none_kwargs, +) +from .fetch import fetch_from_github +from .load import load_model_and_tokenizer +from .sweeps import generate_sweep_configs +from .train import build_command, generate_config_files, launch_training + +__all__ = [ + "filter_none_kwargs", + "add_options_from_dataclass", + "add_options_from_config", + "build_command", + "generate_config_files", + "generate_sweep_configs", + "load_model_and_tokenizer", + "launch_training", + "fetch_from_github", +] diff --git a/src/axolotl/cli/utils/args.py b/src/axolotl/cli/utils/args.py new file mode 100644 index 000000000..3aea1a378 --- /dev/null +++ b/src/axolotl/cli/utils/args.py @@ -0,0 +1,120 @@ +"""Utilities for axolotl CLI args.""" + +import dataclasses +from functools import wraps +from types import NoneType +from typing import Any, Callable, Type, Union, get_args, get_origin + +import click +from pydantic import BaseModel + + +def _strip_optional_type(field_type: type | str | None): + """ + Extracts the non-`None` type from an `Optional` / `Union` type. + + Args: + field_type: Type of field for Axolotl CLI command. + + Returns: + If the input type is `Union[T, None]` or `Optional[T]`, returns `T`. Otherwise + returns the input type unchanged. + """ + if get_origin(field_type) is Union and type(None) in get_args(field_type): + field_type = next( + t for t in get_args(field_type) if not isinstance(t, NoneType) + ) + + return field_type + + +def filter_none_kwargs(func: Callable) -> Callable: + """ + Wraps function to remove `None`-valued `kwargs`. + + Args: + func: Function to wrap. + + Returns: + Wrapped function. + """ + + @wraps(func) + def wrapper(*args, **kwargs) -> Callable: + """Filters out `None`-valued `kwargs`.""" + filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None} + + return func(*args, **filtered_kwargs) + + return wrapper + + +def add_options_from_dataclass(config_class: Type[Any]) -> Callable: + """ + Create Click options from the fields of a dataclass. + + Args: + config_class: Dataclass with fields to parse from the CLI. + + Returns: + Function decorator for Axolotl CLI command. 
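As a hedged usage sketch (the `axolotl quantize` registration and the QuantizeCliArgs fields appear earlier in this diff; the concrete values here are hypothetical), this decorator is what turns those dataclass fields into Click options: an `Optional[int]` field such as `group_size` surfaces as `--group-size`, while an optional boolean such as `quantize_embedding` surfaces as a `--quantize-embedding/--no-quantize-embedding` flag pair, so an invocation might look like:

    axolotl quantize config.yaml --group-size 128 --no-quantize-embedding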
+ """ + + def decorator(function: Callable) -> Callable: + # Process dataclass fields in reverse order for correct option ordering + for field in reversed(dataclasses.fields(config_class)): + field_type = _strip_optional_type(field.type) + + if field_type == bool: + field_name = field.name.replace("_", "-") + option_name = f"--{field_name}/--no-{field_name}" + function = click.option( + option_name, + default=field.default, + help=field.metadata.get("description"), + )(function) + else: + option_name = f"--{field.name.replace('_', '-')}" + function = click.option( + option_name, + type=field_type, + default=field.default, + help=field.metadata.get("description"), + )(function) + + return function + + return decorator + + +def add_options_from_config(config_class: Type[BaseModel]) -> Callable: + """ + Create Click options from the fields of a Pydantic model. + + Args: + config_class: PyDantic model with fields to parse from the CLI + + Returns: + Function decorator for Axolotl CLI command. + """ + + def decorator(function: Callable) -> Callable: + # Process model fields in reverse order for correct option ordering + for name, field in reversed(config_class.model_fields.items()): + field_type = _strip_optional_type(field.annotation) + + if field_type == bool: + field_name = name.replace("_", "-") + option_name = f"--{field_name}/--no-{field_name}" + function = click.option( + option_name, default=None, help=field.description + )(function) + else: + option_name = f"--{name.replace('_', '-')}" + function = click.option( + option_name, default=None, help=field.description + )(function) + + return function + + return decorator diff --git a/src/axolotl/cli/utils/fetch.py b/src/axolotl/cli/utils/fetch.py new file mode 100644 index 000000000..441b7f6f7 --- /dev/null +++ b/src/axolotl/cli/utils/fetch.py @@ -0,0 +1,142 @@ +"""Utilities for axolotl fetch CLI command.""" + +import concurrent.futures +import hashlib +import json +from pathlib import Path + +import click +import requests + +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) + + +def _download_file( + file_info: tuple, raw_base_url: str, dest_path: Path, dir_prefix: str +) -> tuple[str, str]: + """ + Download a single file and return its processing status. + + Args: + file_info: Tuple of (file_path, remote_sha). + raw_base_url: Base URL for raw GitHub content. + dest_path: Local destination directory. + dir_prefix: Directory prefix to filter files. + + Returns: + Tuple of (file_path, status) where status is 'new', 'updated', or 'unchanged'. 
+ """ + file_path, remote_sha = file_info + raw_url = f"{raw_base_url}/{file_path}" + dest_file = dest_path / file_path.split(dir_prefix)[-1] + + # Check if file exists and needs updating + if dest_file.exists(): + with open(dest_file, "rb") as file: + content = file.read() + # Calculate git blob SHA + blob = b"blob " + str(len(content)).encode() + b"\0" + content + local_sha = hashlib.sha1(blob, usedforsecurity=False).hexdigest() + + if local_sha == remote_sha: + print(f"Skipping {file_path} (unchanged)") + return file_path, "unchanged" + + print(f"Updating {file_path}") + status = "updated" + else: + print(f"Downloading {file_path}") + status = "new" + + # Create directories if needed + dest_file.parent.mkdir(parents=True, exist_ok=True) + + # Download and save file + try: + response = requests.get(raw_url, timeout=30) + response.raise_for_status() + + with open(dest_file, "wb") as file: + file.write(response.content) + + return file_path, status + except (requests.RequestException, IOError) as request_error: + print(f"Error downloading {file_path}: {str(request_error)}") + return file_path, "error" + + +def fetch_from_github( + dir_prefix: str, dest_dir: str | None = None, max_workers: int = 5 +) -> None: + """ + Sync files from a specific directory in the GitHub repository. + Only downloads files that don't exist locally or have changed. + + Args: + dir_prefix: Directory prefix to filter files (e.g., 'examples/', + 'deepspeed_configs/'). + dest_dir: Local destination directory. + max_workers: Maximum number of concurrent downloads. + """ + api_url = "https://api.github.com/repos/axolotl-ai-cloud/axolotl/git/trees/main?recursive=1" + raw_base_url = "https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main" + + # Get repository tree with timeout + response = requests.get(api_url, timeout=30) + response.raise_for_status() + tree = json.loads(response.text) + + # Filter for files and get their SHA + files = { + item["path"]: item["sha"] + for item in tree["tree"] + if item["type"] == "blob" and item["path"].startswith(dir_prefix) + } + + if not files: + raise click.ClickException(f"No files found in {dir_prefix}") + + # Default destination directory is the last part of dir_prefix + default_dest = Path(dir_prefix.rstrip("/")) + dest_path = Path(dest_dir) if dest_dir else default_dest + + # Keep track of processed files for summary + files_processed: dict[str, list[str]] = { + "new": [], + "updated": [], + "unchanged": [], + "error": [], + } + + # Process files in parallel using ThreadPoolExecutor + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + future_to_file = { + executor.submit( + _download_file, + (file_path, remote_sha), + raw_base_url, + dest_path, + dir_prefix, + ): file_path + for file_path, remote_sha in files.items() + } + + # Process completed tasks as they finish + for future in concurrent.futures.as_completed(future_to_file): + file_path = future_to_file[future] + try: + file_path, status = future.result() + files_processed[status].append(file_path) + except (requests.RequestException, IOError) as request_error: + print(f"Error processing {file_path}: {str(request_error)}") + files_processed["error"].append(file_path) + + # Log summary + LOG.info("\nSync Summary:") + LOG.info(f"New files: {len(files_processed['new'])}") + LOG.info(f"Updated files: {len(files_processed['updated'])}") + LOG.info(f"Unchanged files: {len(files_processed['unchanged'])}") + if files_processed["error"]: + LOG.info(f"Failed files: 
{len(files_processed['error'])}") diff --git a/src/axolotl/cli/utils/load.py b/src/axolotl/cli/utils/load.py new file mode 100644 index 000000000..610a81306 --- /dev/null +++ b/src/axolotl/cli/utils/load.py @@ -0,0 +1,52 @@ +"""Utilities for model, tokenizer, etc. loading.""" + +from typing import Any + +from transformers import ( + PreTrainedModel, + PreTrainedTokenizer, + PreTrainedTokenizerFast, + ProcessorMixin, +) + +from axolotl.loaders import load_processor, load_tokenizer +from axolotl.loaders.model import ModelLoader +from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) + + +def load_model_and_tokenizer( + *, + cfg: DictDefault, + inference: bool = False, +) -> tuple[ + PreTrainedModel, + PreTrainedTokenizer | PreTrainedTokenizerFast | Any, + ProcessorMixin | None, +]: + """ + Helper function for loading a model, tokenizer, and processor specified in the + given `axolotl` config. + + Args: + cfg: Dictionary mapping `axolotl` config keys to values. + inference: Boolean denoting inference mode. + + Returns: + Tuple of (PreTrainedModel, PreTrainedTokenizer, ProcessorMixin). + """ + LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}") + tokenizer = load_tokenizer(cfg) + + LOG.info("loading model...") + model_loader = ModelLoader(cfg, tokenizer, inference=inference) + model, _ = model_loader.load() + + processor = None + if cfg.is_multimodal: + LOG.info("loading processor...") + processor = load_processor(cfg, tokenizer) + + return model, tokenizer, processor diff --git a/src/axolotl/cli/sweeps.py b/src/axolotl/cli/utils/sweeps.py similarity index 100% rename from src/axolotl/cli/sweeps.py rename to src/axolotl/cli/utils/sweeps.py diff --git a/src/axolotl/cli/utils/train.py b/src/axolotl/cli/utils/train.py new file mode 100644 index 000000000..f1ac857b3 --- /dev/null +++ b/src/axolotl/cli/utils/train.py @@ -0,0 +1,222 @@ +"""Utilities for axolotl train CLI command.""" + +import os +import subprocess # nosec +import sys +import tempfile +from typing import Any, Iterator, Literal + +import yaml + +from axolotl.cli.utils.sweeps import generate_sweep_configs + + +def _add_default_rdzv_args(launcher_args: list[str]) -> list[str]: + """ + Add default RDZV arguments if rdzv_endpoint is set but rdzv_backend/rdzv_id are missing. + + Args: + launcher_args: List of launcher arguments + + Returns: + Updated launcher args with defaults added if needed + """ + args = launcher_args.copy() + + # Check if rdzv_endpoint is present + has_rdzv_endpoint = any("--rdzv_endpoint" in arg for arg in args) + + if has_rdzv_endpoint: + # Check if rdzv_backend is already provided + has_rdzv_backend = any("--rdzv_backend" in arg for arg in args) + if not has_rdzv_backend: + args.extend(["--rdzv_backend", "c10d"]) + + # Check if rdzv_id is already provided + has_rdzv_id = any("--rdzv_id" in arg for arg in args) + if not has_rdzv_id: + import uuid + + args.extend(["--rdzv_id", str(uuid.uuid4())[:8]]) + + return args + + +def build_command(base_cmd: list[str], options: dict[str, Any]) -> list[str]: + """ + Build command list from base command and options. + + Args: + base_cmd: Command without options. + options: Options to parse and append to base command. + + Returns: + List of strings giving shell command. 
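For intuition, a minimal sketch of what this helper produces under the implementation that follows (underscores become hyphens, `None` values are skipped, and every remaining option is emitted as `--key=value`); the config keys and values here are only illustrative:

    build_command(
        ["torchrun", "-m", "axolotl.cli.train", "config.yaml"],
        {"micro_batch_size": 2, "learning_rate": 1e-5, "resume_from_checkpoint": None},
    )
    # -> ["torchrun", "-m", "axolotl.cli.train", "config.yaml",
    #     "--micro-batch-size=2", "--learning-rate=1e-05"]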
+ """ + cmd = base_cmd.copy() + + for key, value in options.items(): + if value is None: + continue + + key = key.replace("_", "-") + cmd.append(f"--{key}={value}") + + return cmd + + +def generate_config_files(config: str, sweep: str | None) -> Iterator[tuple[str, bool]]: + """ + Generate list of configuration files to process. + + Args: + config: Base configuration file + sweep: Sweep configuration file + + Yields: + Tuple of configuration file name and whether this is a group of configurations + """ + + if not sweep: + yield config, False + return + + # Load sweep and base configurations + with open(sweep, "r", encoding="utf-8") as fin: + sweep_config: dict[str, list] = yaml.safe_load(fin) + with open(config, "r", encoding="utf-8") as fin: + base_config: dict[str, list] = yaml.safe_load(fin) + + # Generate all possible configurations + permutations = generate_sweep_configs(base_config, sweep_config) + is_group = len(permutations) > 1 + for permutation in permutations: + # pylint: disable=consider-using-with + temp_file = tempfile.NamedTemporaryFile( + mode="w", + suffix=".yaml", + delete=False, + encoding="utf-8", + ) + yaml.dump(permutation, temp_file) + temp_file.close() + yield temp_file.name, is_group + + +def launch_training( + cfg_file: str, + launcher: Literal["accelerate", "torchrun", "python"] | None, + cloud: str | None, + kwargs: dict, + launcher_args: list[str] | None = None, + use_exec: bool = False, +) -> None: + """Execute training with the given configuration.""" + launcher_args = launcher_args or [] + + if cloud: + _launch_cloud_training(cloud, cfg_file, launcher, kwargs, launcher_args) + elif launcher: + if launcher == "accelerate": + _launch_accelerate_training(cfg_file, kwargs, launcher_args, use_exec) + elif launcher == "torchrun": + _launch_torchrun_training(cfg_file, kwargs, launcher_args, use_exec) + elif launcher == "python": + _launch_python_training(cfg_file, kwargs) + elif launcher is None: + # handle ray train launch + _launch_python_training(cfg_file, kwargs) + + +def _launch_cloud_training( + cloud: str, + cfg_file: str, + launcher: Literal["accelerate", "torchrun", "python"] | None, + kwargs: dict, + launcher_args: list[str] | None = None, +) -> None: + """Execute training via cloud launcher.""" + from axolotl.cli.cloud import do_cli_train + + launcher_args = launcher_args or [] + cwd = os.getcwd() if launcher else None + + do_cli_train( + cloud_config=cloud, + config=cfg_file, + launcher=launcher or "accelerate", + launcher_args=launcher_args, + cwd=cwd, + **kwargs, + ) + + +def _launch_accelerate_training( + cfg_file: str, + kwargs: dict, + launcher_args: list[str] | None = None, + use_exec: bool = False, +) -> None: + """Execute training via accelerate launcher.""" + launcher_args = launcher_args or [] + internal_launcher_args = [] + + # Extract launcher-specific arguments from kwargs (legacy support) + if "main_process_port" in kwargs: + main_process_port = kwargs.pop("main_process_port") + internal_launcher_args.extend(["--main_process_port", str(main_process_port)]) + + if "num_processes" in kwargs: + num_processes = kwargs.pop("num_processes") + internal_launcher_args.extend(["--num_processes", str(num_processes)]) + + # Combine internal args with user-provided launcher args + all_launcher_args = internal_launcher_args + launcher_args + + base_cmd = ( + ["accelerate", "launch"] + all_launcher_args + ["-m", "axolotl.cli.train"] + ) + if cfg_file: + base_cmd.append(cfg_file) + + cmd = build_command(base_cmd, kwargs) + if use_exec: + # make sure to 
flush stdout and stderr before replacing the process + sys.stdout.flush() + sys.stderr.flush() + os.execvpe(cmd[0], cmd, os.environ) # nosec B606 + else: + subprocess.run(cmd, check=True) # nosec B603 + + +def _launch_torchrun_training( + cfg_file: str, + kwargs: dict, + launcher_args: list[str] | None = None, + use_exec: bool = False, +) -> None: + """Execute training via torchrun launcher.""" + launcher_args = launcher_args or [] + + # Add default RDZV arguments if rdzv_endpoint is set + launcher_args = _add_default_rdzv_args(launcher_args) + + base_cmd = ["torchrun"] + launcher_args + ["-m", "axolotl.cli.train"] + if cfg_file: + base_cmd.append(cfg_file) + + cmd = build_command(base_cmd, kwargs) + if use_exec: + # make sure to flush stdout and stderr before replacing the process + sys.stdout.flush() + sys.stderr.flush() + os.execvpe(cmd[0], cmd, os.environ) # nosec B606 + else: + subprocess.run(cmd, check=True) # nosec B603 + + +def _launch_python_training(cfg_file: str, kwargs: dict) -> None: + """Execute training via python launcher.""" + from axolotl.cli.train import do_cli + + do_cli(config=cfg_file, **kwargs) diff --git a/src/axolotl/cli/vllm_serve.py b/src/axolotl/cli/vllm_serve.py index d3c4ad68d..cf687bea2 100644 --- a/src/axolotl/cli/vllm_serve.py +++ b/src/axolotl/cli/vllm_serve.py @@ -2,6 +2,7 @@ CLI to start the vllm server for online RL """ +from dataclasses import dataclass, field from pathlib import Path from typing import Union @@ -10,6 +11,16 @@ from trl.scripts.vllm_serve import ScriptArguments from axolotl.cli.config import load_cfg +@dataclass +class AxolotlScriptArguments(ScriptArguments): + """ + Additional arguments for the VLLM server + """ + + reasoning_parser: str = field(default="", kw_only=True) + enable_reasoning: bool | None = field(default=None, kw_only=True) + + def do_vllm_serve( config: Union[Path, str], cli_args: dict, @@ -29,10 +40,17 @@ def do_vllm_serve( serve_module = cli_args.get("serve_module", "trl.scripts.vllm_serve") vllm_serve_main = getattr(__import__(serve_module, fromlist=["main"]), "main") + tensor_parallel_size = 1 + data_parallel_size = 1 - tensor_parallel_size = ( - cli_args.get("tensor_parallel_size") or cfg.vllm.tensor_parallel_size - ) + if cli_args.get("tensor_parallel_size") or cfg.vllm.tensor_parallel_size: + tensor_parallel_size = ( + cli_args.get("tensor_parallel_size") or cfg.vllm.tensor_parallel_size + ) + if cli_args.get("data_parallel_size") or cfg.vllm.data_parallel_size: + data_parallel_size = ( + cli_args.get("data_parallel_size") or cfg.vllm.data_parallel_size + ) host = cli_args.get("host") or cfg.vllm.host port = cli_args.get("port") or cfg.vllm.port gpu_memory_utilization = ( @@ -43,15 +61,25 @@ def do_vllm_serve( enable_prefix_caching = ( cli_args.get("enable_prefix_caching") or cfg.vllm.enable_prefix_caching ) + reasoning_parser = ( + cli_args.get("reasoning_parser") or cfg.vllm.reasoning_parser or "" + ) + enable_reasoning = ( + cli_args.get("enable_reasoning") or cfg.vllm.enable_reasoning or False + ) - vllm_script_args = ScriptArguments( - model, + # pylint: disable=unexpected-keyword-arg + vllm_script_args = AxolotlScriptArguments( + model=model, tensor_parallel_size=tensor_parallel_size, + data_parallel_size=data_parallel_size, host=host, port=port, gpu_memory_utilization=gpu_memory_utilization, dtype=dtype, max_model_len=max_model_len, enable_prefix_caching=enable_prefix_caching, + reasoning_parser=reasoning_parser, + enable_reasoning=enable_reasoning, ) vllm_serve_main(vllm_script_args) diff --git 
a/src/axolotl/common/architectures.py b/src/axolotl/common/architectures.py index 2f77b613e..ce945e670 100644 --- a/src/axolotl/common/architectures.py +++ b/src/axolotl/common/architectures.py @@ -13,4 +13,5 @@ MOE_ARCH_BLOCK = { "qwen2_moe": "Qwen2MoeSparseMoeBlock", "qwen3_moe": "Qwen3MoeSparseMoeBlock", "deepseek_v2": "DeepseekV2MoE", + "gpt_oss": "GptOssDecoderLayer", } diff --git a/src/axolotl/common/const.py b/src/axolotl/common/const.py index fd34ad469..8aae06e99 100644 --- a/src/axolotl/common/const.py +++ b/src/axolotl/common/const.py @@ -1,5 +1,3 @@ -""" -Various shared constants -""" +"""Various shared constants""" DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared" diff --git a/src/axolotl/common/datasets.py b/src/axolotl/common/datasets.py index e3ffb7ae9..761317dfb 100644 --- a/src/axolotl/common/datasets.py +++ b/src/axolotl/common/datasets.py @@ -1,23 +1,21 @@ """Dataset loading utilities.""" -import logging import math import random from dataclasses import dataclass -from typing import Optional, Union from datasets import Dataset import axolotl.monkeypatch.data.batch_dataset_fetcher # pylint: disable=unused-import # noqa: F401 from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs from axolotl.loaders import load_processor, load_tokenizer -from axolotl.utils.data import prepare_dataset -from axolotl.utils.data.rl import load_prepare_preference_datasets +from axolotl.utils.data import prepare_datasets, prepare_preference_datasets from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger from axolotl.utils.schemas.enums import RLType from axolotl.utils.tokenization import check_dataset_labels -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) @dataclass @@ -30,16 +28,7 @@ class TrainDatasetMeta: def sample_dataset(dataset: Dataset, num_samples: int) -> Dataset: - """ - Randomly sample `num_samples` samples from `dataset`. - - Args: - dataset: Dataset. - num_samples: Number of samples to return. - - Returns: - Random sample (with replacement) of examples in `dataset`. - """ + """Randomly sample `num_samples` samples with replacement from `dataset`.""" return dataset.select( [random.randrange(0, len(dataset) - 1) for _ in range(num_samples)] # nosec ) @@ -51,55 +40,52 @@ def load_datasets( cli_args: PreprocessCliArgs | TrainerCliArgs | None = None, debug: bool = False, ) -> TrainDatasetMeta: - """ - Loads one or more training or evaluation datasets, calling - `axolotl.utils.data.prepare_dataset`. Optionally, logs out debug information. + """Loads one or more training or evaluation datasets, calling + `axolotl.utils.data.prepare_datasets`. Optionally, logs out debug information. Args: cfg: Dictionary mapping `axolotl` config keys to values. cli_args: Command-specific CLI arguments. - debug: Whether to print out tokenization of sample + debug: Whether to print out tokenization of sample. This is duplicated in + `cfg` and `cli_args`, but is kept due to use in our Colab notebooks. Returns: Dataclass with fields for training and evaluation datasets and the computed - `total_num_steps`. + `total_num_steps`. 
""" tokenizer = load_tokenizer(cfg) processor = load_processor(cfg, tokenizer=tokenizer) if cfg.processor_type else None - preprocess_iterable = ( - cli_args - and hasattr(cli_args, "iterable") - and cli_args.iterable is not None - and cli_args.iterable - ) + preprocess_iterable = getattr(cli_args, "iterable", False) - train_dataset, eval_dataset, total_num_steps, prompters = prepare_dataset( + train_dataset, eval_dataset, total_num_steps, prompters = prepare_datasets( cfg, tokenizer, processor=processor, preprocess_iterable=preprocess_iterable, ) - if ( # pylint: disable=too-many-boolean-expressions - cli_args - and ( - cli_args.debug - or cfg.debug - or cli_args.debug_text_only - or int(cli_args.debug_num_examples) > 0 - ) - ) or debug: + if ( + cfg.debug + or getattr(cli_args, "debug", False) + or getattr(cli_args, "debug_text_only", False) + or getattr(cli_args, "debug_num_examples", 0) > 0 + or debug + ): LOG.info("check_dataset_labels...") num_examples = cli_args.debug_num_examples if cli_args else 1 text_only = cli_args.debug_text_only if cli_args else False - train_samples = sample_dataset(train_dataset, num_examples) - check_dataset_labels( - train_samples, - tokenizer, - num_examples=num_examples, - text_only=text_only, - ) + try: + train_samples = sample_dataset(train_dataset, num_examples) + check_dataset_labels( + train_samples, + tokenizer, + num_examples=num_examples, + text_only=text_only, + ) + except AttributeError: + # can't sample iterable datasets + pass LOG.info("printing prompters...") for prompter in prompters: @@ -113,13 +99,10 @@ def load_datasets( def load_preference_datasets( - *, - cfg: DictDefault, - cli_args: Union[PreprocessCliArgs, TrainerCliArgs], + *, cfg: DictDefault, cli_args: PreprocessCliArgs | TrainerCliArgs | None = None ) -> TrainDatasetMeta: - """ - Loads one or more training or evaluation datasets for RL training using paired - preference data, calling `axolotl.utils.data.rl.load_prepare_preference_datasets`. + """Loads one or more training or evaluation datasets for RL training using paired + preference data, calling `axolotl.utils.data.rl.prepare_preference_datasets`. Optionally, logs out debug information. Args: @@ -130,23 +113,28 @@ def load_preference_datasets( Dataclass with fields for training and evaluation datasets and the computed `total_num_steps`. 
""" - train_dataset, eval_dataset = load_prepare_preference_datasets(cfg) - total_num_steps: Optional[int] = int( - math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size) - ) - if cfg.rl is RLType.GRPO: - total_num_steps = None + tokenizer = load_tokenizer(cfg) + train_dataset, eval_dataset = prepare_preference_datasets(cfg, tokenizer) - if cli_args.debug or cfg.debug: + total_num_steps: int | None = None + if cfg.rl is not RLType.GRPO: + total_num_steps = int( + math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size) + ) + + if ((cli_args and cli_args.debug) or cfg.debug) and cfg.rl != RLType.ORPO: LOG.info("check_dataset_labels...") + num_examples = cli_args.debug_num_examples if cli_args else 1 + text_only = cli_args.debug_text_only if cli_args else False + tokenizer = load_tokenizer(cfg) - train_samples = sample_dataset(train_dataset, cli_args.debug_num_examples) + train_samples = sample_dataset(train_dataset, num_examples) check_dataset_labels( - train_samples, - tokenizer, - num_examples=cli_args.debug_num_examples, - text_only=cli_args.debug_text_only, + dataset=train_samples, + tokenizer=tokenizer, + num_examples=num_examples, + text_only=text_only, rl_mode=True, ) diff --git a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/__init__.py b/src/axolotl/core/attention/__init__.py similarity index 100% rename from src/axolotl/integrations/cut_cross_entropy/monkeypatch/__init__.py rename to src/axolotl/core/attention/__init__.py diff --git a/src/axolotl/core/attention/flex_block_mask.py b/src/axolotl/core/attention/flex_block_mask.py new file mode 100644 index 000000000..fb9820f35 --- /dev/null +++ b/src/axolotl/core/attention/flex_block_mask.py @@ -0,0 +1,162 @@ +""" +monkeypatch for flex + packing +""" + +import sys +from typing import Callable, Optional, Union + +import torch +from torch.nn.attention.flex_attention import BlockMask +from transformers import Cache, PretrainedConfig +from transformers.masking_utils import ( + ALL_MASK_ATTENTION_FUNCTIONS, + _preprocess_mask_arguments, + and_masks, + causal_mask_function, + or_masks, +) +from transformers.utils import is_torch_greater_or_equal + +_is_torch_greater_or_equal_than_2_6 = is_torch_greater_or_equal("2.6", accept_dev=True) + + +def create_causal_mask( + config: PretrainedConfig, + input_embeds: torch.Tensor, + attention_mask: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Optional[Cache], + or_mask_function: Optional[Callable] = None, + and_mask_function: Optional[Callable] = None, +) -> Optional[Union[torch.Tensor, BlockMask]]: + """ + Create a standard causal mask based on the attention implementation used (stored in the config). If `past_key_values` + has an HybridCache structure, this function will return the mask corresponding to one of the "full_attention" layers (to align + to what is needed in the `modeling_xxx.py` files). + + Args: + config (`PretrainedConfig`): + The model config. + input_embeds (`torch.Tensor`): + The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the + batch size, query length and dtype. + attention_mask (`torch.Tensor`, optional): + The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length). + It can also be an already prepared 4D mask, in which case it is returned as-is. + cache_position (`torch.Tensor`): + A tensor of shape (query_length,) indicating the current indices of the input sequence elements. 
+ past_key_values (`Cache`, optional): + The past key values, if we use a cache. + or_mask_function (`Callable`, optional): + An optional mask function to combine with the causal mask function (by doing the union of both). This is + useful to easily overlay another mask on top of the causal one, for example for image tokens handling. + and_mask_function (`Callable`, optional): + An optional mask function to combine with the causal mask function (by doing the intersection of both). This is + useful to easily overlay another mask on top of the causal one, for example for image tokens handling. + """ + # If we have an HybridCache structure, here we want to create the mask for the full layers + if ( + past_key_values + and hasattr(past_key_values, "is_sliding") + and False in past_key_values.is_sliding + ): + layer_idx = past_key_values.is_sliding.index(False) + else: + layer_idx = 0 + + original_attention_mask = ( + None + if attention_mask is None + else attention_mask.clone().to(cache_position.device) + ) + early_exit, attention_mask, kv_length, kv_offset = _preprocess_mask_arguments( + config, input_embeds, attention_mask, cache_position, past_key_values, layer_idx + ) + if early_exit: + return attention_mask + + batch_size, total_seq_len = cache_position.shape + key_length = total_seq_len + document_ids = torch.nn.functional.pad( + original_attention_mask, value=0, pad=(0, key_length) + ) + + batch_size, dtype = input_embeds.shape[0], input_embeds.dtype + if attention_mask is not None: + + def causal_doc_mask_mod( + batch_idx, head_idx, q_idx, kv_idx + ): # pylint: disable=unused-argument + """ + Defines the logic of a block causal mask by combining both a standard causal mask + and a block diagonal document mask. + See :func:`~torchtune.modules.attention_utils.create_block_causal_mask` + for an illustration. 
+ """ + causal_mask_ = q_idx >= kv_idx # not valid when decoding + document_mask = ( + document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx] + ) + final_mask = causal_mask_ & document_mask + return final_mask + + mask_factory_function = causal_doc_mask_mod + else: + mask_factory_function = causal_mask_function + mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[ + config._attn_implementation # pylint: disable=protected-access + ] + + # Do not allow skip if we are compiling (this is to match BC) + allow_is_causal_skip = ( + not past_key_values.is_compileable if past_key_values is not None else True + ) + + # Allow slight deviations from causal mask + if or_mask_function is not None: + if not _is_torch_greater_or_equal_than_2_6: + raise ValueError( + "Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6" + ) + mask_factory_function = or_masks(mask_factory_function, or_mask_function) + allow_is_causal_skip = False + if and_mask_function is not None: + if not _is_torch_greater_or_equal_than_2_6: + raise ValueError( + "Using `or_mask_function` or `and_mask_function` arguments require torch>=2.6" + ) + mask_factory_function = and_masks(mask_factory_function, and_mask_function) + allow_is_causal_skip = False + + # We now create the mask + causal_mask = mask_interface( + batch_size=batch_size, + cache_position=cache_position, + kv_length=kv_length, + kv_offset=kv_offset, + mask_function=mask_factory_function, + attention_mask=attention_mask, + allow_is_causal_skip=allow_is_causal_skip, # additional kwarg for sdpa + dtype=dtype, # Additional kwarg for eager + config=config, # Pass the config as well, in case someone wants to easily have their own mask_interface + ) + return causal_mask + + +def patch_create_causal_mask(model_type): + import transformers.masking_utils + + transformers.masking_utils.create_causal_mask = create_causal_mask + + if model_type: + try: + # Dynamically import the module and attention class + module_path = f"transformers.models.{model_type}.modeling_{model_type}" + module = __import__(module_path) + module.create_causal_mask = create_causal_mask + del sys.modules[module_path] + except (ImportError, AttributeError) as e: + raise ValueError( + f"Could not import attention class for model_type: {model_type}. " + f"Error: {str(e)}" + ) from e diff --git a/src/axolotl/core/builders/__init__.py b/src/axolotl/core/builders/__init__.py new file mode 100644 index 000000000..5bd244434 --- /dev/null +++ b/src/axolotl/core/builders/__init__.py @@ -0,0 +1,6 @@ +"""Trainer builder classes""" + +from .causal import HFCausalTrainerBuilder +from .rl import HFRLTrainerBuilder + +__all__ = ["HFCausalTrainerBuilder", "HFRLTrainerBuilder"] diff --git a/src/axolotl/core/builders/base.py b/src/axolotl/core/builders/base.py new file mode 100644 index 000000000..e1f649715 --- /dev/null +++ b/src/axolotl/core/builders/base.py @@ -0,0 +1,540 @@ +# Copyright 2024 Axolotl AI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Base class for trainer builder""" + +import abc +import importlib +import logging +import sys +from abc import abstractmethod +from contextlib import suppress +from pathlib import Path +from typing import Any + +import torch +from transformers import ( + TrainerCallback, +) +from transformers.trainer_pt_utils import AcceleratorConfig + +from axolotl.integrations.base import PluginManager +from axolotl.monkeypatch.trainer.lr import patch_trainer_get_lr +from axolotl.utils import is_comet_available, is_mlflow_available +from axolotl.utils.callbacks import ( + GCCallback, + SaveAxolotlConfigtoWandBCallback, + SaveModelOnFirstStepCallback, +) +from axolotl.utils.callbacks.profiler import PytorchProfilerCallback +from axolotl.utils.distributed import build_parallelism_config +from axolotl.utils.schemas.enums import CustomSupportedOptimizers + +LOG = logging.getLogger(__name__) + +with suppress(ImportError): + import torch._dynamo # pylint: disable=ungrouped-imports + + +class TrainerBuilderBase(abc.ABC): + """Base class for trainer builder.""" + + def __init__(self, cfg, model, tokenizer, processor=None): + self.cfg = cfg + self.model = model + self.tokenizer = tokenizer + self.processor = processor + + self._train_dataset = None + self._eval_dataset = None + self._model_ref = None + self._peft_config = None + + # If the model supports tagging, add the axolotl tag. + # This makes sure the tag is correctly pushed even if a user calls + # model.push_to_hub instead of trainer.push_to_hub. + if hasattr(model, "add_model_tags"): + model.add_model_tags(["axolotl"]) + + patch_trainer_get_lr() + + @property + def model_ref(self): + return self._model_ref + + @model_ref.setter + def model_ref(self, model): + self._model_ref = model + + @property + def train_dataset(self): + return self._train_dataset + + @train_dataset.setter + def train_dataset(self, dataset): + self._train_dataset = dataset + + @property + def eval_dataset(self): + return self._eval_dataset + + @eval_dataset.setter + def eval_dataset(self, dataset): + self._eval_dataset = dataset + + @property + def peft_config(self): + return self._peft_config + + @peft_config.setter + def peft_config(self, peft_config): + self._peft_config = peft_config + + @abstractmethod + def build(self, total_num_steps): + pass + + def get_callbacks(self) -> list[TrainerCallback]: + callbacks = [] + + plugin_manager = PluginManager.get_instance() + callbacks.extend( + plugin_manager.add_callbacks_pre_trainer(cfg=self.cfg, model=self.model) + ) + + if self.cfg.gc_steps: + callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps)) + + if self.cfg.use_wandb: + callbacks.append( + SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path) + ) + if self.cfg.use_mlflow and is_mlflow_available(): + from axolotl.utils.callbacks.mlflow_ import ( + SaveAxolotlConfigtoMlflowCallback, + ) + + callbacks.extend( + [ + SaveAxolotlConfigtoMlflowCallback(self.cfg.axolotl_config_path), + ] + ) + if self.cfg.use_comet and is_comet_available(): + from axolotl.utils.callbacks.comet_ import SaveAxolotlConfigtoCometCallback + + callbacks.append( + SaveAxolotlConfigtoCometCallback(self.cfg.axolotl_config_path) + ) + if self.cfg.save_first_step: + callbacks.append(SaveModelOnFirstStepCallback()) + + if self.cfg.profiler_steps: + callbacks.append( + PytorchProfilerCallback( + steps_to_profile=self.cfg.profiler_steps, + profiler_steps_start=self.cfg.profiler_steps_start, + ) + ) + + return callbacks + + def get_post_trainer_create_callbacks(self, trainer): + """ + Callbacks added 
after the trainer is created, usually b/c these need access to the trainer + """ + callbacks = [] + if self.cfg.plugins: + plugin_manager = PluginManager.get_instance() + callbacks.extend( + [ + cb + for cb in plugin_manager.add_callbacks_post_trainer( + self.cfg, trainer + ) + if cb + ] + ) + return callbacks + + def hook_pre_create_training_args(self, training_arguments_kwargs): + # TODO + return training_arguments_kwargs + + def hook_post_create_training_args(self, training_arguments): + # TODO + return training_arguments + + def hook_pre_create_trainer(self, trainer_kwargs, trainer_cls): + # TODO + return trainer_kwargs, trainer_cls + + def hook_post_create_trainer(self, trainer): + # TODO + return trainer + + def _configure_warmup_and_logging( + self, total_num_steps: int, training_args_kwargs: dict + ): + warmup_steps = 0 + warmup_ratio = 0.0 + if self.cfg.warmup_steps: + warmup_steps = self.cfg.warmup_steps + elif self.cfg.warmup_ratio: + if total_num_steps: + warmup_steps = max(int(self.cfg.warmup_ratio * total_num_steps), 0) + else: + warmup_ratio = self.cfg.warmup_ratio + elif total_num_steps: + warmup_steps = min(int(0.03 * total_num_steps), 100) + else: + warmup_ratio = 0.03 + + if warmup_steps == 1: + warmup_steps = 2 + + if self.cfg.logging_steps is not None: + training_args_kwargs["logging_steps"] = self.cfg.logging_steps + else: + training_args_kwargs["logging_steps"] = ( + 500 # transformers defaults to 500 + if not total_num_steps + else max(min(int(0.005 * total_num_steps), 10), 1) + ) + + training_args_kwargs["warmup_ratio"] = warmup_ratio + training_args_kwargs["warmup_steps"] = warmup_steps + + def _configure_precision_settings(self, training_args_kwargs: dict): + training_args_kwargs["fp16"] = (self.cfg.fp16 and not self.cfg.bf16) or False + training_args_kwargs["tf32"] = self.cfg.tf32 + if self.cfg.bf16 == "full": + training_args_kwargs["bf16_full_eval"] = True + else: + bf16 = self.cfg.bf16 or self.cfg.bfloat16 + bf16 = bf16 if bf16 is not None else False + training_args_kwargs["bf16"] = bf16 + + def _configure_scheduler(self, training_args_kwargs: dict): + if self.cfg.lr_scheduler in ["one_cycle", "rex"]: + training_args_kwargs["lr_scheduler_type"] = "cosine" + training_args_kwargs["alternate_lr_scheduler_type"] = self.cfg.lr_scheduler + else: + training_args_kwargs["lr_scheduler_type"] = ( + self.cfg.lr_scheduler if self.cfg.lr_scheduler else "cosine" + ) + training_args_kwargs["lr_scheduler_kwargs"] = ( + self.cfg.lr_scheduler_kwargs if self.cfg.lr_scheduler_kwargs else {} + ) + + def _configure_optimizer(self, training_args_kwargs: dict, trainer_kwargs: dict): + def _configure_custom_optimizer( + training_args_kwargs: dict, trainer_kwargs: dict + ): + # Common optimizer kwargs + optimizer_kwargs = { + "lr": training_args_kwargs["learning_rate"], + "weight_decay": training_args_kwargs["weight_decay"], + } + + # Adam-specific kwargs + adam_kwargs: dict = {} + if training_args_kwargs.get("adam_beta1") and training_args_kwargs.get( + "adam_beta2" + ): + adam_kwargs["betas"] = ( + training_args_kwargs.get("adam_beta1"), + training_args_kwargs.get("adam_beta2"), + ) + if training_args_kwargs.get("adam_epsilon"): + adam_kwargs["eps"] = training_args_kwargs.get("adam_epsilon") + + if self.cfg.optimizer == "muon": + from axolotl.contribs.mit.muon import ( # pylint: disable=no-name-in-module + MuonOptimizerFactory, + ) + + optimizer_cls = MuonOptimizerFactory + optimizer_kwargs.update(adam_kwargs) + elif self.cfg.optimizer == "dion": + from axolotl.contribs.mit.dion 
import ( # pylint: disable=no-name-in-module + DionOptimizerFactory, + ) + + optimizer_cls = DionOptimizerFactory + optimizer_kwargs["dion_lr"] = training_args_kwargs["dion_learning_rate"] + optimizer_kwargs["dion_mu"] = training_args_kwargs["dion_momentum"] + optimizer_kwargs.update(adam_kwargs) + _, device_mesh = build_parallelism_config(self.cfg) + if device_mesh is not None: + optimizer_kwargs["device_mesh"] = device_mesh + elif self.cfg.optimizer == "optimi_adamw": + from optimi import AdamW + + optimizer_kwargs["foreach"] = False + optimizer_cls = AdamW + optimizer_kwargs.update(adam_kwargs) + elif self.cfg.optimizer == "ao_adamw_fp8": + from torchao.prototype.low_bit_optim import AdamWFp8 + + optimizer_cls = AdamWFp8 + optimizer_kwargs.update(adam_kwargs) + elif self.cfg.optimizer == "adopt_adamw": + from axolotl.utils.optimizers.adopt import ADOPT + + optimizer_cls = ADOPT + adam_kwargs["decouple"] = True + optimizer_kwargs.update(adam_kwargs) + elif self.cfg.optimizer == "came_pytorch": + from came_pytorch import CAME + + optimizer_cls = CAME + + beta1 = training_args_kwargs.get("adam_beta1", 0.9) + beta2 = training_args_kwargs.get("adam_beta2", 0.999) + beta3 = training_args_kwargs.get("adam_beta3", 0.9999) + eps1 = training_args_kwargs.get("adam_epsilon", 1e-30) + eps2 = training_args_kwargs.get("adam_epsilon2", 1e-16) + adam_kwargs["betas"] = (beta1, beta2, beta3) + adam_kwargs["eps"] = (eps1, eps2) + + optimizer_kwargs.update(adam_kwargs) + else: + raise ValueError( + f"Unhandled optimizer: {self.cfg.optimizer}. Please raise an Issue." + ) + + # Parse any additional optimizer args from config + if self.cfg.optim_args: + if isinstance(self.cfg.optim_args, dict): + optimizer_kwargs.update(self.cfg.optim_args) + else: + # Parse string format "key1=value1,key2=value2" + for mapping in self.cfg.optim_args.replace(" ", "").split(","): + key, value = mapping.split("=") + optimizer_kwargs[key] = value + + # Note: This is not used in training_args_kwargs, but in trainer_kwargs + trainer_kwargs["optimizer_cls_and_kwargs"] = ( + optimizer_cls, + optimizer_kwargs, + ) + + # Handle custom optimizer + custom_supported_optimizers = [opt.value for opt in CustomSupportedOptimizers] + if self.cfg.optimizer in custom_supported_optimizers: + _configure_custom_optimizer(training_args_kwargs, trainer_kwargs) + else: + # Use transformers' optimizer + training_args_kwargs["optim"] = self.cfg.optimizer + + # Parse any additional optimizer args from config + if self.cfg.optim_args: + if isinstance(self.cfg.optim_args, dict): + optim_args = ",".join( + [f"{key}={value}" for key, value in self.cfg.optim_args.items()] + ) + else: + optim_args = self.cfg.optim_args + training_args_kwargs["optim_args"] = optim_args + + if ( + self.cfg.optimizer == "adamw_anyprecision" + and Path(self.cfg.torchdistx_path).exists() + ): + sys.path.append(self.cfg.torchdistx_path) + importlib.import_module("torchdistx") + + def _configure_hub_parameters(self, training_args_kwargs: dict): + if self.cfg.hub_model_id: + training_args_kwargs["hub_model_id"] = self.cfg.hub_model_id + training_args_kwargs["push_to_hub"] = True + training_args_kwargs["hub_private_repo"] = True + training_args_kwargs["hub_always_push"] = True + + if self.cfg.hub_strategy: + training_args_kwargs["hub_strategy"] = self.cfg.hub_strategy + + def _configure_save_and_eval_strategy(self, training_args_kwargs: dict): + # save_strategy and save_steps + if self.cfg.save_steps: + training_args_kwargs["save_strategy"] = "steps" + 
training_args_kwargs["save_steps"] = self.cfg.save_steps + elif self.cfg.save_strategy: + training_args_kwargs["save_strategy"] = self.cfg.save_strategy + else: + # default to saving each epoch if not defined + training_args_kwargs["save_strategy"] = "epoch" + + training_args_kwargs["save_total_limit"] = ( + self.cfg.save_total_limit if self.cfg.save_total_limit else 4 + ) + + # eval_strategy and eval_steps + if not self.eval_dataset and self.cfg.val_set_size == 0: + # do not eval if no eval_dataset and val_set_size=0 + training_args_kwargs["eval_strategy"] = "no" + elif self.cfg.eval_steps: + training_args_kwargs["eval_strategy"] = "steps" + training_args_kwargs["eval_steps"] = self.cfg.eval_steps + training_args_kwargs["eval_on_start"] = True + elif self.cfg.eval_strategy: + training_args_kwargs["eval_strategy"] = self.cfg.eval_strategy + training_args_kwargs["eval_on_start"] = True + + def _configure_reporting(self, training_args_kwargs: dict): + report_to = [] + if self.cfg.use_wandb: + report_to.append("wandb") + if self.cfg.use_mlflow: + report_to.append("mlflow") + if self.cfg.use_tensorboard: + report_to.append("tensorboard") + if self.cfg.use_comet: + report_to.append("comet_ml") + + training_args_kwargs["report_to"] = report_to + + if self.cfg.use_wandb: + training_args_kwargs["run_name"] = self.cfg.wandb_name + elif self.cfg.use_mlflow: + training_args_kwargs["run_name"] = self.cfg.mlflow_run_name + else: + training_args_kwargs["run_name"] = None + + def _configure_torch_compile(self, training_args_kwargs: dict): + if self.cfg.torch_compile and getattr(torch, "_dynamo", None): + torch._dynamo.config.suppress_errors = ( # pylint: disable=protected-access + True + ) + torch._dynamo.config.accumulated_cache_size_limit = ( # pylint: disable=protected-access + 256 + ) + training_args_kwargs["torch_compile"] = self.cfg.torch_compile + if self.cfg.torch_compile_backend: + training_args_kwargs["torch_compile_backend"] = ( + self.cfg.torch_compile_backend + ) + if self.cfg.torch_compile_mode: + training_args_kwargs["torch_compile_mode"] = self.cfg.torch_compile_mode + + def _configure_accelerator_config(self, training_args_kwargs: dict): + if self.cfg.accelerator_config: + training_args_kwargs["accelerator_config"] = AcceleratorConfig( + **self.cfg.accelerator_config + ) + else: + training_args_kwargs["accelerator_config"] = AcceleratorConfig() + + def _configure_gradient_checkpointing(self, training_args_kwargs: dict): + if self.cfg.activation_offloading is True: + # don't use the HF gradient checkpointing, manually wrap + training_args_kwargs["gradient_checkpointing"] = False + training_args_kwargs["activation_offloading"] = True + elif self.cfg.gradient_checkpointing: + training_args_kwargs["gradient_checkpointing"] = ( + self.cfg.gradient_checkpointing + ) + if self.cfg.gradient_checkpointing_kwargs is not None: + training_args_kwargs["gradient_checkpointing_kwargs"] = ( + self.cfg.gradient_checkpointing_kwargs + ) + else: + training_args_kwargs["gradient_checkpointing_kwargs"] = { + "use_reentrant": False + } + + def _set_base_training_args( + self, total_num_steps + ) -> tuple[dict[str, Any], dict[str, Any]]: + training_args_kwargs: dict[str, Any] = {} + trainer_kwargs: dict[str, Any] = {} + + self._configure_warmup_and_logging(total_num_steps, training_args_kwargs) + self._configure_precision_settings(training_args_kwargs) + self._configure_save_and_eval_strategy(training_args_kwargs) + self._configure_gradient_checkpointing(training_args_kwargs) + + # set arg into 
trainer_args_kwargs with same name if value not None + for arg in [ + # optim/scheduler + "adam_beta1", + "adam_beta2", + "adam_beta3", + "adam_epsilon", + "adam_epsilon2", + "cosine_min_lr_ratio", + "cosine_constant_lr_ratio", + "optim_target_modules", + # trainer + "max_grad_norm", + "dataloader_num_workers", + "dataloader_pin_memory", + "dataloader_prefetch_factor", + "gradient_accumulation_steps", + "learning_rate", + "embedding_lr", + "embedding_lr_scale", + "lr_groups", + "loraplus_lr_ratio", + "loraplus_lr_embedding", + "output_dir", + "save_safetensors", + "save_only_model", + "include_tokens_per_second", + "weight_decay", + "seed", + "dion_momentum", + "dion_rank_fraction", + "dion_rank_multiple_of", + ]: + if hasattr(self.cfg, arg) and getattr(self.cfg, arg) is not None: + training_args_kwargs[arg] = getattr(self.cfg, arg) + + arg_map = { + "dion_learning_rate": "dion_lr", + } + for kwarg, cfg_arg in arg_map.items(): + if hasattr(self.cfg, cfg_arg) and getattr(self.cfg, cfg_arg) is not None: + training_args_kwargs[kwarg] = getattr(self.cfg, cfg_arg) + + training_args_kwargs["per_device_train_batch_size"] = self.cfg.micro_batch_size + training_args_kwargs["average_tokens_across_devices"] = False + + if self.cfg.eval_batch_size: + training_args_kwargs["per_device_eval_batch_size"] = ( + self.cfg.eval_batch_size + ) + + training_args_kwargs["max_steps"] = self.cfg.max_steps or total_num_steps or -1 + training_args_kwargs["num_train_epochs"] = self.cfg.num_epochs + + if self.cfg.dataset_processes: + training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes + + # max_length is not used in CausalTrainer + if self.cfg.reward_model or self.cfg.rl: + training_args_kwargs["max_length"] = self.cfg.sequence_len + + if self.cfg.fsdp_config or self.cfg.fsdp: + training_args_kwargs["fsdp_config"] = self.cfg.fsdp_config + training_args_kwargs["fsdp"] = self.cfg.fsdp if self.cfg.fsdp else True + + self._configure_reporting(training_args_kwargs) + self._configure_hub_parameters(training_args_kwargs) + self._configure_scheduler(training_args_kwargs) + self._configure_optimizer(training_args_kwargs, trainer_kwargs) + self._configure_torch_compile(training_args_kwargs) + self._configure_accelerator_config(training_args_kwargs) + + return training_args_kwargs, trainer_kwargs diff --git a/src/axolotl/core/builders/causal.py b/src/axolotl/core/builders/causal.py new file mode 100644 index 000000000..e5bc21762 --- /dev/null +++ b/src/axolotl/core/builders/causal.py @@ -0,0 +1,504 @@ +"""Builder for causal trainers""" + +import inspect +import math +import os +from pathlib import Path +from typing import Type, Union + +import transformers +from transformers import ( + DataCollatorWithFlattening, + EarlyStoppingCallback, +) +from trl.trainer.utils import RewardDataCollatorWithPadding + +from axolotl.core.builders.base import TrainerBuilderBase +from axolotl.core.trainers import ( + AxolotlMambaTrainer, + AxolotlPRMTrainer, + AxolotlRewardTrainer, + AxolotlTrainer, +) +from axolotl.integrations.base import PluginManager +from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES +from axolotl.monkeypatch.relora import ReLoRACallback +from axolotl.processing_strategies import get_processing_strategy +from axolotl.utils import is_comet_available, is_mlflow_available +from axolotl.utils.callbacks import ( + LossWatchDogCallback, + SaveBetterTransformerModelCallback, + bench_eval_callback_factory, + causal_lm_bench_eval_callback_factory, + colab_inference_post_train_callback, + 
log_prediction_callback_factory, +) +from axolotl.utils.callbacks.lisa import lisa_callback_factory +from axolotl.utils.callbacks.qat import QATCallback +from axolotl.utils.chat_templates import get_chat_template_from_config +from axolotl.utils.collators import ( + BatchSamplerDataCollatorForSeq2Seq, + DataCollatorForSeq2Seq, + MambaDataCollator, + V2BatchSamplerDataCollatorForSeq2Seq, +) +from axolotl.utils.collators.mm_chat import MultiModalChatDataCollator +from axolotl.utils.import_helper import get_cls_from_module_str +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) + + +class HFCausalTrainerBuilder(TrainerBuilderBase): + """ + Build the HuggingFace training args/trainer for causal models and reward modeling + using TRL. + """ + + def get_callbacks(self): + callbacks = super().get_callbacks() + + if self.cfg.relora: + callbacks.append(ReLoRACallback(self.cfg)) + + if ( + hasattr(self.model, "use_bettertransformer") + and self.model.use_bettertransformer is True + ): + callbacks.append(SaveBetterTransformerModelCallback()) + + # TODO: check if can move to base class + if self.cfg.loss_watchdog_threshold is not None: + callbacks.append(LossWatchDogCallback(self.cfg)) + + if self.cfg.qat: + callbacks.append(QATCallback(self.cfg.qat)) + + return callbacks + + def get_post_trainer_create_callbacks(self, trainer): + callbacks = [] + if self.cfg.use_wandb and self.cfg.eval_table_size > 0: + LogPredictionCallback = log_prediction_callback_factory( + trainer, self.tokenizer, "wandb" + ) + callbacks.append(LogPredictionCallback(self.cfg)) + if ( + self.cfg.use_mlflow + and is_mlflow_available() + and self.cfg.eval_table_size > 0 + ): + LogPredictionCallback = log_prediction_callback_factory( + trainer, self.tokenizer, "mlflow" + ) + callbacks.append(LogPredictionCallback(self.cfg)) + if self.cfg.use_comet and is_comet_available() and self.cfg.eval_table_size > 0: + LogPredictionCallback = log_prediction_callback_factory( + trainer, self.tokenizer, "comet_ml" + ) + callbacks.append(LogPredictionCallback(self.cfg)) + + if self.cfg.do_bench_eval: + callbacks.append(bench_eval_callback_factory(trainer, self.tokenizer)) + if self.cfg.do_causal_lm_eval: + CausalLMBenchEvalCallback = causal_lm_bench_eval_callback_factory( + trainer, self.tokenizer + ) + callbacks.append(CausalLMBenchEvalCallback(self.cfg)) + + if self.cfg.early_stopping_patience: + early_stop_cb = EarlyStoppingCallback( + self.cfg.early_stopping_patience, + ) + callbacks.append(early_stop_cb) + + if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers: + callbacks.append(lisa_callback_factory(trainer)) + + if any("COLAB_" in key for key in os.environ): + ColabCallback = colab_inference_post_train_callback(trainer) + callbacks.append(ColabCallback(self.cfg)) + + callbacks.extend(super().get_post_trainer_create_callbacks(trainer=trainer)) + return callbacks + + def _get_trainer_cls(self): + """ + Gets the trainer class for the given configuration. 
+ """ + if self.cfg.plugins: + plugin_manager = PluginManager.get_instance() + trainer_cls = plugin_manager.get_trainer_cls(self.cfg) + if trainer_cls: + return trainer_cls + if self.cfg.model_config_type == "mamba": + return AxolotlMambaTrainer + if self.cfg.reward_model: + return AxolotlRewardTrainer + if self.cfg.process_reward_model: + return AxolotlPRMTrainer + + if self.cfg.trainer_cls: + # override the trainer cls + try: + trainer_cls = get_cls_from_module_str(self.cfg.trainer_cls) + LOG.debug(f"Using custom trainer class: {self.cfg.trainer_cls}") + return trainer_cls + except (ImportError, AttributeError, ValueError) as e: + raise ValueError( + f"Failed to load custom trainer class '{self.cfg.trainer_cls}': {e}" + ) from e + + return AxolotlTrainer + + def build(self, total_num_steps): + from axolotl.core.training_args import ( + AxolotlPRMConfig, + AxolotlRewardConfig, + AxolotlTrainingArguments, + ) + + training_arguments_kwargs, trainer_kwargs = self._set_base_training_args( + total_num_steps + ) + if self.cfg.adapter == "qlora": + training_arguments_kwargs["qlora"] = True + + # deepspeed + if self.cfg.deepspeed: + training_arguments_kwargs["deepspeed"] = self.cfg.deepspeed + + if self.cfg.lr_quadratic_warmup is not None: + training_arguments_kwargs["lr_quadratic_warmup"] = ( + self.cfg.lr_quadratic_warmup + ) + + if self.cfg.dataloader_drop_last is not None: + training_arguments_kwargs["dataloader_drop_last"] = ( + self.cfg.dataloader_drop_last + ) + elif self.cfg.sample_packing and self.cfg.eval_sample_packing is False: + training_arguments_kwargs["dataloader_drop_last"] = True + + if self.cfg.remove_unused_columns is not None: + training_arguments_kwargs["remove_unused_columns"] = ( + self.cfg.remove_unused_columns + ) + + if self.cfg.do_bench_eval: + training_arguments_kwargs["do_bench_eval"] = self.cfg.do_bench_eval + if self.cfg.bench_dataset: + training_arguments_kwargs["bench_dataset"] = self.cfg.bench_dataset + if self.cfg.do_causal_lm_eval: + training_arguments_kwargs["do_causal_lm_eval"] = self.cfg.do_causal_lm_eval + if self.cfg.metric_for_best_model: + training_arguments_kwargs["metric_for_best_model"] = ( + self.cfg.metric_for_best_model + ) + if self.cfg.greater_is_better: + training_arguments_kwargs["greater_is_better"] = self.cfg.greater_is_better + + # DDP Config + if self.cfg.ddp_timeout: + training_arguments_kwargs["ddp_timeout"] = self.cfg.ddp_timeout + # see https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html + if self.cfg.ddp_bucket_cap_mb: + training_arguments_kwargs["ddp_bucket_cap_mb"] = self.cfg.ddp_bucket_cap_mb + if self.cfg.ddp_broadcast_buffers is not None: + training_arguments_kwargs["ddp_broadcast_buffers"] = ( + self.cfg.ddp_broadcast_buffers + ) + + # these are all the "standard" kwargs that are def used + training_arguments_kwargs["max_seq_length"] = self.cfg.sequence_len + + if self.cfg.auto_find_batch_size is not None: + training_arguments_kwargs["auto_find_batch_size"] = ( + self.cfg.auto_find_batch_size + ) + + training_arguments_kwargs["eval_accumulation_steps"] = ( + self.cfg.gradient_accumulation_steps + ) + + training_arguments_kwargs["load_best_model_at_end"] = ( + ( + self.cfg.load_best_model_at_end is not False + or self.cfg.early_stopping_patience + ) + and ( + (not self.cfg.test_datasets and self.cfg.val_set_size > 0) + or (self.cfg.test_datasets and self.cfg.val_set_size == 0) + ) + and self.cfg.save_steps + and self.cfg.eval_steps + and self.cfg.save_steps % self.cfg.eval_steps == 0 + ) or 
False + + # handle ddp + ddp_find_unused_parameters = None + if self.cfg.ddp: + ddp_find_unused_parameters = bool(self.cfg.ddp_find_unused_parameters) + training_arguments_kwargs["ddp_find_unused_parameters"] = ( + ddp_find_unused_parameters + ) + + training_arguments_kwargs["group_by_length"] = self.cfg.group_by_length + training_arguments_kwargs["curriculum_sampling"] = self.cfg.curriculum_sampling + + training_arguments_kwargs["sample_packing"] = bool(self.cfg.sample_packing) + training_arguments_kwargs["sample_packing_drop_attention_mask"] = bool( + self.cfg.flash_attention + or self.cfg.xformers_attention + or self.cfg.flex_attention + ) + training_arguments_kwargs["multipack_real_batches"] = ( + self.cfg.multipack_real_batches + if self.cfg.multipack_real_batches is not None + else not ( + self.cfg.flash_attention + or self.cfg.flex_attention + or self.cfg.xformers_attention + ) + ) + training_arguments_kwargs["eval_sample_packing"] = bool( + self.cfg.eval_sample_packing + ) + if self.cfg.sample_packing_sequentially is not None: + training_arguments_kwargs["sample_packing_sequentially"] = ( + self.cfg.sample_packing_sequentially + ) + if self.cfg.sample_packing_bin_size is not None: + training_arguments_kwargs["sample_packing_bin_size"] = ( + self.cfg.sample_packing_bin_size + ) + if self.cfg.sample_packing_group_size is not None: + training_arguments_kwargs["sample_packing_group_size"] = ( + self.cfg.sample_packing_group_size + ) + if self.cfg.sample_packing_eff_est: + training_arguments_kwargs["sample_packing_efficiency"] = ( + self.cfg.sample_packing_eff_est + ) + + if self.cfg.relora and self.cfg.jagged_restart_steps: + if self.cfg.relora_prune_ratio: + training_arguments_kwargs["relora_prune_ratio"] = ( + self.cfg.relora_prune_ratio + ) + + if self.cfg.jagged_restart_steps: + training_arguments_kwargs["jagged_restart_steps"] = ( + self.cfg.jagged_restart_steps + ) + if self.cfg.jagged_restart_warmup_steps: + training_arguments_kwargs["jagged_restart_warmup_steps"] = ( + self.cfg.jagged_restart_warmup_steps + ) + if self.cfg.jagged_restart_anneal_steps: + training_arguments_kwargs["jagged_restart_anneal_steps"] = ( + self.cfg.jagged_restart_anneal_steps + ) + + if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers: + training_arguments_kwargs["lisa_n_layers"] = self.cfg.lisa_n_layers + training_arguments_kwargs["lisa_step_interval"] = ( + self.cfg.lisa_step_interval + ) + training_arguments_kwargs["lisa_layers_attribute"] = ( + self.cfg.lisa_layers_attribute + ) + + training_arguments_kwargs = self.hook_pre_create_training_args( + training_arguments_kwargs + ) + training_arguments_kwargs["model_type"] = self.cfg.model_config_type + training_arguments_kwargs["pretraining"] = bool(self.cfg.pretraining_dataset) + if self.cfg.chat_template: + training_arguments_kwargs["chat_template"] = get_chat_template_from_config( + cfg=self.cfg, + tokenizer=self.tokenizer, + ) + + if self.cfg.neftune_noise_alpha is not None: + training_arguments_kwargs["neftune_noise_alpha"] = ( + self.cfg.neftune_noise_alpha + ) + + if self.cfg.image_size: + training_arguments_kwargs["image_size"] = self.cfg.image_size + if self.cfg.image_resize_algorithm: + training_arguments_kwargs["image_resize_algorithm"] = ( + self.cfg.image_resize_algorithm + ) + + if self.cfg.plugins: + plugin_manager = PluginManager.get_instance() + plugin_training_args = plugin_manager.get_training_args(self.cfg) + if plugin_training_args: + training_arguments_kwargs.update(plugin_training_args) + + if self.cfg.reward_model: + 
training_args_cls = AxolotlRewardConfig + elif self.cfg.process_reward_model: + training_args_cls = AxolotlPRMConfig + else: + training_args_cls = AxolotlTrainingArguments + training_args = training_args_cls( # pylint: disable=unexpected-keyword-arg + **training_arguments_kwargs, + ) + training_args = self.hook_post_create_training_args(training_args) + + # unset run_name so wandb sets up experiment names + if self.cfg.use_wandb and training_args.run_name == training_args.output_dir: + training_args.run_name = ( # pylint: disable=attribute-defined-outside-init + None + ) + + data_collator_kwargs = { + "padding": True, # True/"longest" is the default + } + multiple = 64 + if self.cfg.pad_to_sequence_len: + data_collator_kwargs["pad_to_multiple_of"] = multiple * math.ceil( + self.cfg.sequence_len / multiple + ) + elif self.cfg.pad_to_sequence_len is None: + # A100 is best at 64, while others at 8. Let's use the larger so we don't have to check + # https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html + data_collator_kwargs["pad_to_multiple_of"] = multiple + + trainer_cls = self._get_trainer_cls() + + trainer_kwargs, trainer_cls = self.hook_pre_create_trainer( + trainer_kwargs, trainer_cls + ) + if eval_data_collator := self.build_collator( + training_args, is_eval=True, **data_collator_kwargs + ): + if not (self.cfg.reward_model or self.cfg.process_reward_model): + trainer_kwargs["eval_data_collator"] = eval_data_collator + if not (self.cfg.reward_model or self.cfg.process_reward_model): + trainer_kwargs["bench_data_collator"] = transformers.DataCollatorForSeq2Seq( + self.tokenizer, + return_tensors="pt", + **data_collator_kwargs, + ) + sig = inspect.signature(trainer_cls) + if "processing_class" in sig.parameters: + trainer_kwargs["processing_class"] = self.tokenizer + elif "tokenizer" in sig.parameters: + trainer_kwargs["tokenizer"] = self.tokenizer + if ( + trainer_cls not in [AxolotlRewardTrainer, AxolotlPRMTrainer] + and self.cfg.datasets is not None + ): + trainer_kwargs["dataset_tags"] = [ + d["path"] for d in self.cfg.datasets if not Path(d["path"]).is_dir() + ] + trainer = trainer_cls( + model=self.model, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + args=training_args, + data_collator=self.build_collator(training_args, **data_collator_kwargs), + callbacks=self.get_callbacks(), + **trainer_kwargs, + ) + trainer = self.hook_post_create_trainer(trainer) + for callback in self.get_post_trainer_create_callbacks(trainer): + trainer.add_callback(callback) + + if self.cfg.deepspeed and self.cfg.sample_packing: + trainer.accelerator.state.deepspeed_plugin.deepspeed_config[ + "train_micro_batch_size_per_gpu" + ] = self.cfg.micro_batch_size + + return trainer + + def build_collator( + self, + training_args, # type: "AxolotlTrainingArguments" # type: ignore + is_eval=False, + **kwargs, + ): + if training_args.pretraining: + if ( + self.cfg.pretraining_sample_concatenation is False + or self.cfg.micro_batch_size > 1 + ): + return DataCollatorForSeq2Seq(self.tokenizer, **kwargs) + if not (self.cfg.sample_packing and self.cfg.pretrain_multipack_attn): + return None + + if self.cfg.model_config_type == "mamba": + return MambaDataCollator(tokenizer=self.tokenizer) + + use_batch_sampler_collator = False + if is_eval is False and training_args.sample_packing: + use_batch_sampler_collator = True + if is_eval and training_args.eval_sample_packing: + use_batch_sampler_collator = True + + collator: Type[ + Union[ + 
V2BatchSamplerDataCollatorForSeq2Seq, + BatchSamplerDataCollatorForSeq2Seq, + DataCollatorForSeq2Seq, + DataCollatorWithFlattening, + RewardDataCollatorWithPadding, + ] + ] + collator_args = [self.tokenizer] + + collator_cls_and_kwargs = None + if self.cfg.plugins: + plugin_manager = PluginManager.get_instance() + collator_cls_and_kwargs = plugin_manager.get_collator_cls_and_kwargs( + self.cfg, is_eval=is_eval + ) + + if collator_cls_and_kwargs: + collator = collator_cls_and_kwargs[0] + if kwargs and isinstance(kwargs, dict): + kwargs.update(collator_cls_and_kwargs[1]) + elif self.cfg.reward_model: + collator = RewardDataCollatorWithPadding + elif use_batch_sampler_collator: + # Use V2BatchSamplerDataCollatorForSeq2Seq for flex attention, + # supported multipack models, or non-flash-attention llama + if ( + self.cfg.flex_attention + or self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES + or ( + self.cfg.model_config_type in ["llama"] + and self.cfg.flash_attention is not True + ) + ): + collator = V2BatchSamplerDataCollatorForSeq2Seq + else: + collator = BatchSamplerDataCollatorForSeq2Seq + else: + if self.cfg.processor_type and self.processor: + collator = MultiModalChatDataCollator + kwargs["processing_strategy"] = get_processing_strategy( + self.processor, + training_args.chat_template, + self.cfg.chat_template, + image_size=training_args.image_size, + image_resize_algorithm=training_args.image_resize_algorithm, + ) + elif self.cfg.batch_flattening: + collator = DataCollatorWithFlattening + collator_args.pop(0) + kwargs.pop("pad_to_multiple_of", None) + kwargs.pop("padding", None) + else: + collator = DataCollatorForSeq2Seq + + kwargs["return_tensors"] = "pt" + + return collator( + *collator_args, + **kwargs, + ) diff --git a/src/axolotl/core/builders/rl.py b/src/axolotl/core/builders/rl.py new file mode 100644 index 000000000..bc7816807 --- /dev/null +++ b/src/axolotl/core/builders/rl.py @@ -0,0 +1,231 @@ +"""Builder for RLHF trainers""" + +import inspect +from pathlib import Path + +from axolotl.core.builders.base import TrainerBuilderBase +from axolotl.core.trainers import ( + AxolotlCPOTrainer, + AxolotlKTOTrainer, + AxolotlORPOTrainer, +) +from axolotl.core.trainers.dpo import DPOStrategy +from axolotl.core.trainers.dpo.args import AxolotlDPOConfig +from axolotl.core.trainers.grpo import GRPOStrategy +from axolotl.integrations.base import PluginManager +from axolotl.loaders.utils import ensure_dtype +from axolotl.utils.callbacks.qat import QATCallback +from axolotl.utils.import_helper import get_cls_from_module_str +from axolotl.utils.logging import get_logger +from axolotl.utils.schemas.enums import RLType + +LOG = get_logger(__name__) + + +class HFRLTrainerBuilder(TrainerBuilderBase): + """Trainer factory class for TRL-based RLHF trainers (e.g. 
DPO)""" + + def get_callbacks(self): + callbacks = super().get_callbacks() + + if self.cfg.qat: + callbacks.append(QATCallback(self.cfg.qat)) + + return callbacks + + def get_post_trainer_create_callbacks(self, trainer): + callbacks = super().get_post_trainer_create_callbacks(trainer=trainer) + return callbacks + + def _get_trainer_cls(self, trainer_kwargs: dict): + """ + Returns trainer_cls and trainer_cls_args + """ + if self.cfg.plugins: + plugin_manager = PluginManager.get_instance() + trainer_cls = plugin_manager.get_trainer_cls(self.cfg) + trainer_cls_args = [] # type: ignore + + if trainer_cls is not None: + return trainer_cls, trainer_cls_args + + trainer_cls = None + trainer_cls_args = [self.model] + + if self.cfg.rl is RLType.GRPO: + trainer_cls = GRPOStrategy.get_trainer_class( + sequence_parallel=self.cfg.context_parallel_size > 1 + ) + trainer_cls_args.extend(GRPOStrategy.set_trainer_args(self.cfg)) + + trainer_kwargs.update(GRPOStrategy.set_trainer_kwargs(self.cfg)) + + elif self.cfg.rl in [RLType.DPO, RLType.IPO]: + trainer_cls = DPOStrategy.get_trainer_class() + trainer_cls_args.append(self.model_ref) + + elif self.cfg.rl is RLType.ORPO: + trainer_cls = AxolotlORPOTrainer + elif self.cfg.rl is RLType.KTO: + trainer_cls = AxolotlKTOTrainer + elif self.cfg.rl is RLType.SIMPO: + trainer_cls = AxolotlCPOTrainer + else: + raise ValueError(f"Unsupported RL: {self.cfg.rl}") + + if self.cfg.trainer_cls: + # override the trainer cls + try: + trainer_cls = get_cls_from_module_str(self.cfg.trainer_cls) + LOG.debug(f"Using custom trainer class: {self.cfg.trainer_cls}") + except (ImportError, AttributeError, ValueError) as e: + raise ValueError( + f"Failed to load custom trainer class '{self.cfg.trainer_cls}': {e}" + ) from e + + return trainer_cls, trainer_cls_args + + def _build_training_arguments(self, total_num_steps): + """ + Returns training_args and trainer_kwargs + """ + from axolotl.core.training_args import ( + AxolotlCPOConfig, + AxolotlKTOConfig, + AxolotlORPOConfig, + ) + + training_args_kwargs, trainer_kwargs = self._set_base_training_args( + total_num_steps=total_num_steps + ) + + if self.cfg.remove_unused_columns is not None: + training_args_kwargs["remove_unused_columns"] = ( + self.cfg.remove_unused_columns + ) + else: + training_args_kwargs["remove_unused_columns"] = False + + if self.cfg.trl and self.cfg.trl.beta is not None: + training_args_kwargs["beta"] = self.cfg.trl.beta + elif self.cfg.rl_beta is not None: + training_args_kwargs["beta"] = self.cfg.rl_beta + elif self.cfg.orpo_alpha is not None: + # trl does some odd mapping of alpha to beta to reuse the beta parameter ??? 
+ training_args_kwargs["beta"] = self.cfg.orpo_alpha + + if self.cfg.rpo_alpha is not None: + training_args_kwargs["rpo_alpha"] = self.cfg.rpo_alpha + + if self.cfg.use_wandb: + training_args_kwargs["run_name"] = self.cfg.wandb_name + + training_args_cls = None + blocklist_args_kwargs = [] + if self.cfg.rl is RLType.SIMPO: + training_args_cls = AxolotlCPOConfig + training_args_kwargs["loss_type"] = "simpo" + training_args_kwargs["simpo_gamma"] = self.cfg.simpo_gamma + if self.cfg.cpo_alpha is not None: + training_args_kwargs["cpo_alpha"] = self.cfg.cpo_alpha + + elif self.cfg.rl is RLType.ORPO: + training_args_cls = AxolotlORPOConfig + if self.cfg.max_prompt_len: + training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len + + elif self.cfg.rl is RLType.KTO: + training_args_cls = AxolotlKTOConfig + + training_args_kwargs["desirable_weight"] = ( + self.cfg.kto_desirable_weight or 1.0 + ) + training_args_kwargs["undesirable_weight"] = ( + self.cfg.kto_undesirable_weight or 1.0 + ) + + if self.cfg.max_prompt_len: + training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len + + elif self.cfg.rl is RLType.GRPO: + training_args_cls = GRPOStrategy.get_training_args_class() + training_args_kwargs.update(GRPOStrategy.set_training_args_kwargs(self.cfg)) + blocklist_args_kwargs = GRPOStrategy.get_blocklist_args_kwargs() + + elif self.cfg.rl in [RLType.DPO, RLType.IPO]: + training_args_cls = AxolotlDPOConfig + training_args_kwargs.update(DPOStrategy.set_training_args_kwargs(self.cfg)) + else: + raise ValueError(f"Unsupported RL: {self.cfg.rl}") + + for blocklist_key in blocklist_args_kwargs: + if blocklist_key in training_args_kwargs: + del training_args_kwargs[blocklist_key] + + if self.cfg.plugins: + plugin_manager = PluginManager.get_instance() + plugin_training_args = plugin_manager.get_training_args(self.cfg) + if plugin_training_args: + training_args_kwargs.update(plugin_training_args) + + training_args = training_args_cls( # pylint: disable=unexpected-keyword-arg + logging_first_step=True, + **training_args_kwargs, + ) + + # unset run_name so wandb sets up experiment names + if self.cfg.use_wandb and training_args.run_name == training_args.output_dir: + training_args.run_name = ( # pylint: disable=attribute-defined-outside-init + None + ) + + return training_args, trainer_kwargs + + def build(self, total_num_steps): + training_args, trainer_kwargs = self._build_training_arguments(total_num_steps) + + if self.eval_dataset: + trainer_kwargs["eval_dataset"] = self.eval_dataset + if self.cfg.adapter and self.peft_config and self.cfg.rl is not RLType.GRPO: + trainer_kwargs["peft_config"] = self.peft_config + if self.cfg.precompute_ref_log_probs is not None: + trainer_kwargs["precompute_ref_log_probs"] = ( + self.cfg.precompute_ref_log_probs + ) + + trainer_cls, trainer_cls_args = self._get_trainer_cls(trainer_kwargs) + + sig = inspect.signature(trainer_cls) + if "tokenizer" in sig.parameters: + trainer_kwargs["tokenizer"] = self.tokenizer + else: + trainer_kwargs["processing_class"] = self.tokenizer + + if self.cfg.datasets is not None and ( + trainer_cls is DPOStrategy.get_trainer_class() + ): + trainer_kwargs["dataset_tags"] = [ + d["path"] for d in self.cfg.datasets if not Path(d["path"]).is_dir() + ] + + trainer_kwargs, trainer_cls = self.hook_pre_create_trainer( + trainer_kwargs, trainer_cls + ) + + trainer = trainer_cls( + *trainer_cls_args, + args=training_args, + train_dataset=self.train_dataset, + callbacks=self.get_callbacks(), + **trainer_kwargs, + ) + if 
self.cfg.fsdp_config or self.cfg.fsdp: + ensure_dtype(trainer.model, dtype=self.cfg.torch_dtype) + if self.cfg.rl in [RLType.DPO, RLType.IPO] and trainer.ref_model: + ensure_dtype(trainer.ref_model, dtype=self.cfg.torch_dtype) + + trainer = self.hook_post_create_trainer(trainer) + for callback in self.get_post_trainer_create_callbacks(trainer): + trainer.add_callback(callback) + + return trainer diff --git a/src/axolotl/core/chat/messages.py b/src/axolotl/core/chat/messages.py index 88ff2b7ad..923b177c1 100644 --- a/src/axolotl/core/chat/messages.py +++ b/src/axolotl/core/chat/messages.py @@ -156,7 +156,6 @@ class Messages(BaseModel): len(input_ids) : len(input_ids) + len(pending_input_ids) ] if new_pending_inputs != pending_input_ids: - # logging.warning("tokenization mismatch from concatenation.") pending_input_ids = new_pending_inputs input_ids.extend(pending_input_ids) if pending_weight: diff --git a/src/axolotl/core/datasets/chat.py b/src/axolotl/core/datasets/chat.py index 724f12866..a4dc300d9 100644 --- a/src/axolotl/core/datasets/chat.py +++ b/src/axolotl/core/datasets/chat.py @@ -2,7 +2,6 @@ chat dataset module """ -import os from typing import Callable, Optional, Union from datasets import Dataset @@ -41,14 +40,10 @@ class TokenizedChatDataset(Dataset): ) return ex.tokenized(model_transform) - process_or_cpu_count: int = ( - process_count or os.cpu_count() # type: ignore[assignment] - ) - num_proc = min(32, process_or_cpu_count) features = data.features.keys() tokenized_data = data.map( map_fn, - num_proc=num_proc, + num_proc=process_count, keep_in_memory=keep_in_memory, remove_columns=features, desc="Tokenizing Chats", diff --git a/src/axolotl/core/trainer_builder.py b/src/axolotl/core/trainer_builder.py deleted file mode 100755 index 9709f0fd4..000000000 --- a/src/axolotl/core/trainer_builder.py +++ /dev/null @@ -1,1248 +0,0 @@ -# Copyright 2024 Axolotl AI. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# pylint: disable=too-many-lines -"""Builder for the training args and trainer""" - -import abc -import importlib -import importlib.util -import inspect -import logging -import math -import os -import sys -from abc import abstractmethod -from pathlib import Path -from typing import List, Type, Union - -import torch -import transformers -from transformers import ( - DataCollatorWithFlattening, - EarlyStoppingCallback, - TrainerCallback, -) -from transformers.training_args import OptimizerNames -from trl.trainer.utils import RewardDataCollatorWithPadding - -from axolotl.core.trainers import ( - AxolotlCPOTrainer, - AxolotlKTOTrainer, - AxolotlMambaTrainer, - AxolotlORPOTrainer, - AxolotlPRMTrainer, - AxolotlRewardTrainer, - AxolotlTrainer, - ReLoRATrainer, -) -from axolotl.core.trainers.dpo import DPOStrategy -from axolotl.core.trainers.dpo.args import AxolotlDPOConfig -from axolotl.core.trainers.grpo import GRPOStrategy -from axolotl.core.training_args import ( - AxolotlCPOConfig, - AxolotlKTOConfig, - AxolotlORPOConfig, - AxolotlPRMConfig, - AxolotlRewardConfig, - AxolotlTrainingArguments, -) -from axolotl.integrations.base import PluginManager -from axolotl.loaders.utils import ensure_dtype -from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES -from axolotl.monkeypatch.relora import ReLoRACallback -from axolotl.monkeypatch.trainer.lr import patch_trainer_get_lr -from axolotl.processing_strategies import get_processing_strategy -from axolotl.utils import is_comet_available, is_mlflow_available -from axolotl.utils.callbacks import ( - EvalFirstStepCallback, - GCCallback, - GPUStatsCallback, - LossWatchDogCallback, - SaveAxolotlConfigtoWandBCallback, - SaveBetterTransformerModelCallback, - bench_eval_callback_factory, - causal_lm_bench_eval_callback_factory, - colab_inference_post_train_callback, - log_prediction_callback_factory, -) -from axolotl.utils.callbacks.lisa import lisa_callback_factory -from axolotl.utils.callbacks.profiler import PytorchProfilerCallback -from axolotl.utils.chat_templates import get_chat_template_from_config -from axolotl.utils.collators import ( - BatchSamplerDataCollatorForSeq2Seq, - DataCollatorForSeq2Seq, - MambaDataCollator, - V2BatchSamplerDataCollatorForSeq2Seq, -) -from axolotl.utils.collators.mm_chat import MultiModalChatDataCollator -from axolotl.utils.schemas.enums import CustomSupportedOptimizers, RLType - -try: - import torch._dynamo # pylint: disable=ungrouped-imports -except ImportError: - pass - -LOG = logging.getLogger(__name__) - - -class TrainerBuilderBase(abc.ABC): - """Base class for trainer builder.""" - - _train_dataset = None - _eval_dataset = None - _model_ref = None - _peft_config = None - - def __init__(self, cfg, model, tokenizer, processor=None): - self.cfg = cfg - self.model = model - self.tokenizer = tokenizer - self.processor = processor - - # If the model supports tagging, add the axolotl tag. - # This makes sure the tag is correctly pushed even if a user calls - # model.push_to_hub instead of trainer.push_to_hub. 
- if hasattr(model, "add_model_tags"): - model.add_model_tags(["axolotl"]) - - patch_trainer_get_lr() - - @property - def model_ref(self): - return self._model_ref - - @model_ref.setter - def model_ref(self, model): - self._model_ref = model - - @property - def train_dataset(self): - return self._train_dataset - - @train_dataset.setter - def train_dataset(self, dataset): - self._train_dataset = dataset - - @property - def eval_dataset(self): - return self._eval_dataset - - @eval_dataset.setter - def eval_dataset(self, dataset): - self._eval_dataset = dataset - - @property - def peft_config(self): - return self._peft_config - - @peft_config.setter - def peft_config(self, peft_config): - self._peft_config = peft_config - - @abstractmethod - def build(self, total_num_steps): - pass - - def get_callbacks(self) -> List[TrainerCallback]: - callbacks = [] - - plugin_manager = PluginManager.get_instance() - callbacks.extend( - plugin_manager.add_callbacks_pre_trainer(cfg=self.cfg, model=self.model) - ) - - if self.cfg.profiler_steps: - callbacks.append( - PytorchProfilerCallback( - steps_to_profile=self.cfg.profiler_steps, - ) - ) - - if self.cfg.gc_steps: - callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps)) - - if self.cfg.use_wandb: - callbacks.append( - SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path) - ) - if self.cfg.use_mlflow and is_mlflow_available(): - from axolotl.utils.callbacks.mlflow_ import ( - SaveAxolotlConfigtoMlflowCallback, - ) - - callbacks.extend( - [ - SaveAxolotlConfigtoMlflowCallback(self.cfg.axolotl_config_path), - ] - ) - if self.cfg.use_comet and is_comet_available(): - from axolotl.utils.callbacks.comet_ import SaveAxolotlConfigtoCometCallback - - callbacks.append( - SaveAxolotlConfigtoCometCallback(self.cfg.axolotl_config_path) - ) - - return callbacks - - def get_post_trainer_create_callbacks(self, trainer): - """ - Callbacks added after the trainer is created, usually b/c these need access to the trainer - """ - callbacks = [] - if self.cfg.plugins: - plugin_manager = PluginManager.get_instance() - callbacks.extend( - [ - cb - for cb in plugin_manager.add_callbacks_post_trainer( - self.cfg, trainer - ) - if cb - ] - ) - return callbacks - - def hook_pre_create_training_args(self, training_arguments_kwargs): - # TODO - return training_arguments_kwargs - - def hook_post_create_training_args(self, training_arguments): - # TODO - return training_arguments - - def hook_pre_create_trainer(self, trainer_kwargs, trainer_cls): - # TODO - return trainer_kwargs, trainer_cls - - def hook_post_create_trainer(self, trainer): - # TODO - return trainer - - -class HFCausalTrainerBuilder(TrainerBuilderBase): - """ - Build the HuggingFace training args/trainer for causal models and reward modeling - using TRL. 
- """ - - def get_callbacks(self): - callbacks = super().get_callbacks() - callbacks.append(GPUStatsCallback(self.cfg)) - callbacks.append(EvalFirstStepCallback()) - - if self.cfg.relora_steps: - callbacks.append(ReLoRACallback(self.cfg)) - - if ( - hasattr(self.model, "use_bettertransformer") - and self.model.use_bettertransformer is True - ): - callbacks.append(SaveBetterTransformerModelCallback()) - - if self.cfg.loss_watchdog_threshold is not None: - callbacks.append(LossWatchDogCallback(self.cfg)) - - return callbacks - - def get_post_trainer_create_callbacks(self, trainer): - callbacks = [] - if self.cfg.use_wandb and self.cfg.eval_table_size > 0: - LogPredictionCallback = log_prediction_callback_factory( - trainer, self.tokenizer, "wandb" - ) - callbacks.append(LogPredictionCallback(self.cfg)) - if ( - self.cfg.use_mlflow - and is_mlflow_available() - and self.cfg.eval_table_size > 0 - ): - LogPredictionCallback = log_prediction_callback_factory( - trainer, self.tokenizer, "mlflow" - ) - callbacks.append(LogPredictionCallback(self.cfg)) - if self.cfg.use_comet and is_comet_available() and self.cfg.eval_table_size > 0: - LogPredictionCallback = log_prediction_callback_factory( - trainer, self.tokenizer, "comet_ml" - ) - callbacks.append(LogPredictionCallback(self.cfg)) - - if self.cfg.do_bench_eval: - callbacks.append(bench_eval_callback_factory(trainer, self.tokenizer)) - if self.cfg.do_causal_lm_eval: - CausalLMBenchEvalCallback = causal_lm_bench_eval_callback_factory( - trainer, self.tokenizer - ) - callbacks.append(CausalLMBenchEvalCallback(self.cfg)) - - if self.cfg.early_stopping_patience: - early_stop_cb = EarlyStoppingCallback( - self.cfg.early_stopping_patience, - ) - callbacks.append(early_stop_cb) - - if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers: - callbacks.append(lisa_callback_factory(trainer)) - - if any("COLAB_" in key for key in os.environ): - ColabCallback = colab_inference_post_train_callback(trainer) - callbacks.append(ColabCallback(self.cfg)) - - callbacks.extend(super().get_post_trainer_create_callbacks(trainer=trainer)) - return callbacks - - def _get_trainer_cls(self): - if self.cfg.plugins: - plugin_manager = PluginManager.get_instance() - trainer_cls = plugin_manager.get_trainer_cls(self.cfg) - if trainer_cls: - return trainer_cls - if self.cfg.relora_steps: - return ReLoRATrainer - if self.cfg.model_config_type == "mamba": - return AxolotlMambaTrainer - if self.cfg.reward_model: - return AxolotlRewardTrainer - if self.cfg.process_reward_model: - return AxolotlPRMTrainer - return AxolotlTrainer - - def build(self, total_num_steps): - warmup_steps = None - if self.cfg.warmup_steps is not None: - warmup_steps = self.cfg.warmup_steps - elif self.cfg.warmup_ratio is not None: - warmup_steps = max(int(self.cfg.warmup_ratio * total_num_steps), 0) - else: - warmup_steps = min(int(0.03 * total_num_steps), 100) - if warmup_steps == 1: - warmup_steps = 2 - - logging_steps = ( - self.cfg.logging_steps - if self.cfg.logging_steps is not None - else max(min(int(0.005 * total_num_steps), 10), 1) - ) - - training_arguments_kwargs = {} - - if self.cfg.include_tokens_per_second is not None: - training_arguments_kwargs["include_tokens_per_second"] = ( - self.cfg.include_tokens_per_second - ) - - if self.cfg.bf16 == "full": - training_arguments_kwargs["bf16_full_eval"] = True - else: - training_arguments_kwargs["bf16"] = self.cfg.bf16 - training_arguments_kwargs["fp16"] = ( - self.cfg.fp16 and not self.cfg.bf16 - ) or False - training_arguments_kwargs["tf32"] = 
self.cfg.tf32 - training_arguments_kwargs["warmup_steps"] = warmup_steps - training_arguments_kwargs["logging_steps"] = logging_steps - - if self.cfg.seed is not None: - training_arguments_kwargs["seed"] = self.cfg.seed - - if self.cfg.gradient_checkpointing: - training_arguments_kwargs["gradient_checkpointing"] = ( - self.cfg.gradient_checkpointing - ) - if self.cfg.gradient_checkpointing_kwargs is not None: - training_arguments_kwargs["gradient_checkpointing_kwargs"] = ( - self.cfg.gradient_checkpointing_kwargs - ) - if self.cfg.fsdp: - training_arguments_kwargs["fsdp"] = self.cfg.fsdp - if self.cfg.fsdp_config: - training_arguments_kwargs["fsdp_config"] = { - k.lstrip("fsdp_"): v for k, v in dict(self.cfg.fsdp_config).items() - } - - if self.cfg.adapter == "qlora": - training_arguments_kwargs["qlora"] = True - - # deepspeed - if self.cfg.deepspeed: - training_arguments_kwargs["deepspeed"] = self.cfg.deepspeed - - if self.cfg.lr_quadratic_warmup is not None: - training_arguments_kwargs["lr_quadratic_warmup"] = ( - self.cfg.lr_quadratic_warmup - ) - - if self.cfg.adam_beta1: - training_arguments_kwargs["adam_beta1"] = self.cfg.adam_beta1 - if self.cfg.adam_beta2: - training_arguments_kwargs["adam_beta2"] = self.cfg.adam_beta2 - if self.cfg.adam_beta3: - training_arguments_kwargs["adam_beta3"] = self.cfg.adam_beta3 - if self.cfg.adam_epsilon: - training_arguments_kwargs["adam_epsilon"] = self.cfg.adam_epsilon - if self.cfg.adam_epsilon2: - training_arguments_kwargs["adam_epsilon2"] = self.cfg.adam_epsilon2 - if self.cfg.max_grad_norm: - training_arguments_kwargs["max_grad_norm"] = self.cfg.max_grad_norm - - if self.cfg.hub_model_id: - training_arguments_kwargs["hub_model_id"] = self.cfg.hub_model_id - training_arguments_kwargs["push_to_hub"] = True - training_arguments_kwargs["hub_private_repo"] = True - training_arguments_kwargs["hub_always_push"] = True - - if self.cfg.hub_strategy: - training_arguments_kwargs["hub_strategy"] = self.cfg.hub_strategy - - if self.cfg.save_safetensors is not None: - training_arguments_kwargs["save_safetensors"] = self.cfg.save_safetensors - - if self.cfg.dataloader_pin_memory is not None: - training_arguments_kwargs["dataloader_pin_memory"] = ( - self.cfg.dataloader_pin_memory - ) - if self.cfg.dataloader_num_workers is not None: - training_arguments_kwargs["dataloader_num_workers"] = ( - self.cfg.dataloader_num_workers - ) - if self.cfg.dataloader_prefetch_factor is not None: - training_arguments_kwargs["dataloader_prefetch_factor"] = ( - self.cfg.dataloader_prefetch_factor - ) - if self.cfg.dataloader_drop_last is not None: - training_arguments_kwargs["dataloader_drop_last"] = ( - self.cfg.dataloader_drop_last - ) - elif self.cfg.sample_packing and self.cfg.eval_sample_packing is False: - training_arguments_kwargs["dataloader_drop_last"] = True - - if self.cfg.remove_unused_columns is not None: - training_arguments_kwargs["remove_unused_columns"] = ( - self.cfg.remove_unused_columns - ) - - if not self.cfg.test_datasets and self.cfg.val_set_size == 0: - # no eval set, so don't eval - training_arguments_kwargs["eval_strategy"] = "no" - elif self.cfg.eval_steps: - training_arguments_kwargs["eval_strategy"] = "steps" - training_arguments_kwargs["eval_steps"] = self.cfg.eval_steps - elif self.cfg.eval_strategy: - training_arguments_kwargs["eval_strategy"] = self.cfg.eval_strategy - else: - # we have an eval set, but no steps defined, default to use epoch - training_arguments_kwargs["eval_strategy"] = "epoch" - - if self.cfg.save_steps: - 
training_arguments_kwargs["save_strategy"] = "steps" - training_arguments_kwargs["save_steps"] = self.cfg.save_steps - elif self.cfg.save_strategy: - training_arguments_kwargs["save_strategy"] = self.cfg.save_strategy - else: - # default to saving each epoch if not defined - training_arguments_kwargs["save_strategy"] = "epoch" - - training_arguments_kwargs["save_only_model"] = self.cfg.save_only_model - - if self.cfg.do_bench_eval: - training_arguments_kwargs["do_bench_eval"] = self.cfg.do_bench_eval - if self.cfg.bench_dataset: - training_arguments_kwargs["bench_dataset"] = self.cfg.bench_dataset - if self.cfg.do_causal_lm_eval: - training_arguments_kwargs["do_causal_lm_eval"] = self.cfg.do_causal_lm_eval - if self.cfg.metric_for_best_model: - training_arguments_kwargs["metric_for_best_model"] = ( - self.cfg.metric_for_best_model - ) - if self.cfg.greater_is_better: - training_arguments_kwargs["greater_is_better"] = self.cfg.greater_is_better - - if self.cfg.torch_compile: - if torch.__version__ < "2.1.0": # pylint: disable=protected-access - LOG.warning("torch>=2.1.0 required for torch_compile to work properly") - elif torch._dynamo: # pylint: disable=protected-access - torch._dynamo.config.suppress_errors = ( # pylint: disable=protected-access - True - ) - training_arguments_kwargs["torch_compile"] = self.cfg.torch_compile - if self.cfg.torch_compile_backend: - training_arguments_kwargs["torch_compile_backend"] = ( - self.cfg.torch_compile_backend - ) - if self.cfg.torch_compile_mode: - training_arguments_kwargs["torch_compile_mode"] = ( - self.cfg.torch_compile_mode - ) - - # DDP Config - if self.cfg.ddp_timeout: - training_arguments_kwargs["ddp_timeout"] = self.cfg.ddp_timeout - # see https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html - if self.cfg.ddp_bucket_cap_mb: - training_arguments_kwargs["ddp_bucket_cap_mb"] = self.cfg.ddp_bucket_cap_mb - if self.cfg.ddp_broadcast_buffers is not None: - training_arguments_kwargs["ddp_broadcast_buffers"] = ( - self.cfg.ddp_broadcast_buffers - ) - - # these are all the "standard" kwargs that are def used - training_arguments_kwargs["max_steps"] = ( - self.cfg.max_steps if self.cfg.max_steps else -1 - ) - training_arguments_kwargs["max_seq_length"] = self.cfg.sequence_len - training_arguments_kwargs["per_device_train_batch_size"] = ( - self.cfg.micro_batch_size - ) - if self.cfg.eval_batch_size: - training_arguments_kwargs["per_device_eval_batch_size"] = ( - self.cfg.eval_batch_size - ) - if self.cfg.auto_find_batch_size is not None: - training_arguments_kwargs["auto_find_batch_size"] = ( - self.cfg.auto_find_batch_size - ) - training_arguments_kwargs["gradient_accumulation_steps"] = ( - self.cfg.gradient_accumulation_steps - ) - training_arguments_kwargs["eval_accumulation_steps"] = ( - self.cfg.gradient_accumulation_steps - ) - training_arguments_kwargs["num_train_epochs"] = self.cfg.num_epochs - training_arguments_kwargs["learning_rate"] = self.cfg.learning_rate - training_arguments_kwargs["output_dir"] = self.cfg.output_dir - training_arguments_kwargs["save_total_limit"] = ( - self.cfg.save_total_limit if self.cfg.save_total_limit else 4 - ) - training_arguments_kwargs["load_best_model_at_end"] = ( - ( - self.cfg.load_best_model_at_end is not False - or self.cfg.early_stopping_patience - ) - and ( - (not self.cfg.test_datasets and self.cfg.val_set_size > 0) - or (self.cfg.test_datasets and self.cfg.val_set_size == 0) - ) - and self.cfg.save_steps - and self.cfg.eval_steps - and self.cfg.save_steps % 
self.cfg.eval_steps == 0 - ) or False - - # handle ddp - ddp_find_unused_parameters = None - if self.cfg.ddp: - ddp_find_unused_parameters = bool(self.cfg.ddp_find_unused_parameters) - training_arguments_kwargs["ddp_find_unused_parameters"] = ( - ddp_find_unused_parameters - ) - - training_arguments_kwargs["group_by_length"] = self.cfg.group_by_length - training_arguments_kwargs["curriculum_sampling"] = self.cfg.curriculum_sampling - report_to = [] - if self.cfg.use_wandb: - report_to.append("wandb") - if self.cfg.use_mlflow: - report_to.append("mlflow") - if self.cfg.use_tensorboard: - report_to.append("tensorboard") - if self.cfg.use_comet: - report_to.append("comet_ml") - - training_arguments_kwargs["report_to"] = report_to - if self.cfg.use_wandb: - training_arguments_kwargs["run_name"] = self.cfg.wandb_name - elif self.cfg.use_mlflow: - training_arguments_kwargs["run_name"] = self.cfg.mlflow_run_name - else: - training_arguments_kwargs["run_name"] = None - - if self.cfg.lr_scheduler in ["one_cycle", "rex", "log_sweep"]: - training_arguments_kwargs["lr_scheduler_type"] = "cosine" - training_arguments_kwargs["alternate_lr_scheduler_type"] = ( - self.cfg.lr_scheduler - ) - else: - training_arguments_kwargs["lr_scheduler_type"] = ( - self.cfg.lr_scheduler if self.cfg.lr_scheduler else "cosine" - ) - training_arguments_kwargs["lr_scheduler_kwargs"] = ( - self.cfg.lr_scheduler_kwargs if self.cfg.lr_scheduler_kwargs else {} - ) - training_arguments_kwargs["cosine_min_lr_ratio"] = self.cfg.cosine_min_lr_ratio - training_arguments_kwargs["cosine_constant_lr_ratio"] = ( - self.cfg.cosine_constant_lr_ratio - ) - training_arguments_kwargs["weight_decay"] = ( - self.cfg.weight_decay if self.cfg.weight_decay is not None else 0.0 - ) - - training_arguments_kwargs["sample_packing"] = bool(self.cfg.sample_packing) - training_arguments_kwargs["multipack_real_batches"] = ( - not self.cfg.flash_attention or self.cfg.multipack_real_batches - ) - training_arguments_kwargs["eval_sample_packing"] = bool( - self.cfg.eval_sample_packing - ) - if self.cfg.sample_packing_bin_size is not None: - training_arguments_kwargs["sample_packing_bin_size"] = ( - self.cfg.sample_packing_bin_size - ) - if self.cfg.sample_packing_group_size is not None: - training_arguments_kwargs["sample_packing_group_size"] = ( - self.cfg.sample_packing_group_size - ) - if self.cfg.sample_packing_eff_est: - training_arguments_kwargs["sample_packing_efficiency"] = ( - self.cfg.sample_packing_eff_est - ) - - if self.cfg.relora_steps: - training_arguments_kwargs["relora_steps"] = self.cfg.relora_steps - training_arguments_kwargs["relora_warmup_steps"] = ( - self.cfg.relora_warmup_steps - ) - if self.cfg.relora_anneal_steps: - training_arguments_kwargs["relora_anneal_steps"] = ( - self.cfg.relora_anneal_steps - ) - if self.cfg.relora_prune_ratio: - training_arguments_kwargs["relora_prune_ratio"] = ( - self.cfg.relora_prune_ratio - ) - - if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers: - training_arguments_kwargs["lisa_n_layers"] = self.cfg.lisa_n_layers - training_arguments_kwargs["lisa_step_interval"] = ( - self.cfg.lisa_step_interval - ) - training_arguments_kwargs["lisa_layers_attribute"] = ( - self.cfg.lisa_layers_attribute - ) - - training_arguments_kwargs = self.hook_pre_create_training_args( - training_arguments_kwargs - ) - training_arguments_kwargs["model_type"] = self.cfg.model_config_type - training_arguments_kwargs["pretraining"] = bool(self.cfg.pretraining_dataset) - if self.cfg.chat_template: - 
training_arguments_kwargs["chat_template"] = get_chat_template_from_config( - cfg=self.cfg, - tokenizer=self.tokenizer, - ) - - if self.cfg.neftune_noise_alpha is not None: - training_arguments_kwargs["neftune_noise_alpha"] = ( - self.cfg.neftune_noise_alpha - ) - - trainer_kwargs = {} - - if self.cfg.reward_model: - training_arguments_kwargs["max_length"] = self.cfg.sequence_len - - # Handle custom optimizer - custom_supported_optimizers = [opt.value for opt in CustomSupportedOptimizers] - if self.cfg.optimizer in custom_supported_optimizers: - # Common optimizer kwargs - optimizer_kwargs = { - "lr": training_arguments_kwargs.get("learning_rate"), - "weight_decay": training_arguments_kwargs.get("weight_decay"), - } - - # Adam-specific kwargs - adam_kwargs = {} - if training_arguments_kwargs.get( - "adam_beta1" - ) and training_arguments_kwargs.get("adam_beta2"): - adam_kwargs["betas"] = ( - training_arguments_kwargs.get("adam_beta1"), - training_arguments_kwargs.get("adam_beta2"), - ) - if training_arguments_kwargs.get("adam_epsilon"): - adam_kwargs["eps"] = training_arguments_kwargs.get("adam_epsilon") - - if self.cfg.optimizer == "muon": - from axolotl.contribs.mit.muon import ( # pylint: disable=no-name-in-module - MuonOptimizerFactory, - ) - - optimizer_cls = MuonOptimizerFactory - optimizer_kwargs.update(adam_kwargs) - elif self.cfg.optimizer == "optimi_adamw": - from optimi import AdamW - - optimizer_kwargs["foreach"] = False - optimizer_cls = AdamW - optimizer_kwargs.update(adam_kwargs) - elif self.cfg.optimizer == "ao_adamw_4bit": - # TODO remove 20250401 - from torchao.prototype.low_bit_optim import AdamW4bit - - optimizer_cls = AdamW4bit - optimizer_kwargs.update(adam_kwargs) - - LOG.warning( - f"`ao_adamw_4bit` will be deprecated soon. Please use `{OptimizerNames.ADAMW_TORCH_4BIT}` instead." 
- ) - elif self.cfg.optimizer == "ao_adamw_8bit": - from torchao.prototype.low_bit_optim import AdamW8bit - - optimizer_cls = AdamW8bit - optimizer_kwargs.update(adam_kwargs) - elif self.cfg.optimizer == "ao_adamw_fp8": - from torchao.prototype.low_bit_optim import AdamWFp8 - - optimizer_cls = AdamWFp8 - optimizer_kwargs.update(adam_kwargs) - elif self.cfg.optimizer == "adopt_adamw": - from axolotl.utils.optimizers.adopt import ADOPT - - optimizer_cls = ADOPT - adam_kwargs["decouple"] = True - optimizer_kwargs.update(adam_kwargs) - elif self.cfg.optimizer == "came_pytorch": - from came_pytorch import CAME - - optimizer_cls = CAME - - beta1 = training_arguments_kwargs.get("adam_beta1", 0.9) - beta2 = training_arguments_kwargs.get("adam_beta2", 0.999) - beta3 = training_arguments_kwargs.get("adam_beta3", 0.9999) - eps1 = training_arguments_kwargs.get("adam_epsilon", 1e-30) - eps2 = training_arguments_kwargs.get("adam_epsilon2", 1e-16) - adam_kwargs["betas"] = (beta1, beta2, beta3) - adam_kwargs["eps"] = (eps1, eps2) - - optimizer_kwargs.update(adam_kwargs) - - # Parse any additional optimizer args from config - if self.cfg.optim_args: - if isinstance(self.cfg.optim_args, dict): - optimizer_kwargs.update(self.cfg.optim_args) - else: - # Parse string format "key1=value1,key2=value2" - for mapping in self.cfg.optim_args.replace(" ", "").split(","): - key, value = mapping.split("=") - optimizer_kwargs[key] = value - - trainer_kwargs["optimizer_cls_and_kwargs"] = ( - optimizer_cls, - optimizer_kwargs, - ) - else: - # Use transformers' optimizer - training_arguments_kwargs["optim"] = self.cfg.optimizer - - # Parse any additional optimizer args from config - if self.cfg.optim_args: - if isinstance(self.cfg.optim_args, dict): - optim_args = ",".join( - [f"{key}={value}" for key, value in self.cfg.optim_args.items()] - ) - else: - optim_args = self.cfg.optim_args - training_arguments_kwargs["optim_args"] = optim_args - - if self.cfg.optimizer == "adamw_anyprecision": - if Path(self.cfg.torchdistx_path).exists(): - sys.path.append(self.cfg.torchdistx_path) - importlib.import_module("torchdistx") - - if self.cfg.optim_target_modules: - training_arguments_kwargs["optim_target_modules"] = ( - self.cfg.optim_target_modules - ) - - training_arguments_kwargs["embedding_lr"] = self.cfg.embedding_lr - training_arguments_kwargs["embedding_lr_scale"] = self.cfg.embedding_lr_scale - - training_arguments_kwargs["loraplus_lr_ratio"] = self.cfg.loraplus_lr_ratio - training_arguments_kwargs["loraplus_lr_embedding"] = ( - self.cfg.loraplus_lr_embedding - ) - training_arguments_kwargs["lr_groups"] = self.cfg.lr_groups - - if self.cfg.accelerator_config: - training_arguments_kwargs["accelerator_config"] = ( - self.cfg.accelerator_config - ) - - if self.cfg.image_size: - training_arguments_kwargs["image_size"] = self.cfg.image_size - if self.cfg.image_resize_algorithm: - training_arguments_kwargs["image_resize_algorithm"] = ( - self.cfg.image_resize_algorithm - ) - if self.cfg.kd_ce_alpha is not None: - training_arguments_kwargs["kd_ce_alpha"] = self.cfg.kd_ce_alpha - if self.cfg.kd_alpha is not None: - training_arguments_kwargs["kd_alpha"] = self.cfg.kd_alpha - if self.cfg.kd_temperature is not None: - training_arguments_kwargs["kd_temperature"] = self.cfg.kd_temperature - if self.cfg.kd_zscore_base_temp is not None: - training_arguments_kwargs["kd_zscore_base_temp"] = ( - self.cfg.kd_zscore_base_temp - ) - if self.cfg.kd_top_k_before_softmax is not None: - training_arguments_kwargs["kd_top_k_before_softmax"] = ( - 
self.cfg.kd_top_k_before_softmax - ) - - if self.cfg.reward_model: - training_args_cls = AxolotlRewardConfig - elif self.cfg.process_reward_model: - training_args_cls = AxolotlPRMConfig - else: - training_args_cls = AxolotlTrainingArguments - training_args = training_args_cls( # pylint: disable=unexpected-keyword-arg - **training_arguments_kwargs, - ) - training_args = self.hook_post_create_training_args(training_args) - - # unset run_name so wandb sets up experiment names - if self.cfg.use_wandb and training_args.run_name == training_args.output_dir: - training_args.run_name = ( # pylint: disable=attribute-defined-outside-init - None - ) - - data_collator_kwargs = { - "padding": True, # True/"longest" is the default - } - multiple = 64 - if self.cfg.pad_to_sequence_len: - data_collator_kwargs["pad_to_multiple_of"] = multiple * math.ceil( - self.cfg.sequence_len / multiple - ) - else: - # A100 is best at 64, while others at 8. Let's use the larger so we don't have to check - # https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html - data_collator_kwargs["pad_to_multiple_of"] = multiple - - if self.cfg.reward_model: - data_collator_kwargs["max_length"] = self.cfg.sequence_len - - trainer_cls = self._get_trainer_cls() - trainer_kwargs, trainer_cls = self.hook_pre_create_trainer( - trainer_kwargs, trainer_cls - ) - if eval_data_collator := self.build_collator( - training_args, is_eval=True, **data_collator_kwargs - ): - if not (self.cfg.reward_model or self.cfg.process_reward_model): - trainer_kwargs["eval_data_collator"] = eval_data_collator - if not (self.cfg.reward_model or self.cfg.process_reward_model): - trainer_kwargs["bench_data_collator"] = transformers.DataCollatorForSeq2Seq( - self.tokenizer, - return_tensors="pt", - **data_collator_kwargs, - ) - sig = inspect.signature(trainer_cls) - if "processing_class" in sig.parameters.keys(): - trainer_kwargs["processing_class"] = self.tokenizer - else: - trainer_kwargs["tokenizer"] = self.tokenizer - if ( - not (trainer_cls in [AxolotlRewardTrainer, AxolotlPRMTrainer]) - and self.cfg.datasets is not None - ): - trainer_kwargs["dataset_tags"] = [ - d["path"] for d in self.cfg.datasets if not Path(d["path"]).is_dir() - ] - trainer = trainer_cls( - model=self.model, - train_dataset=self.train_dataset, - eval_dataset=self.eval_dataset, - args=training_args, - data_collator=self.build_collator(training_args, **data_collator_kwargs), - callbacks=self.get_callbacks(), - **trainer_kwargs, - ) - trainer = self.hook_post_create_trainer(trainer) - for callback in self.get_post_trainer_create_callbacks(trainer): - trainer.add_callback(callback) - - if self.cfg.deepspeed and self.cfg.sample_packing: - trainer.accelerator.state.deepspeed_plugin.deepspeed_config[ - "train_micro_batch_size_per_gpu" - ] = self.cfg.micro_batch_size - - return trainer - - def build_collator( - self, training_args: AxolotlTrainingArguments, is_eval=False, **kwargs - ): - if training_args.pretraining: - if ( - self.cfg.pretraining_sample_concatenation is False - or self.cfg.micro_batch_size > 1 - ): - return DataCollatorForSeq2Seq(self.tokenizer, **kwargs) - return None - - if self.cfg.model_config_type == "mamba": - return MambaDataCollator(tokenizer=self.tokenizer) - - use_batch_sampler_collator = False - if is_eval is False and training_args.sample_packing: - use_batch_sampler_collator = True - if is_eval and training_args.eval_sample_packing: - use_batch_sampler_collator = True - - collator: Type[ - Union[ - 
V2BatchSamplerDataCollatorForSeq2Seq, - BatchSamplerDataCollatorForSeq2Seq, - DataCollatorForSeq2Seq, - DataCollatorWithFlattening, - RewardDataCollatorWithPadding, - ] - ] - collator_args = [self.tokenizer] - if self.cfg.reward_model: - collator = RewardDataCollatorWithPadding - if "max_length" in kwargs: - kwargs.pop("max_length") - elif use_batch_sampler_collator: - if self.cfg.flex_attention: - collator = V2BatchSamplerDataCollatorForSeq2Seq - elif self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES: - collator = V2BatchSamplerDataCollatorForSeq2Seq - elif ( - self.cfg.model_config_type in ["llama"] - and self.cfg.flash_attention is not True - ): - collator = V2BatchSamplerDataCollatorForSeq2Seq - else: - collator = BatchSamplerDataCollatorForSeq2Seq - else: - if self.cfg.processor_type and self.processor: - collator = MultiModalChatDataCollator - kwargs["processing_strategy"] = get_processing_strategy( - self.processor, - training_args.chat_template, - self.cfg.chat_template, - image_size=training_args.image_size, - image_resize_algorithm=training_args.image_resize_algorithm, - ) - elif self.cfg.batch_flattening: - collator = DataCollatorWithFlattening - collator_args.pop(0) - kwargs.pop("pad_to_multiple_of", None) - kwargs.pop("padding", None) - elif self.cfg.kd_trainer: - from axolotl.integrations.kd.collator import ( - DataCollatorForKD, - KDBatchSamplerDataCollatorForSeq2Seq, - ) - - if self.cfg.sample_packing: - collator = KDBatchSamplerDataCollatorForSeq2Seq - else: - collator = DataCollatorForKD - else: - collator = DataCollatorForSeq2Seq - - kwargs["return_tensors"] = "pt" - - return collator( - *collator_args, - **kwargs, - ) - - -class HFRLTrainerBuilder(TrainerBuilderBase): - """Trainer factory class for TRL-based RLHF trainers (e.g. 
DPO)""" - - def get_callbacks(self): - callbacks = super().get_callbacks() - - return callbacks - - def get_post_trainer_create_callbacks(self, trainer): - callbacks = super().get_post_trainer_create_callbacks(trainer=trainer) - return callbacks - - def build_training_arguments(self, total_num_steps): - training_args_kwargs = {} - for arg in [ - "adam_beta1", - "adam_beta2", - "adam_epsilon", - "dataloader_num_workers", - "dataloader_pin_memory", - ]: - if hasattr(self.cfg, arg) and getattr(self.cfg, arg) is not None: - training_args_kwargs[arg] = getattr(self.cfg, arg) - - if self.cfg.hub_model_id: - training_args_kwargs["hub_model_id"] = self.cfg.hub_model_id - training_args_kwargs["push_to_hub"] = True - training_args_kwargs["hub_private_repo"] = True - training_args_kwargs["hub_always_push"] = True - - if self.cfg.hub_strategy: - training_args_kwargs["hub_strategy"] = self.cfg.hub_strategy - - if self.cfg.save_safetensors is not None: - training_args_kwargs["save_safetensors"] = self.cfg.save_safetensors - - if self.eval_dataset: - training_args_kwargs["eval_strategy"] = "steps" - training_args_kwargs["eval_steps"] = self.cfg.eval_steps - else: - training_args_kwargs["eval_strategy"] = "no" - - if self.cfg.bf16 or self.cfg.bfloat16: - training_args_kwargs["bf16"] = True - - training_args_kwargs["loraplus_lr_ratio"] = self.cfg.loraplus_lr_ratio - training_args_kwargs["loraplus_lr_embedding"] = self.cfg.loraplus_lr_embedding - training_args_kwargs["lr_scheduler_type"] = ( - self.cfg.lr_scheduler if self.cfg.lr_scheduler else "cosine" - ) - training_args_kwargs["lr_scheduler_kwargs"] = ( - self.cfg.lr_scheduler_kwargs if self.cfg.lr_scheduler_kwargs else {} - ) - if self.cfg.remove_unused_columns is not None: - training_args_kwargs["remove_unused_columns"] = ( - self.cfg.remove_unused_columns - ) - else: - training_args_kwargs["remove_unused_columns"] = False - - if self.cfg.dataloader_pin_memory is not None: - training_args_kwargs["dataloader_pin_memory"] = ( - self.cfg.dataloader_pin_memory - ) - if self.cfg.dataloader_num_workers is not None: - training_args_kwargs["dataloader_num_workers"] = ( - self.cfg.dataloader_num_workers - ) - if self.cfg.dataloader_prefetch_factor is not None: - training_args_kwargs["dataloader_prefetch_factor"] = ( - self.cfg.dataloader_prefetch_factor - ) - - if self.cfg.seed is not None: - training_args_kwargs["seed"] = self.cfg.seed - - if self.cfg.gradient_checkpointing: - training_args_kwargs["gradient_checkpointing"] = ( - self.cfg.gradient_checkpointing - ) - if self.cfg.gradient_checkpointing_kwargs is not None: - training_args_kwargs["gradient_checkpointing_kwargs"] = ( - self.cfg.gradient_checkpointing_kwargs - ) - else: - training_args_kwargs["gradient_checkpointing_kwargs"] = { - "use_reentrant": False - } - - # set save_strategy and save_steps - if self.cfg.save_steps: - training_args_kwargs["save_strategy"] = "steps" - training_args_kwargs["save_steps"] = self.cfg.save_steps - elif self.cfg.save_strategy: - training_args_kwargs["save_strategy"] = self.cfg.save_strategy - else: - # default to saving each epoch if not defined - training_args_kwargs["save_strategy"] = "epoch" - - training_args_kwargs["save_only_model"] = self.cfg.save_only_model - - if self.cfg.dataset_processes: - training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes - - if self.cfg.trl and self.cfg.trl.beta is not None: - training_args_kwargs["beta"] = self.cfg.trl.beta - elif self.cfg.rl_beta is not None: - training_args_kwargs["beta"] = self.cfg.rl_beta - elif 
self.cfg.orpo_alpha is not None: - # trl does some odd mapping of alpha to beta to reuse the beta parameter ??? - training_args_kwargs["beta"] = self.cfg.orpo_alpha - - if self.cfg.rpo_alpha is not None: - training_args_kwargs["rpo_alpha"] = self.cfg.rpo_alpha - - if self.cfg.use_wandb: - training_args_kwargs["run_name"] = self.cfg.wandb_name - - training_args_cls = None - blocklist_args_kwargs = [] - if self.cfg.rl is RLType.SIMPO: - training_args_cls = AxolotlCPOConfig - training_args_kwargs["loss_type"] = "simpo" - training_args_kwargs["max_length"] = self.cfg.sequence_len - training_args_kwargs["simpo_gamma"] = self.cfg.simpo_gamma - if self.cfg.cpo_alpha is not None: - training_args_kwargs["cpo_alpha"] = self.cfg.cpo_alpha - - elif self.cfg.rl is RLType.ORPO: - training_args_cls = AxolotlORPOConfig - training_args_kwargs["max_length"] = self.cfg.sequence_len - if self.cfg.max_prompt_len: - training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len - - elif self.cfg.rl is RLType.KTO: - training_args_cls = AxolotlKTOConfig - - training_args_kwargs["desirable_weight"] = ( - self.cfg.kto_desirable_weight or 1.0 - ) - training_args_kwargs["undesirable_weight"] = ( - self.cfg.kto_undesirable_weight or 1.0 - ) - - training_args_kwargs["max_length"] = self.cfg.sequence_len - if self.cfg.max_prompt_len: - training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len - - elif self.cfg.rl is RLType.GRPO: - training_args_cls = GRPOStrategy.get_training_args_class() - training_args_kwargs.update(GRPOStrategy.set_training_args_kwargs(self.cfg)) - blocklist_args_kwargs = GRPOStrategy.get_blocklist_args_kwargs() - - else: - training_args_cls = AxolotlDPOConfig - if self.cfg.rl is RLType.IPO: - training_args_kwargs["loss_type"] = "ipo" - training_args_kwargs["max_length"] = self.cfg.sequence_len - training_args_kwargs["max_completion_length"] = None - training_args_kwargs["max_prompt_length"] = self.cfg.sequence_len - training_args_kwargs["generate_during_eval"] = self.cfg.use_wandb - if self.cfg.dpo_use_weighting is not None: - training_args_kwargs["use_weighting"] = self.cfg.dpo_use_weighting - if self.cfg.dpo_use_logits_to_keep is not None: - training_args_kwargs["use_logits_to_keep"] = ( - self.cfg.dpo_use_logits_to_keep - ) - - for blocklist_key in blocklist_args_kwargs: - if blocklist_key in training_args_kwargs: - del training_args_kwargs[blocklist_key] - - max_steps = self.cfg.max_steps or total_num_steps or -1 - training_args_kwargs["num_train_epochs"] = self.cfg.num_epochs - training_args = training_args_cls( # pylint: disable=unexpected-keyword-arg - self.cfg.output_dir, - per_device_train_batch_size=self.cfg.micro_batch_size, - max_steps=max_steps, - gradient_accumulation_steps=self.cfg.gradient_accumulation_steps, - learning_rate=self.cfg.learning_rate, - warmup_steps=self.cfg.warmup_steps, - logging_first_step=True, - logging_steps=1, - optim=self.cfg.optimizer, - save_total_limit=self.cfg.save_total_limit or 5, - **training_args_kwargs, - ) - - # unset run_name so wandb sets up experiment names - if self.cfg.use_wandb and training_args.run_name == training_args.output_dir: - training_args.run_name = ( # pylint: disable=attribute-defined-outside-init - None - ) - - return training_args - - def build(self, total_num_steps): - training_args = self.build_training_arguments(total_num_steps) - trainer_kwargs = {} - if self.cfg.rl is RLType.IPO: - if self.cfg.dpo_label_smoothing: - trainer_kwargs["label_smoothing"] = self.cfg.dpo_label_smoothing - if self.eval_dataset: - 
trainer_kwargs["eval_dataset"] = self.eval_dataset - if self.cfg.adapter and self.peft_config: - if self.cfg.rl is not RLType.GRPO: - trainer_kwargs["peft_config"] = self.peft_config - if self.cfg.precompute_ref_log_probs is not None: - trainer_kwargs["precompute_ref_log_probs"] = ( - self.cfg.precompute_ref_log_probs - ) - if self.cfg.rl is RLType.GRPO: - trainer_cls = GRPOStrategy.get_trainer_class( - sequence_parallel=self.cfg.sequence_parallel_degree > 1 - ) - trainer_cls_args = [self.model] - trainer_cls_args.extend(GRPOStrategy.set_trainer_args(self.cfg)) - trainer_kwargs.update(GRPOStrategy.set_trainer_kwargs(self.cfg)) - elif self.cfg.rl in [RLType.DPO, RLType.IPO]: - trainer_cls = DPOStrategy.get_trainer_class() - trainer_cls_args = [self.model, self.model_ref] - elif self.cfg.rl is RLType.ORPO: - trainer_cls = AxolotlORPOTrainer - trainer_cls_args = [self.model] - elif self.cfg.rl is RLType.KTO: - trainer_cls = AxolotlKTOTrainer - trainer_cls_args = [self.model] - elif self.cfg.rl is RLType.SIMPO: - trainer_cls = AxolotlCPOTrainer - trainer_cls_args = [self.model] - else: - raise ValueError(f"Unsupported RL: {self.cfg.rl}") - - if self.cfg.plugins: - plugin_manager = PluginManager.get_instance() - temp_trainer_cls = plugin_manager.get_trainer_cls(self.cfg) - if temp_trainer_cls is not None: - trainer_cls = temp_trainer_cls - - sig = inspect.signature(trainer_cls) - if "tokenizer" in sig.parameters.keys(): - trainer_kwargs["tokenizer"] = self.tokenizer - else: - trainer_kwargs["processing_class"] = self.tokenizer - - if self.cfg.datasets is not None and ( - trainer_cls is DPOStrategy.get_trainer_class() - ): - trainer_kwargs["dataset_tags"] = [ - d["path"] for d in self.cfg.datasets if not Path(d["path"]).is_dir() - ] - trainer = trainer_cls( - *trainer_cls_args, - args=training_args, - train_dataset=self.train_dataset, - callbacks=self.get_callbacks(), - **trainer_kwargs, - ) - if self.cfg.fsdp: - ensure_dtype(trainer.model, dtype=self.cfg.torch_dtype) - if self.cfg.rl in [RLType.DPO, RLType.IPO] and trainer.ref_model: - ensure_dtype(trainer.ref_model, dtype=self.cfg.torch_dtype) - - trainer = self.hook_post_create_trainer(trainer) - for callback in self.get_post_trainer_create_callbacks(trainer): - trainer.add_callback(callback) - - return trainer - - -class HFPPOTrainerBuilder(TrainerBuilderBase): - """ - HF Factory class for PPO Trainer - """ - - def get_callbacks(self): - callbacks = super().get_callbacks() - return callbacks - - def get_post_trainer_create_callbacks(self, trainer): - callbacks = super().get_post_trainer_create_callbacks(trainer=trainer) - return callbacks - - def build(self, total_num_steps): - # build PPOConfig - pass diff --git a/src/axolotl/core/trainers/__init__.py b/src/axolotl/core/trainers/__init__.py index 2cdc9c195..5f97e387a 100644 --- a/src/axolotl/core/trainers/__init__.py +++ b/src/axolotl/core/trainers/__init__.py @@ -7,12 +7,10 @@ from .base import AxolotlTrainer from .dpo.trainer import AxolotlDPOTrainer from .grpo.trainer import AxolotlGRPOSequenceParallelTrainer, AxolotlGRPOTrainer from .mamba import AxolotlMambaTrainer -from .relora import ReLoRATrainer from .trl import ( AxolotlCPOTrainer, AxolotlKTOTrainer, AxolotlORPOTrainer, AxolotlPRMTrainer, AxolotlRewardTrainer, - TRLPPOTrainer, ) diff --git a/src/axolotl/core/trainers/base.py b/src/axolotl/core/trainers/base.py index d5cfc23df..0f9f6e4c4 100644 --- a/src/axolotl/core/trainers/base.py +++ b/src/axolotl/core/trainers/base.py @@ -4,15 +4,17 @@ from __future__ import annotations 
-import logging import os from collections import defaultdict -from functools import wraps -from typing import Literal +from functools import partial, wraps +from typing import Any, Callable, Literal, Optional import datasets +import safetensors import torch +from accelerate.state import AcceleratorState from datasets import Dataset +from peft import PeftModel from torch.utils.data import ( BatchSampler, DataLoader, @@ -20,13 +22,19 @@ from torch.utils.data import ( Sampler, SequentialSampler, ) -from transformers import Trainer -from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, seed_worker +from transformers import PreTrainedModel, Trainer +from transformers.trainer import TRAINING_ARGS_NAME +from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, has_length, seed_worker +from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME, is_peft_available from trl.trainer.utils import pad_to_length from typing_extensions import override from axolotl.core.trainers.mixins import ( + ActivationOffloadingMixin, + CheckpointSaveMixin, + DistributedParallelMixin, OptimizerMixin, + PackingMixin, RngLoaderMixin, SchedulerMixin, ) @@ -34,12 +42,25 @@ from axolotl.core.trainers.utils import ( sanitize_kwargs_for_ds_tagging, sanitize_kwargs_for_tagging, ) +from axolotl.utils import get_not_null +from axolotl.utils.bench import get_gpu_memory_usage +from axolotl.utils.distributed import is_main_process +from axolotl.utils.logging import get_logger from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) -class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer): +class AxolotlTrainer( + PackingMixin, + SchedulerMixin, + OptimizerMixin, + RngLoaderMixin, + CheckpointSaveMixin, + ActivationOffloadingMixin, + DistributedParallelMixin, + Trainer, +): """Extend the base Trainer for axolotl helpers""" args = None # type: "AxolotlTrainingArguments" # type: ignore[name-defined] @@ -65,18 +86,6 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer): if self.args.orpo_alpha: self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none") - def _wrap_model(self, model, training=True, dataloader=None): - if self.args.torch_compile: - torch._dynamo.config.accumulated_cache_size_limit = ( # pylint: disable=protected-access - 256 - ) - model = torch.compile( - model, - backend=self.args.torch_compile_backend, - mode=self.args.torch_compile_mode, - ) - return super()._wrap_model(model, training=training, dataloader=dataloader) - def _create_multipack_sampler( self, base_sampler: Sampler, dataset: Dataset ) -> MultipackBatchSampler: @@ -101,7 +110,7 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer): ) batch_max_len = train_batch_size * self.args.max_seq_length - return MultipackBatchSampler( + sampler = MultipackBatchSampler( base_sampler, lengths=get_dataset_lengths(dataset), packing_efficiency_estimate=self.args.sample_packing_efficiency, @@ -111,9 +120,16 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer): bin_size=self.args.sample_packing_bin_size, sequential=self.args.sample_packing_sequentially, drop_last=True, + num_processes=self.args.dataset_num_proc, + mp_start_method=self.args.sample_packing_mp_start_method or "fork", ) - def _get_train_sampler(self) -> Sampler | None: + len(sampler) + return sampler + + def _get_train_sampler( + self, train_dataset: Dataset | None = None + ) -> Sampler | None: """ Helper 
method to get the sampler for training. Handles cases for sample packing and curriculum sampling (sequential). @@ -122,22 +138,28 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer): If the dataset is non-empty, a sampler is returned, the type of which depends on the passed training args. """ + # from https://github.com/huggingface/transformers/blob/2166b6b4ff09f6dd3867ab982f262f66482aa968/src/transformers/trainer.py#L969C1-L972C24 + if train_dataset is None: + train_dataset = self.train_dataset + if train_dataset is None or not has_length(train_dataset): + return None + use_sample_packing = self.args.sample_packing and not self.args.pretraining # Determine the base sampler first if self.args.curriculum_sampling: - base_sampler = SequentialSampler(self.train_dataset) + base_sampler = SequentialSampler(train_dataset) elif use_sample_packing: - base_sampler = RandomSampler(self.train_dataset) + base_sampler = RandomSampler(train_dataset) else: # Default to parent class implementation for standard random sampling - return super()._get_train_sampler() + return super()._get_train_sampler(train_dataset) # Apply multipack wrapper if needed if use_sample_packing: return self._create_multipack_sampler( base_sampler=base_sampler, - dataset=self.train_dataset, + dataset=train_dataset, ) return base_sampler @@ -150,7 +172,9 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer): If the dataset is non-empty, a sampler is returned, the type of which depends on the passed training args. """ - eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset + # from https://github.com/huggingface/transformers/blob/2166b6b4ff09f6dd3867ab982f262f66482aa968/src/transformers/trainer.py#L1065C9-L1066C24 + if eval_dataset is None or not has_length(eval_dataset): + return None # Multipacking enabled if training is enabled and eval is not explicitly disabled use_multipack = ( @@ -172,125 +196,101 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer): return base_sampler - def _create_dataloader_params(self, is_eval=False, custom_batch_size=None): - """Create common dataloader parameters for train or eval.""" - batch_size = custom_batch_size or ( - self.args.eval_batch_size if is_eval else self._train_batch_size - ) + def _get_dataloader( + self, + dataset: Dataset, + description: str, + batch_size: int, + sampler_fn: Optional[Callable[[Dataset], torch.utils.data.Sampler]] = None, + is_training: bool = False, + dataloader_key: Optional[str] = None, + ) -> DataLoader: + """Create a [`~torch.utils.data.DataLoader`] from the given dataset.""" - params = { + data_collator = self.data_collator if is_training else self.eval_data_collator + + if dataset.column_names and "length" in dataset.column_names: + dataset = dataset.remove_columns(["length"]) + if ( + dataset.column_names + and "position_ids" in dataset.column_names + and "attention_mask" in dataset.column_names + and self.args.sample_packing + and self.args.sample_packing_drop_attention_mask + ): + dataset = dataset.remove_columns(["attention_mask"]) + + if isinstance(dataset, datasets.Dataset): + if is_training: + if not self.args.sample_packing or self.args.pretraining: + dataset = self._remove_unused_columns( + dataset, description="training" + ) + elif ( + not is_training + and self.args.sample_packing + and self.args.eval_sample_packing is not False + ): + batch_size = ( + batch_size + if self.args.sample_packing + else self.args.per_device_eval_batch_size + ) + 
else: + dataset = self._remove_unused_columns(dataset, description=description) + else: + data_collator = self._get_collator_with_removed_columns( + self.data_collator, description=description + ) + + dataloader_params = { "batch_size": batch_size, - "collate_fn": self.data_collator, + "collate_fn": data_collator, "num_workers": self.args.dataloader_num_workers, "pin_memory": self.args.dataloader_pin_memory, + "persistent_workers": self.args.dataloader_persistent_workers, } - # Add persistent workers only for training - if not is_eval and hasattr(self.args, "dataloader_persistent_workers"): - params["persistent_workers"] = self.args.dataloader_persistent_workers - - # Add prefetch factor if specified - if self.args.dataloader_prefetch_factor: - params["prefetch_factor"] = self.args.dataloader_prefetch_factor - - return params - - def _prepare_dataloader( - self, dataset, sampler, is_eval=False, custom_batch_size=None - ): - """Prepare a dataloader with the given dataset and sampler.""" - # Get base parameters - dataloader_params = self._create_dataloader_params(is_eval, custom_batch_size) - - # Add sampler configuration if not isinstance(dataset, torch.utils.data.IterableDataset): - if isinstance(sampler, BatchSampler): - # batch_size and batch_sampler are mutually exclusive - dataloader_params["batch_sampler"] = sampler - del dataloader_params["batch_size"] - else: - dataloader_params["sampler"] = sampler - dataloader_params["drop_last"] = self.args.dataloader_drop_last - - if not is_eval: - dataloader_params["worker_init_fn"] = seed_worker - - # Create the dataloader - dataloader = DataLoader(dataset, **dataloader_params) + dataloader_params["drop_last"] = get_not_null( + self.args.dataloader_drop_last, True + ) + if sampler_fn is not None: + sampler = sampler_fn(dataset) + if isinstance(sampler, BatchSampler): + # batch_size and batch_sampler are mutually exclusive + dataloader_params["batch_sampler"] = sampler + del dataloader_params["batch_size"] + del dataloader_params["drop_last"] + else: + dataloader_params["sampler"] = sampler + dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor + if is_training: + dataloader_params["worker_init_fn"] = partial( + seed_worker, + num_workers=self.args.dataloader_num_workers, + rank=self.args.process_index, + ) if self.args.sample_packing and ( - (not is_eval and not self.args.pretraining) - or (is_eval and self.args.eval_sample_packing is not False) + (is_training and not self.args.pretraining) + or (not is_training and self.args.eval_sample_packing is not False) ): self.accelerator.even_batches = False - return self.accelerator.prepare_data_loader(dataloader) + dataloader = DataLoader(dataset, **dataloader_params) - def get_train_dataloader(self) -> DataLoader: - """Get dataloader for training""" - train_dataset = self.train_dataset - data_collator = self.data_collator # type: ignore + # Accelerator.free_memory() will destroy the references, so + # we need to store the non-prepared version for eval dataloaders. 
+ # fmt: off + if dataloader_key is not None and self.args.dataloader_persistent_workers: + if hasattr(self, "_eval_dataloaders"): + self._eval_dataloaders[dataloader_key] = dataloader # type: ignore # pylint: disable=access-member-before-definition + else: + self._eval_dataloaders = {dataloader_key: dataloader} # pylint: disable=attribute-defined-outside-init + # fmt: on - # Handle dataset preprocessing - if isinstance(train_dataset, datasets.Dataset): - if self.args.sample_packing and not self.args.pretraining: - train_dataset = train_dataset.remove_columns(["length"]) - if not self.args.sample_packing or self.args.pretraining: - train_dataset = self._remove_unused_columns( - train_dataset, description="training" - ) - else: - self.data_collator = self._get_collator_with_removed_columns( # pylint: disable=attribute-defined-outside-init - data_collator, - description="training", - ) - - # Get sampler and create dataloader - sampler = self._get_train_sampler() - return self._prepare_dataloader(train_dataset, sampler, is_eval=False) - - def get_eval_dataloader(self, eval_dataset: Dataset | None = None) -> DataLoader: - """Get dataloader for evaluation""" - eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset - - # Handle special case: sample packing is enabled but eval_sample_packing is False - if self.args.sample_packing and self.args.eval_sample_packing is False: - self.data_collator = ( # pylint: disable=attribute-defined-outside-init - self.eval_data_collator - ) - if "length" in eval_dataset.column_names: - eval_dataset = eval_dataset.remove_columns(["length"]) - dataloader = super().get_eval_dataloader(eval_dataset) - self.data_collator = ( # pylint: disable=attribute-defined-outside-init - self.train_data_collator - ) - - return dataloader - - if self.args.sample_packing and self.args.eval_sample_packing is not False: - # Get appropriate data collator - self.data_collator = ( # pylint: disable=attribute-defined-outside-init - self.eval_data_collator - if hasattr(self, "eval_data_collator") and self.eval_data_collator - else self.data_collator - ) - if "length" in eval_dataset.column_names: - eval_dataset = eval_dataset.remove_columns(["length"]) - - # Use eval_batch_size for sample packing, per_device_eval_batch_size otherwise - batch_size = ( - self.args.eval_batch_size - if self.args.sample_packing - else self.args.per_device_eval_batch_size - ) - sampler = self._get_eval_sampler(eval_dataset) - dataloader = self._prepare_dataloader( - eval_dataset, sampler, is_eval=True, custom_batch_size=batch_size - ) - - return dataloader - - return super().get_eval_dataloader(eval_dataset) + return self.accelerator.prepare(dataloader) def _get_bench_sampler( self, bench_dataset: Dataset @@ -520,7 +520,18 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer): @wraps(Trainer.create_accelerator_and_postprocess) def create_accelerator_and_postprocess(self): - res = super().create_accelerator_and_postprocess() + # cleanup the PartialState states so Accelerate automatically configures everything from the env vars + accelerator_config = self.args.accelerator_config.to_dict() + use_configured_state = accelerator_config.get("use_configured_state", False) + if not use_configured_state: + AcceleratorState._reset_state( # pylint: disable=protected-access + reset_partial_state=True + ) + + super().create_accelerator_and_postprocess() + + # now we need to put parallelism_config back on the PartialState since we rely on that info in other places + # 
PartialState().parallelism_config = self.accelerator.state.parallelism_config if self.is_fsdp_enabled: if ( @@ -529,17 +540,25 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer): ): self.accelerator.state.fsdp_plugin.limit_all_gathers = True - return res - + # pylint: disable=unused-argument def additional_accelerator_args( - self, fp8=None, **kwargs - ): # pylint: disable=unused-argument + self, fp8: bool = False, enable_fsdp_float8_all_gather: bool = False, **kwargs + ) -> dict[str, Any]: ret_kwargs = {} if fp8: from accelerate.utils import AORecipeKwargs + from torchao.float8 import Float8LinearConfig + + # By default, Float8LinearConfig is instantiated using the "tensorwise" + # scaling strategy. See more details here: + # https://github.com/pytorch/ao/tree/main/torchao/float8. + config = Float8LinearConfig( + enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather, + force_recompute_fp8_weight_in_bwd=enable_fsdp_float8_all_gather is True, + ) ret_kwargs["mixed_precision"] = "fp8" - ret_kwargs["kwargs_handlers"] = [AORecipeKwargs()] + ret_kwargs["kwargs_handlers"] = [AORecipeKwargs(config=config)] # type: ignore os.environ["ACCELERATE_MIXED_PRECISION"] = "fp8" return ret_kwargs @@ -557,6 +576,17 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer): # Add averaged stored metrics to logs for key, metrics in self._stored_metrics[train_eval].items(): logs[key] = torch.tensor(metrics).mean().item() + + if is_main_process(): + # Add memory usage + try: + active, allocated, reserved = get_gpu_memory_usage() + logs["memory/max_mem_active(gib)"] = round(active, 2) + logs["memory/max_mem_allocated(gib)"] = round(allocated, 2) + logs["memory/device_mem_reserved(gib)"] = round(reserved, 2) + except (ValueError, TypeError, FileNotFoundError): + pass + del self._stored_metrics[train_eval] return super().log(logs, start_time) @@ -574,3 +604,64 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer): output_dir = os.path.join(run_dir, checkpoint_folder) os.makedirs(output_dir, exist_ok=True) return super()._save_checkpoint(model, trial, **kwargs) + + # TODO(wing): remove once https://github.com/huggingface/transformers/pull/39866/files is merged + def _save(self, output_dir: Optional[str] = None, state_dict=None): + # If we are executing this function, we are the process zero, so we don't check for that. + output_dir = output_dir if output_dir is not None else self.args.output_dir + os.makedirs(output_dir, exist_ok=True) + LOG.info(f"Saving model checkpoint to {output_dir}") + supported_classes = ( + (PreTrainedModel,) + if not is_peft_available() + else (PreTrainedModel, PeftModel) + ) + # Save a trained model and configuration using `save_pretrained()`. + # They can then be reloaded using `from_pretrained()` + if not isinstance(self.model, supported_classes): + if state_dict is None: + state_dict = self.model.state_dict() + if isinstance( + self.accelerator.unwrap_model(self.model, keep_torch_compile=False), + supported_classes, + ): + self.accelerator.unwrap_model( + self.model, keep_torch_compile=False + ).save_pretrained( + output_dir, + state_dict=state_dict, + safe_serialization=self.args.save_safetensors, + ) + else: + LOG.info( + "Trainer.model is not a `PreTrainedModel`, only saving its state dict." 
+ ) + if self.args.save_safetensors: + safetensors.torch.save_file( + state_dict, + os.path.join(output_dir, SAFE_WEIGHTS_NAME), + metadata={"format": "pt"}, + ) + else: + torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) + else: + self.model.save_pretrained( + output_dir, + state_dict=state_dict, + safe_serialization=self.args.save_safetensors, + is_main_process=self.accelerator.is_main_process, + ) + + if self.processing_class is not None: + self.processing_class.save_pretrained(output_dir) + elif ( + self.data_collator is not None + and hasattr(self.data_collator, "tokenizer") + and self.data_collator.tokenizer is not None + ): + LOG.info( + "Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`" + ) + self.data_collator.tokenizer.save_pretrained(output_dir) + # Good practice: save your training arguments together with the trained model + torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) diff --git a/src/axolotl/core/trainers/dpo/__init__.py b/src/axolotl/core/trainers/dpo/__init__.py index 603fdf0b6..4b40d4085 100644 --- a/src/axolotl/core/trainers/dpo/__init__.py +++ b/src/axolotl/core/trainers/dpo/__init__.py @@ -22,10 +22,19 @@ class DPOStrategy: training_args_kwargs = {} if cfg.rl is RLType.IPO: training_args_kwargs["loss_type"] = "ipo" - training_args_kwargs["max_length"] = cfg.sequence_len + # Label smoothing is not compatible with IPO + if cfg.rl is RLType.DPO and cfg.dpo_label_smoothing: + training_args_kwargs["label_smoothing"] = cfg.dpo_label_smoothing training_args_kwargs["max_completion_length"] = None + training_args_kwargs["max_length"] = cfg.sequence_len training_args_kwargs["max_prompt_length"] = cfg.sequence_len - training_args_kwargs["generate_during_eval"] = cfg.use_wandb + training_args_kwargs["generate_during_eval"] = cfg.dpo_generate_during_eval if cfg.dpo_use_weighting is not None: training_args_kwargs["use_weighting"] = cfg.dpo_use_weighting + if cfg.dpo_padding_free is not None: + training_args_kwargs["padding_free"] = cfg.dpo_padding_free + if cfg.dpo_norm_loss is not None: + training_args_kwargs["dpo_norm_loss"] = cfg.dpo_norm_loss + if cfg.dpo_use_logits_to_keep is not None: + training_args_kwargs["use_logits_to_keep"] = cfg.dpo_use_logits_to_keep return training_args_kwargs diff --git a/src/axolotl/core/trainers/dpo/args.py b/src/axolotl/core/trainers/dpo/args.py index de1758ed0..b1e53236e 100644 --- a/src/axolotl/core/trainers/dpo/args.py +++ b/src/axolotl/core/trainers/dpo/args.py @@ -14,3 +14,5 @@ class AxolotlDPOConfig(AxolotlTrainingMixins, DPOConfig): """ DPO config for DPO training """ + + dpo_norm_loss: bool | None = False diff --git a/src/axolotl/core/trainers/dpo/trainer.py b/src/axolotl/core/trainers/dpo/trainer.py index c2c80c0bc..b3067bb46 100644 --- a/src/axolotl/core/trainers/dpo/trainer.py +++ b/src/axolotl/core/trainers/dpo/trainer.py @@ -5,65 +5,40 @@ from functools import wraps from typing import Any, Dict, Union import torch -from peft.optimizers import create_loraplus_optimizer from torch import nn -from transformers import Trainer -from transformers.utils import is_sagemaker_mp_enabled from trl import DPOTrainer -from axolotl.core.trainers.mixins import RngLoaderMixin, SchedulerMixin +from axolotl.core.trainers.mixins import ( + DistributedParallelMixin, + RngLoaderMixin, + SchedulerMixin, +) +from axolotl.core.trainers.mixins.optimizer import OptimizerInitMixin, OptimizerMixin from axolotl.core.trainers.utils import ( sanitize_kwargs_for_ds_tagging, 
sanitize_kwargs_for_tagging, ) -if is_sagemaker_mp_enabled(): - import smdistributed.modelparallel.torch as smp - -class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer): +class AxolotlDPOTrainer( + RngLoaderMixin, + SchedulerMixin, + OptimizerMixin, + OptimizerInitMixin, + DPOTrainer, + DistributedParallelMixin, +): """Extend the base DPOTrainer for axolotl helpers.""" tag_names = ["axolotl", "dpo"] def __init__(self, *args, dataset_tags=None, **kwargs): super().__init__(*args, **kwargs) + self.dataset_tags = dataset_tags self.optimizer = None self.model_accepts_loss_kwargs = False - def create_optimizer(self): - # pylint: disable=duplicate-code - if self.args.loraplus_lr_ratio is None: - return super().create_optimizer() - - opt_model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model - if self.optimizer is None: # pylint: disable=access-member-before-definition - optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs( - self.args, - opt_model, - ) - - loraplus_lr_ratio = getattr(self.args, "loraplus_lr_ratio", None) - if loraplus_lr_ratio: - print("Using lora+") - loraplus_lr_embedding = getattr(self.args, "loraplus_lr_embedding", None) - # pylint: disable=duplicate-code - self.optimizer = create_loraplus_optimizer( # pylint: disable=attribute-defined-outside-init - opt_model, - optimizer_cls, - loraplus_lr_ratio=loraplus_lr_ratio, - loraplus_lr_embedding=loraplus_lr_embedding, - **optimizer_kwargs, - ) - - if is_sagemaker_mp_enabled(): - self.optimizer = smp.DistributedOptimizer( # pylint: disable=attribute-defined-outside-init - self.optimizer - ) - - return self.optimizer - @wraps(DPOTrainer.push_to_hub) def push_to_hub(self, *args, **kwargs) -> str: """ @@ -117,3 +92,20 @@ class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer): gc.collect() torch.cuda.empty_cache() return loss + + def concatenated_forward( + self, + model: nn.Module, + batch: dict[str, Union[list, torch.LongTensor]], + is_ref_model: bool = False, + ) -> dict[str, torch.Tensor]: + if self.args.dpo_norm_loss: + # fmt: off + loss_type: str = self.loss_type # type: ignore[has-type] # pylint: disable=access-member-before-definition + # fmt: on + # concatenated_forward handles avg token logprob for ipo case already + self.loss_type = "ipo" # pylint: disable=attribute-defined-outside-init + res = super().concatenated_forward(model, batch, is_ref_model=is_ref_model) + self.loss_type = loss_type # pylint: disable=attribute-defined-outside-init + return res + return super().concatenated_forward(model, batch, is_ref_model=is_ref_model) diff --git a/src/axolotl/core/trainers/grpo/__init__.py b/src/axolotl/core/trainers/grpo/__init__.py index f4685893b..4106a2a7d 100644 --- a/src/axolotl/core/trainers/grpo/__init__.py +++ b/src/axolotl/core/trainers/grpo/__init__.py @@ -2,9 +2,11 @@ import importlib import inspect -import logging +import os from typing import Any +from huggingface_hub import snapshot_download +from requests import HTTPError from trl.trainer.grpo_trainer import RewardFunc from axolotl.core.trainers.grpo.args import AxolotlGRPOConfig @@ -13,9 +15,11 @@ from axolotl.core.trainers.grpo.trainer import ( AxolotlGRPOTrainer, ) from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger from axolotl.utils.schemas.trl import TRLConfig +from axolotl.utils.schemas.vllm import VllmConfig -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) class GRPOStrategy: @@ -41,9 +45,19 @@ class GRPOStrategy: return grpo_args_kwargs trl: 
TRLConfig = cfg.trl # type: ignore + vllm_cfg: VllmConfig = cfg.vllm # type: ignore if trl.use_vllm: grpo_args_kwargs["use_vllm"] = trl.use_vllm + if trl.vllm_mode: + grpo_args_kwargs["vllm_mode"] = trl.vllm_mode + if trl.vllm_mode == "colocate": + grpo_args_kwargs["vllm_gpu_memory_utilization"] = ( + vllm_cfg.gpu_memory_utilization + ) + grpo_args_kwargs["vllm_tensor_parallel_size"] = ( + vllm_cfg.tensor_parallel_size + ) grpo_args_kwargs["vllm_server_host"] = trl.vllm_server_host or trl.vllm.host # type: ignore[attr-defined] grpo_args_kwargs["vllm_server_port"] = trl.vllm_server_port or trl.vllm.port # type: ignore[attr-defined] if trl.vllm_server_timeout: @@ -69,6 +83,14 @@ class GRPOStrategy: grpo_args_kwargs["log_completions"] = trl.log_completions grpo_args_kwargs["num_completions_to_print"] = trl.num_completions_to_print + if cfg.context_parallel_size > 1: + grpo_args_kwargs["context_parallel_size"] = cfg.context_parallel_size + + if trl.importance_sampling_level is not None: + grpo_args_kwargs["importance_sampling_level"] = ( + trl.importance_sampling_level + ) + if trl.reward_weights: grpo_args_kwargs["reward_weights"] = trl.reward_weights @@ -106,7 +128,9 @@ class GRPOStrategy: return grpo_args_kwargs @classmethod - def set_trainer_args(cls, cfg: DictDefault) -> list[Any]: + def set_trainer_args( + cls, cfg: DictDefault + ) -> list[Any]: # pylint: disable=unused-argument trainer_args = [] if cfg.trl and cfg.trl.reward_funcs: reward_funcs = [] @@ -123,6 +147,7 @@ class GRPOStrategy: trainer_kwargs["reward_processing_classes"] = ( cfg.trl.reward_processing_classes ) + return trainer_kwargs @classmethod @@ -132,7 +157,7 @@ class GRPOStrategy: @classmethod def get_blocklist_args_kwargs(cls) -> list[str]: - return ["dataset_num_proc"] + return ["dataset_num_proc", "max_length", "include_tokens_per_second"] @classmethod def get_reward_func(cls, reward_func_fqn: str) -> RewardFunc: @@ -162,9 +187,18 @@ class GRPOStrategy: "Reward function must accept at least two arguments: prompts: list and completions: list" ) return reward_func - except ModuleNotFoundError: + except ModuleNotFoundError as exc: # the user has passed a string (ideally indicating the path of a reward model) - LOG.info( - f"Reward function {reward_func_fqn} is a pre-trained model path - if this is unexpected, please check the reward function path." - ) - return reward_func + # check if it's a local dir path and not empty dir to a reward model + pretrained_log_msg = f"Reward function {reward_func_fqn} is a pre-trained model path - if this is unexpected, please check the reward function path." + if os.path.isdir(reward_func_fqn) and os.listdir(reward_func_fqn): + LOG.info(pretrained_log_msg) + return reward_func_fqn + try: + snapshot_download(reward_func_fqn, repo_type="model") + LOG.info(pretrained_log_msg) + return reward_func_fqn + except HTTPError: + raise ValueError( + f"Reward function {reward_func_fqn} not found." 
+ ) from exc diff --git a/src/axolotl/core/trainers/grpo/args.py b/src/axolotl/core/trainers/grpo/args.py index 76be88c89..2ea52998e 100644 --- a/src/axolotl/core/trainers/grpo/args.py +++ b/src/axolotl/core/trainers/grpo/args.py @@ -12,3 +12,5 @@ from axolotl.core.training_args import AxolotlTrainingMixins @dataclass class AxolotlGRPOConfig(AxolotlTrainingMixins, GRPOConfig): """Axolotl GRPO Config for GRPO training""" + + context_parallel_size: int | None = None diff --git a/src/axolotl/core/trainers/grpo/sampler.py b/src/axolotl/core/trainers/grpo/sampler.py index ebc6e19e2..df679a6d2 100644 --- a/src/axolotl/core/trainers/grpo/sampler.py +++ b/src/axolotl/core/trainers/grpo/sampler.py @@ -20,7 +20,7 @@ class SequenceParallelRepeatRandomSampler(Sampler): - Data is properly distributed across SP groups. In the table below, the values represent dataset indices. Each SP group has - `sequence_parallel_degree = 2` GPUs working together on the same data. There are 2 + `context_parallel_size = 2` GPUs working together on the same data. There are 2 SP groups (SP0 and SP1), with `world_size = 4` total GPUs. Sequence Parallel Groups @@ -45,7 +45,7 @@ class SequenceParallelRepeatRandomSampler(Sampler): rank: Rank of current process. batch_size: Number of samples per batch. repeat_count: How many times to repeat the full sampling process. - sequence_parallel_degree: Number of ranks in a sequence parallel group. + context_parallel_size: Number of ranks in a sequence parallel group. shuffle: Whether to shuffle the dataset. seed: Random seed for shuffling. drop_last: Whether to drop the last incomplete batch. @@ -59,7 +59,7 @@ class SequenceParallelRepeatRandomSampler(Sampler): rank: int, batch_size: int = 1, repeat_count: int = 1, - sequence_parallel_degree: int = 1, + context_parallel_size: int = 1, shuffle: bool = True, seed: int = 0, drop_last: bool = False, @@ -77,9 +77,9 @@ class SequenceParallelRepeatRandomSampler(Sampler): self.rank = rank # Sequence parallelism parameters - self.sequence_parallel_degree = sequence_parallel_degree - self.num_sp_groups = world_size // sequence_parallel_degree - self.sp_group_id = rank // sequence_parallel_degree + self.context_parallel_size = context_parallel_size + self.num_sp_groups = world_size // context_parallel_size + self.sp_group_id = rank // context_parallel_size # Adjust dataset size for distributed sampling self.num_samples = len(self.dataset) diff --git a/src/axolotl/core/trainers/grpo/trainer.py b/src/axolotl/core/trainers/grpo/trainer.py index b5b3912cf..49caa6406 100644 --- a/src/axolotl/core/trainers/grpo/trainer.py +++ b/src/axolotl/core/trainers/grpo/trainer.py @@ -3,6 +3,7 @@ # pylint: disable=too-many-lines,duplicate-code,protected-access,no-member import warnings +from functools import partial from typing import Any import datasets @@ -42,7 +43,12 @@ from trl.trainer.grpo_trainer import RewardFunc, nanstd from trl.trainer.utils import pad from axolotl.core.trainers.grpo.sampler import SequenceParallelRepeatRandomSampler -from axolotl.core.trainers.mixins import RngLoaderMixin, SchedulerMixin +from axolotl.core.trainers.mixins import ( + DistributedParallelMixin, + RngLoaderMixin, + SchedulerMixin, +) +from axolotl.core.trainers.mixins.optimizer import OptimizerInitMixin, OptimizerMixin from axolotl.monkeypatch.ring_attn import get_ring_attn_group if is_peft_available(): @@ -50,7 +56,14 @@ if is_peft_available(): from peft import PeftConfig -class AxolotlGRPOTrainer(RngLoaderMixin, SchedulerMixin, GRPOTrainer): +class AxolotlGRPOTrainer( + 
RngLoaderMixin, + SchedulerMixin, + OptimizerMixin, + OptimizerInitMixin, + DistributedParallelMixin, + GRPOTrainer, +): """Extend the base GRPOTrainer for axolotl helpers""" _tag_names = ["trl", "grpo", "axolotl"] @@ -77,6 +90,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer): torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None ] = (None, None), peft_config: "PeftConfig | None" = None, + optimizer_cls_and_kwargs: tuple[type, dict] | None = None, ): # First call the superclass constructor with all arguments super().__init__( @@ -90,11 +104,12 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer): callbacks=callbacks, optimizers=optimizers, peft_config=peft_config, + optimizer_cls_and_kwargs=optimizer_cls_and_kwargs, ) # Get number of SP groups (number of processes divided by SP degree) num_processes = self.accelerator.num_processes - num_sp_groups = num_processes // self.args.sequence_parallel_degree + num_sp_groups = num_processes // self.args.context_parallel_size # Calculate batch size per SP group (not per process) sp_group_batch_size = self.args.per_device_train_batch_size * num_sp_groups @@ -124,13 +139,20 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer): if self.num_generations not in possible_values: raise ValueError( - f"With sequence parallelism (degree {self.args.sequence_parallel_degree}), " + f"With sequence parallelism (degree {self.args.context_parallel_size}), " f"the eval batch size per SP group ({num_sp_groups} x {self.args.per_device_eval_batch_size}) " f"must be evenly divisible by the number of generations per prompt " f"({self.num_generations}). Given the current eval batch size, " f"the valid values for the number of generations are: {possible_values}." ) + self.sp_group = None + self.rank = dist.get_rank() + self.world_size = dist.get_world_size() + self.local_rank = 0 + self.local_world_size = 1 + + def train(self, *args, **kwargs): # Initialize the SP group self.sp_group = get_ring_attn_group() self.rank = dist.get_rank() @@ -138,6 +160,8 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer): self.local_rank = dist.get_rank(group=self.sp_group) self.local_world_size = dist.get_world_size(group=self.sp_group) + return super().train(*args, **kwargs) + def _get_train_sampler(self) -> Sampler: effective_batch_size = ( self.args.per_device_train_batch_size @@ -152,9 +176,9 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer): rank=self.rank, batch_size=effective_batch_size // self.num_generations - // self.args.sequence_parallel_degree, + // self.args.context_parallel_size, repeat_count=self.num_iterations * self.args.gradient_accumulation_steps, - sequence_parallel_degree=self.args.sequence_parallel_degree, + context_parallel_size=self.args.context_parallel_size, shuffle=True, seed=self.args.seed, drop_last=True, @@ -201,7 +225,11 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer): dataloader_params["drop_last"] = self.args.dataloader_drop_last if not is_eval: - dataloader_params["worker_init_fn"] = seed_worker + dataloader_params["worker_init_fn"] = partial( + seed_worker, + num_workers=self.args.dataloader_num_workers, + rank=self.args.process_index, + ) # Create the dataloader dataloader = DataLoader(dataset, **dataloader_params) @@ -216,7 +244,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer): # TODO(djsaunde): We might be able to use `accelerate`'s dataloader preparation # if we use `dispatch_batches` and `slice_fn_for_dispatch` properly (i.e., # slice 
each batch along the sequence dimension). - if self.args.sequence_parallel_degree > 1: + if self.args.context_parallel_size > 1: return dataloader # Otherwise prepare with accelerator @@ -289,18 +317,18 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer): # Generate completions using vLLM: gather all prompts and use them in a single call in the main process all_prompts_text = gather_object(prompts_text) if self.accelerator.is_main_process: - if self.args.sequence_parallel_degree > 1: + if self.args.context_parallel_size > 1: # Calculate sequence parallel group information world_size = self.accelerator.num_processes - sequence_parallel_degree = self.args.sequence_parallel_degree - num_sp_groups = world_size // sequence_parallel_degree + context_parallel_size = self.args.context_parallel_size + num_sp_groups = world_size // context_parallel_size # Since processes in the same SP group have the same prompts, we need to ensure # we only take one copy of each prompt from each SP group ordered_set_of_prompts = [] for sp_group_id in range(num_sp_groups): # Get the first process from each SP group (typically the group leader) - group_leader_rank = sp_group_id * sequence_parallel_degree + group_leader_rank = sp_group_id * context_parallel_size # Extract prompts from this SP group, accounting for num_generations duplicates # We only need prompts from one rank in each SP group @@ -316,7 +344,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer): # num_generations outputs for each one. This is faster than generating outputs for each duplicate # prompt individually. ordered_set_of_prompts = all_prompts_text[ - :: self.num_generations * self.args.sequence_parallel_degree + :: self.num_generations * self.args.context_parallel_size ] with profiling_context(self, "vLLM.generate"): @@ -333,14 +361,14 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer): ) else: completion_ids = [None] * ( - len(all_prompts_text) // self.args.sequence_parallel_degree + len(all_prompts_text) // self.args.context_parallel_size ) # Broadcast the completions from the main process to all processes completion_ids = broadcast_object_list(completion_ids, from_process=0) # Determine the appropriate slice based on sequence parallelism - if self.args.sequence_parallel_degree > 1: + if self.args.context_parallel_size > 1: # Calculate SP group ID (which group of ranks this rank belongs to) sp_group_id = self.accelerator.process_index // self.local_world_size @@ -564,7 +592,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer): advantages = advantages / (std_grouped_rewards + 1e-4) # Slice to keep only the local part of the data - if self.args.sequence_parallel_degree > 1: + if self.args.context_parallel_size > 1: # Calculate SP group ID (which group of ranks this rank belongs to) sp_group_id = self.accelerator.process_index // self.local_world_size diff --git a/src/axolotl/core/trainers/mamba.py b/src/axolotl/core/trainers/mamba.py index 38792e389..b475b26d9 100644 --- a/src/axolotl/core/trainers/mamba.py +++ b/src/axolotl/core/trainers/mamba.py @@ -5,6 +5,7 @@ import torch from axolotl.core.trainers.base import AxolotlTrainer +# pylint: disable=too-many-ancestors class AxolotlMambaTrainer(AxolotlTrainer): """Mamba specific trainer to handle loss calculation""" diff --git a/src/axolotl/core/trainers/mixins/__init__.py b/src/axolotl/core/trainers/mixins/__init__.py index a71cb321a..b54577765 100644 --- a/src/axolotl/core/trainers/mixins/__init__.py +++ 
b/src/axolotl/core/trainers/mixins/__init__.py @@ -3,6 +3,10 @@ # pylint: disable=unused-import # flake8: noqa +from .activation_checkpointing import ActivationOffloadingMixin +from .checkpoints import CheckpointSaveMixin +from .distributed_parallel import DistributedParallelMixin from .optimizer import OptimizerMixin +from .packing import PackingMixin from .rng_state_loader import RngLoaderMixin from .scheduler import SchedulerMixin diff --git a/src/axolotl/core/trainers/mixins/activation_checkpointing.py b/src/axolotl/core/trainers/mixins/activation_checkpointing.py new file mode 100644 index 000000000..1bfdb49f7 --- /dev/null +++ b/src/axolotl/core/trainers/mixins/activation_checkpointing.py @@ -0,0 +1,217 @@ +""" +Trainer mixin for activation checkpointing w offloading +""" + +import contextlib + +from peft import PeftModel +from torch import nn +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( + apply_activation_checkpointing, +) +from torch.distributed.fsdp.wrap import ModuleWrapPolicy +from transformers import GradientCheckpointingLayer, Trainer +from trl.models.activation_offloading import ( + NoOpManager, + OffloadActivations, + get_act_offloading_ctx_manager, +) + +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) + + +class ActivationOffloadingMixin(Trainer): + """ + Trainer mixin class for activation checkpointing w offloading + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.args.activation_offloading: + if isinstance(self.model, PeftModel): + self.activation_offload_context = get_lora_act_offloading_ctx_manager( + self.model, use_streams=True + ) + else: + self.activation_offload_context = get_act_offloading_ctx_manager( + self.model, use_streams=True + ) + else: + self.activation_offload_context = contextlib.nullcontext() + + def training_step(self, *args, **kwargs): + with self.activation_offload_context: + return super().training_step(*args, **kwargs) + + +def ac_wrap_hf_model(model: nn.Module, **kwargs): + auto_wrap_policy = ModuleWrapPolicy(set((GradientCheckpointingLayer,))) + apply_activation_checkpointing(model, auto_wrap_policy=auto_wrap_policy, **kwargs) + + +def get_lora_act_offloading_ctx_manager( + model: nn.Module, + use_pin_memory: bool = True, + use_streams: bool = True, + min_offload_size: int = 1024, + max_fwd_stash_size: int = 5, + warn_if_no_head: bool = True, +) -> OffloadActivations: + """ + Returns the activation offloading context manager for the model. All but the last output Linear in every step will + be offloaded. + + If activation offloading is enabled, we return the OffloadActivations context manager. If activation offloading is + disabled, we return a NoOpManager context manager. + + Args: + model (`nn.Module`): + Model to wrap with the activation offloading context manager. + use_pin_memory (`bool`, *optional*, defaults to `True`): + Whether to offloaded Tensor will be placed in pinned memory on the CPU. Pinned memory allows the Tensor to + be moved back onto GPU more quickly but is a limited resource. + use_streams (`bool`, *optional*, defaults to `True`): + Whether to use streams for performance optimization where the communications get overlapped with the + computation. Requires a torch build after torch-2.5.0. + min_offload_size (`int`, *optional*, defaults to `1024`): + Minimum number of bytes a Tensor must be in order to qualify for offloading. If the tensor is too small, we + do not want to waste bandwidth and resources moving it to CPU and back. 
+ max_fwd_stash_size (`int`, *optional*, defaults to `5`): + Maximum size of the forward stash, or the maximum number of consecutive activations to keep alive during + the forward pass. This number must be at least 1. Keeping alive more activations will potentially allow + more overlap between the communication and compute streams at the cost of increasing memory usage. Keeping + alive fewer activations will conserve memory, but may cause poor overlap between the streams, increasing + runtime. + warn_if_no_head (`bool`, *optional*, defaults to `True`): + Whether to warn if no output head is detected. If set to `False`, no warning will be raised if no output + head is detected. + + Returns: + `contextlib.ContextDecorator`: + Activation offloading context manager for the model. + """ + # pylint: disable=unnecessary-dunder-call + activations_handling_ctx = OffloadActivations( + use_pin_memory=use_pin_memory, + use_streams=use_streams, + min_offload_size=min_offload_size, + max_fwd_stash_size=max_fwd_stash_size, + ) + + # Below is our hack to disable offloading the last output Linear in every + # step, as the cost for offloading the activation and then soon after bringing + # it back is expensive. + output_head_detected = False + noop_ctx = NoOpManager() + + # Try to get the actual model if it's wrapped + unwrapped_model = model + if hasattr(unwrapped_model, "module"): + unwrapped_model = unwrapped_model.module + # check for PEFT models + if hasattr(unwrapped_model, "base_model") and hasattr( + unwrapped_model, "peft_config" + ): + unwrapped_model = unwrapped_model.base_model + + # Check for different types of output heads + if hasattr(unwrapped_model, "output"): + if isinstance(unwrapped_model.output, nn.Module): + unwrapped_model.output.register_forward_pre_hook( + lambda *args: noop_ctx.__enter__() + ) + unwrapped_model.output.register_forward_hook( + lambda *args: noop_ctx.__exit__(), always_call=True + ) + output_head_detected = True + elif hasattr(unwrapped_model.output, "linear") and isinstance( + unwrapped_model.output.linear, nn.Module + ): + unwrapped_model.output.linear.register_forward_pre_hook( + lambda *args: noop_ctx.__enter__() + ) + unwrapped_model.output.linear.register_forward_hook( + lambda *args: noop_ctx.__exit__(), always_call=True + ) + output_head_detected = True + + # Check for HuggingFace model output heads + elif hasattr(unwrapped_model, "lm_head"): + unwrapped_model.lm_head.register_forward_pre_hook( + lambda *args: noop_ctx.__enter__() + ) + unwrapped_model.lm_head.register_forward_hook( + lambda *args: noop_ctx.__exit__(), always_call=True + ) + output_head_detected = True + + # Check for decoder-based models + elif hasattr(unwrapped_model, "decoder"): + decoder = unwrapped_model.decoder + if hasattr(decoder, "output"): + decoder.output.register_forward_pre_hook(lambda *args: noop_ctx.__enter__()) + decoder.output.register_forward_hook( + lambda *args: noop_ctx.__exit__(), always_call=True + ) + output_head_detected = True + # Some models have lm_head in the decoder + elif hasattr(decoder, "lm_head"): + decoder.lm_head.register_forward_pre_hook( + lambda *args: noop_ctx.__enter__() + ) + decoder.lm_head.register_forward_hook( + lambda *args: noop_ctx.__exit__(), always_call=True + ) + output_head_detected = True + + # Check for transformer models with final layer norm + elif hasattr(unwrapped_model, "final_layer_norm") or hasattr( + unwrapped_model, "ln_f" + ): + final_norm = ( + getattr(unwrapped_model, "final_layer_norm", None) or unwrapped_model.ln_f + ) + 
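# --- Illustrative sketch, not part of the diff ---
# The hook trick used above: a module that should stay exempt from activation
# offloading (e.g. the output head) has the no-op context's __enter__/__exit__
# registered as forward pre/post hooks, so only its forward pass runs outside the
# offloading region. NoOffloadRegion is a made-up stand-in for trl's NoOpManager;
# everything else is plain torch.
import torch
from torch import nn

class NoOffloadRegion:
    """Toy stand-in that only records whether the exempt region is active."""
    def __init__(self):
        self.active = False
        self.times_entered = 0
    def __enter__(self):
        self.active = True
        self.times_entered += 1
        # intentionally returns None so the forward pre-hook does not replace inputs
    def __exit__(self, *exc):
        self.active = False

noop_ctx = NoOffloadRegion()
lm_head = nn.Linear(8, 4)

# Same registration pattern as in get_lora_act_offloading_ctx_manager above.
lm_head.register_forward_pre_hook(lambda *args: noop_ctx.__enter__())
lm_head.register_forward_hook(lambda *args: noop_ctx.__exit__(), always_call=True)

_ = lm_head(torch.randn(2, 8))
print(noop_ctx.times_entered, noop_ctx.active)  # 1 False: region opened for the forward, then closed
# --- end sketch ---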
final_norm.register_forward_pre_hook(lambda *args: noop_ctx.__enter__()) + final_norm.register_forward_hook( + lambda *args: noop_ctx.__exit__(), always_call=True + ) + output_head_detected = True + + # Check for models with head module + elif hasattr(unwrapped_model, "head") and isinstance( + unwrapped_model.head, nn.Module + ): + unwrapped_model.head.register_forward_pre_hook( + lambda *args: noop_ctx.__enter__() + ) + unwrapped_model.head.register_forward_hook( + lambda *args: noop_ctx.__exit__(), always_call=True + ) + output_head_detected = True + + if not output_head_detected and warn_if_no_head: + LOG.warning( + "During activation offloading, no output head was detected. If your model has an output head, it will be " + "offloaded. This usually greatly slows training, given the large vocabulary size. To change this " + "behavior, set your output head as model.output and make it an nn.Module. You can disable this warning by " + "passing `warn_if_no_head=False`." + ) + + for name, module in unwrapped_model.named_modules(): + # Disable offloading for any Liger modules + if "liger" in name.lower(): + module.register_forward_pre_hook(lambda *args: noop_ctx.__enter__()) + module.register_forward_hook( + lambda *args: noop_ctx.__exit__(), always_call=True + ) + # disable offloading for any submodules to fix LoRA training + if name.endswith("._checkpoint_wrapped_module"): + for _, sub_module in module.named_modules(): + sub_module.register_forward_pre_hook(lambda *args: noop_ctx.__enter__()) + sub_module.register_forward_hook( + lambda *args: noop_ctx.__exit__(), always_call=True + ) + + return activations_handling_ctx diff --git a/src/axolotl/core/trainers/mixins/checkpoints.py b/src/axolotl/core/trainers/mixins/checkpoints.py new file mode 100644 index 000000000..4042ef9f1 --- /dev/null +++ b/src/axolotl/core/trainers/mixins/checkpoints.py @@ -0,0 +1,23 @@ +"""Custom handling to not fail training if fsdp optimizer is not savable""" + +from transformers import Trainer + +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) + + +class CheckpointSaveMixin(Trainer): + """Mixin to handle saving the optimizer and scheduler if they are not savable.""" + + def _save_optimizer_and_scheduler(self, output_dir): + try: + super()._save_optimizer_and_scheduler(output_dir) + except (NotImplementedError, KeyError) as exc: + # TODO: fix fsdp2 optimizer saving + LOG.warning_once( + f"Trainer does not support saving optimizer and scheduler: {exc}\n" + "Optimizer and scheduler states were not saved - resuming from checkpoints " + "for this training run will not be possible.", + main_process_only=True, + ) diff --git a/src/axolotl/core/trainers/mixins/distributed_parallel.py b/src/axolotl/core/trainers/mixins/distributed_parallel.py new file mode 100644 index 000000000..d163e4eb5 --- /dev/null +++ b/src/axolotl/core/trainers/mixins/distributed_parallel.py @@ -0,0 +1,33 @@ +""" +Mixin for correctly saving fsdp +""" + +from accelerate import PartialState +from transformers import Trainer + + +class DistributedParallelMixin(Trainer): + """ + Mixin for correctly saving fsdp + """ + + def _save(self, output_dir: str | None = None, state_dict=None): + if ( + state_dict is None + and self.accelerator.parallelism_config + and self.accelerator.parallelism_config.dp_shard_enabled + ): + state_dict = self.accelerator.get_state_dict(self.model) + super()._save(output_dir, state_dict=state_dict) + + def create_accelerator_and_postprocess(self): + super().create_accelerator_and_postprocess() + if ( + 
self.accelerator.distributed_type == "FSDP" + and self.accelerator.state.fsdp_plugin is None + ): + # pylint: disable=protected-access + # handle Context Parallelism without FSDP + self.accelerator.state.distributed_type = "MULTI_GPU" + self.accelerator.state._shared_state["distributed_type"] = "MULTI_GPU" + PartialState().distributed_type = "MULTI_GPU" diff --git a/src/axolotl/core/trainers/mixins/optimizer.py b/src/axolotl/core/trainers/mixins/optimizer.py index bde58aa1d..a9a9a3992 100644 --- a/src/axolotl/core/trainers/mixins/optimizer.py +++ b/src/axolotl/core/trainers/mixins/optimizer.py @@ -1,18 +1,17 @@ """Module for Axolotl trainer optimizer mixin""" -import logging - from peft.optimizers import create_loraplus_optimizer from torch import nn from transformers.trainer import Trainer from transformers.utils import is_sagemaker_mp_enabled from axolotl.integrations.base import BaseOptimizerFactory +from axolotl.utils.logging import get_logger if is_sagemaker_mp_enabled(): import smdistributed.modelparallel.torch as smp -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) class OptimizerMixin(Trainer): @@ -199,3 +198,20 @@ class OptimizerMixin(Trainer): ) return self.optimizer + + +class OptimizerInitMixin: + """ + Mixin to handle common optimizer initialization logic for Trainers (mostly TRL) that do not + accept optimizer_cls_and_kwargs as kwarg in constructor. + """ + + def __init__(self, *args, **kwargs): + optimizer_cls_and_kwargs = kwargs.pop("optimizer_cls_and_kwargs", None) + super().__init__(*args, **kwargs) + if ( + optimizer_cls_and_kwargs + and self.optimizer_cls_and_kwargs is None + and self.optimizer is None + ): + self.optimizer_cls_and_kwargs = optimizer_cls_and_kwargs diff --git a/src/axolotl/core/trainers/mixins/packing.py b/src/axolotl/core/trainers/mixins/packing.py new file mode 100644 index 000000000..249ceeb4f --- /dev/null +++ b/src/axolotl/core/trainers/mixins/packing.py @@ -0,0 +1,20 @@ +"""Trainer mixin to support packing""" + +from transformers import Trainer + + +class PackingMixin(Trainer): + """ + Trainer mixin to support packing + """ + + def _set_signature_columns_if_needed(self): + super()._set_signature_columns_if_needed() + if ( + self._signature_columns + and self.args.sample_packing + and self.args.sample_packing_drop_attention_mask + ): + set_sig_columns = set(self._signature_columns) + set_sig_columns.remove("attention_mask") + self._signature_columns = list(set_sig_columns) diff --git a/src/axolotl/core/trainers/mixins/rng_state_loader.py b/src/axolotl/core/trainers/mixins/rng_state_loader.py index 0e101dabb..f248394b2 100644 --- a/src/axolotl/core/trainers/mixins/rng_state_loader.py +++ b/src/axolotl/core/trainers/mixins/rng_state_loader.py @@ -6,7 +6,6 @@ See https://github.com/huggingface/transformers/pull/37162 TODO: Remove when upstream added PR to release """ -import logging import os import random @@ -17,7 +16,9 @@ from transformers.trainer import safe_globals from transformers.trainer_pt_utils import set_rng_state_for_device from transformers.training_args import ParallelMode -LOG = logging.getLogger(__name__) +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) class RngLoaderMixin(Trainer): diff --git a/src/axolotl/core/trainers/mixins/scheduler.py b/src/axolotl/core/trainers/mixins/scheduler.py index 0c36f9f95..399bf5947 100644 --- a/src/axolotl/core/trainers/mixins/scheduler.py +++ b/src/axolotl/core/trainers/mixins/scheduler.py @@ -1,20 +1,20 @@ """Module for Axolotl trainer scheduler mixin""" 
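# --- Illustrative sketch, not part of the diff ---
# The cooperative-__init__ pattern that OptimizerInitMixin above relies on: the mixin
# pops `optimizer_cls_and_kwargs` before it can reach a TRL trainer constructor that
# does not accept it, lets the rest of the MRO initialize normally, then applies the
# value afterwards. FakeTrainer and AxolotlFakeTrainer are made-up names for this
# example only.
import torch

class FakeTrainer:
    def __init__(self, *args, **kwargs):
        self.optimizer = None
        self.optimizer_cls_and_kwargs = None

class InitMixin:
    def __init__(self, *args, **kwargs):
        optimizer_cls_and_kwargs = kwargs.pop("optimizer_cls_and_kwargs", None)
        super().__init__(*args, **kwargs)  # FakeTrainer never sees the extra kwarg
        if (
            optimizer_cls_and_kwargs
            and self.optimizer_cls_and_kwargs is None
            and self.optimizer is None
        ):
            self.optimizer_cls_and_kwargs = optimizer_cls_and_kwargs

class AxolotlFakeTrainer(InitMixin, FakeTrainer):
    pass

trainer = AxolotlFakeTrainer(optimizer_cls_and_kwargs=(torch.optim.AdamW, {"lr": 1e-4}))
print(trainer.optimizer_cls_and_kwargs)  # the popped value survives on the instance
# --- end sketch ---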
-import logging - import torch from torch.optim.lr_scheduler import LRScheduler, OneCycleLR from transformers.trainer import Trainer from axolotl.integrations.base import PluginManager +from axolotl.utils.logging import get_logger from axolotl.utils.schedulers import ( + JaggedLRRestartScheduler, RexLR, get_cosine_schedule_with_min_lr, get_cosine_schedule_with_quadratic_warmup, get_cosine_schedule_with_warmup_decay_constant, ) -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) class SchedulerMixin(Trainer): @@ -80,13 +80,15 @@ class SchedulerMixin(Trainer): self.lr_scheduler = RexLR( optimizer=optimizer, max_lr=self.args.learning_rate, - min_lr=0 if not use_cosine_min_lr else (self.args.learning_rate * self.args.cosine_min_lr_ratio), + min_lr=0 if not use_cosine_min_lr else ( + self.args.learning_rate * self.args.cosine_min_lr_ratio), total_steps=num_training_steps, num_warmup_steps=self.args.get_warmup_steps(num_training_steps), ) elif use_cosine_quadratic: if use_cosine_min_lr: - LOG.warning("Both cosine quadratic warmup and min lr detected. Using quadratic warmup.") + LOG.warning( + "Both cosine quadratic warmup and min lr detected. Using quadratic warmup.") self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup( # pylint: disable=attribute-defined-outside-init optimizer, @@ -112,12 +114,32 @@ class SchedulerMixin(Trainer): min_lr_ratio=self.args.cosine_min_lr_ratio, ) else: - return super().create_scheduler(num_training_steps, optimizer=optimizer) + super().create_scheduler(num_training_steps, optimizer=optimizer) else: if use_cosine_quadratic: - LOG.warning("axolotl's cosine scheduler with quadratic warmup not used (e.g., because of deepspeed).") + LOG.warning( + "axolotl's cosine scheduler with quadratic warmup not used (e.g., because of deepspeed).") if use_cosine_min_lr: - LOG.warning("axolotl's cosine scheduler with min lr not used (e.g., because of deepspeed).") + LOG.warning( + "axolotl's cosine scheduler with min lr not used (e.g., because of deepspeed).") + + if self.args.jagged_restart_steps: + warmup_steps = ( + self.args.jagged_restart_warmup_steps or 10 + ) + anneal_steps = ( + self.args.jagged_restart_anneal_steps or 1 + ) + if not self.lr_scheduler: + super().create_scheduler(num_training_steps, optimizer) + self.lr_scheduler = JaggedLRRestartScheduler( # pylint: disable=attribute-defined-outside-init + optimizer, + self.lr_scheduler, + self.args.jagged_restart_steps, + warmup_steps, + anneal_steps, + min_lr_scale=self.args.cosine_min_lr_ratio or 0.001, + ) return self.lr_scheduler # type: ignore diff --git a/src/axolotl/core/trainers/relora.py b/src/axolotl/core/trainers/relora.py deleted file mode 100644 index 890278f49..000000000 --- a/src/axolotl/core/trainers/relora.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Module for ReLoRA trainer""" - -import torch -from torch.optim.lr_scheduler import LRScheduler - -from axolotl.core.trainers.base import AxolotlTrainer -from axolotl.monkeypatch.relora import ReLoRAScheduler - - -class ReLoRATrainer(AxolotlTrainer): - """Trainer subclass that uses the `OneCycleLR` scheduler""" - - tag_names = ["axolotl", "relora"] - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.lr_scheduler = None - - def create_scheduler( - self, - num_training_steps: int, - optimizer: torch.optim.Optimizer | None = None, - ) -> LRScheduler: - optimizer = self.optimizer if optimizer is None else optimizer - lr_scheduler: LRScheduler = super().create_scheduler( - num_training_steps, optimizer - ) - - if 
self.args.relora_steps: - warmup_steps = ( - self.args.relora_warmup_steps if self.args.relora_warmup_steps else 10 - ) - anneal_steps = ( - self.args.relora_anneal_steps if self.args.relora_anneal_steps else 1 - ) - self.lr_scheduler = ReLoRAScheduler( # type: ignore - optimizer, - lr_scheduler, - self.args.relora_steps, - anneal_steps, - warmup_steps, - ) - else: - self.lr_scheduler = lr_scheduler # type: ignore - - return self.lr_scheduler # type: ignore diff --git a/src/axolotl/core/trainers/trl.py b/src/axolotl/core/trainers/trl.py index b2c5c54ca..c5f19a6fe 100644 --- a/src/axolotl/core/trainers/trl.py +++ b/src/axolotl/core/trainers/trl.py @@ -1,161 +1,41 @@ -"""Module for TRL PPO trainer""" +"""Module for TRL RL trainers""" -from typing import Literal, Union - -import torch -from tqdm import tqdm from trl import ( CPOTrainer, KTOTrainer, ORPOTrainer, - PPOTrainer, PRMTrainer, RewardTrainer, ) -from axolotl.core.trainers.mixins import RngLoaderMixin +from axolotl.core.trainers.mixins import DistributedParallelMixin, RngLoaderMixin +from axolotl.core.trainers.mixins.optimizer import OptimizerInitMixin, OptimizerMixin from axolotl.core.trainers.mixins.scheduler import SchedulerMixin -class TRLPPOTrainer(PPOTrainer): - """Wrapper for TRL PPO trainer to handle customizations""" - - tag_names = ["axolotl", "ppo"] - - def train( - self, - reward_pipe, - resume_from_checkpoint=None, # pylint: disable=unused-argument - ): - generation_kwargs = { - "min_length": -1, - "top_k": 0.0, - "top_p": 1.0, - "do_sample": True, - "pad_token_id": self.tokenizer.eos_token_id, - "max_new_tokens": 32, - } - sent_kwargs = { - "return_all_scores": True, - "function_to_apply": "none", - "batch_size": 16, - } - - for _, batch in tqdm(enumerate(self.dataloader)): - query_tensors = batch["input_ids"] - - # generate model response - response_tensors, ref_response_tensors = self.generate( - query_tensors, - return_prompt=False, - generate_ref_response=True, - **generation_kwargs, - ) - batch["response"] = self.tokenizer.batch_decode(response_tensors) - batch["ref_response"] = self.tokenizer.batch_decode(ref_response_tensors) - - # Compute sentiment score - texts = [q + r for q, r in zip(batch["query"], batch["response"])] - pipe_outputs = reward_pipe(texts, **sent_kwargs) - rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs] - ref_texts = [q + r for q, r in zip(batch["query"], batch["ref_response"])] - ref_pipe_outputs = reward_pipe(ref_texts, **sent_kwargs) - ref_rewards = [ - torch.tensor(output[1]["score"]) for output in ref_pipe_outputs - ] - batch["ref_rewards"] = ref_rewards - - # Run PPO step - stats = self.step(query_tensors, response_tensors, rewards) - self.log_stats( - stats, - batch, - rewards, - columns_to_log=["query", "response", "ref_response", "ref_rewards"], - ) - - -class AxolotlORPOTrainer(RngLoaderMixin, SchedulerMixin, ORPOTrainer): +class AxolotlORPOTrainer( + RngLoaderMixin, + SchedulerMixin, + OptimizerMixin, + OptimizerInitMixin, + DistributedParallelMixin, + ORPOTrainer, +): """ Extend the base ORPOTrainer for axolotl helpers """ tag_names = ["axolotl", "orpo"] - def get_batch_loss_metrics( - self, - model, - batch: dict[str, Union[list, torch.LongTensor]], - train_eval: Literal["train", "eval"] = "train", - ): - """Compute the ORPO loss and other metrics for the given batch of inputs for train or test.""" - # TODO remove once https://github.com/huggingface/trl/pull/3069 is included in a trl release - - metrics = {} - - forward_output = 
self.concatenated_forward(model, batch) - ( - policy_chosen_logps, - policy_rejected_logps, - policy_chosen_logits, - policy_rejected_logits, - policy_nll_loss, - ) = forward_output[:5] - if self.aux_loss_enabled: - aux_loss = forward_output[5] - - losses, chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen = ( - self.odds_ratio_loss(policy_chosen_logps, policy_rejected_logps) - ) - # full ORPO loss - loss = policy_nll_loss - losses.mean() - - reward_accuracies = (chosen_rewards > rejected_rewards).float() - - prefix = "eval_" if train_eval == "eval" else "" - metrics[f"{prefix}rewards/chosen"] = self.accelerator.gather_for_metrics( - chosen_rewards - ).mean() - metrics[f"{prefix}rewards/rejected"] = self.accelerator.gather_for_metrics( - rejected_rewards - ).mean() - metrics[f"{prefix}rewards/accuracies"] = self.accelerator.gather_for_metrics( - reward_accuracies - ).mean() - metrics[f"{prefix}rewards/margins"] = self.accelerator.gather_for_metrics( - chosen_rewards - rejected_rewards - ).mean() - metrics[f"{prefix}logps/rejected"] = ( - self.accelerator.gather_for_metrics(policy_rejected_logps).detach().mean() - ) - metrics[f"{prefix}logps/chosen"] = ( - self.accelerator.gather_for_metrics(policy_chosen_logps).detach().mean() - ) - metrics[f"{prefix}logits/rejected"] = self.accelerator.gather_for_metrics( - policy_rejected_logits.detach().mean() - ).mean() - metrics[f"{prefix}logits/chosen"] = self.accelerator.gather_for_metrics( - policy_chosen_logits.detach().mean() - ).mean() - metrics[f"{prefix}nll_loss"] = ( - self.accelerator.gather_for_metrics(policy_nll_loss).detach().mean() - ) - metrics[f"{prefix}log_odds_ratio"] = ( - self.accelerator.gather_for_metrics(log_odds_ratio).detach().mean() - ) - metrics[f"{prefix}log_odds_chosen"] = ( - self.accelerator.gather_for_metrics(log_odds_chosen).detach().mean() - ) - for k, v in metrics.items(): - metrics[k] = v.item() - if self.aux_loss_enabled: - loss += self.aux_loss_coef * aux_loss - - return loss, metrics - - -class AxolotlKTOTrainer(RngLoaderMixin, SchedulerMixin, KTOTrainer): +class AxolotlKTOTrainer( + RngLoaderMixin, + SchedulerMixin, + OptimizerMixin, + OptimizerInitMixin, + DistributedParallelMixin, + KTOTrainer, +): """ Extend the base KTOTrainer for axolotl helpers """ @@ -163,89 +43,29 @@ class AxolotlKTOTrainer(RngLoaderMixin, SchedulerMixin, KTOTrainer): tag_names = ["axolotl", "kto"] -class AxolotlCPOTrainer(RngLoaderMixin, SchedulerMixin, CPOTrainer): +class AxolotlCPOTrainer( + RngLoaderMixin, + SchedulerMixin, + OptimizerMixin, + OptimizerInitMixin, + DistributedParallelMixin, + CPOTrainer, +): """ Extend the base CPOTrainer for axolotl helpers """ tag_names = ["axolotl", "cpo"] - def get_batch_loss_metrics( - self, - model, - batch: dict[str, Union[list, torch.LongTensor]], - train_eval: Literal["train", "eval"] = "train", - ): - """Compute the CPO loss and other metrics for the given batch of inputs for train or test.""" - metrics = {} - forward_output = self.concatenated_forward(model, batch) - ( - policy_chosen_logps, - policy_rejected_logps, - policy_chosen_logits, - policy_rejected_logits, - policy_nll_loss, - ) = forward_output[:5] - if self.aux_loss_enabled: - aux_loss = forward_output[5] - - losses, chosen_rewards, rejected_rewards = self.cpo_loss( - policy_chosen_logps, - policy_rejected_logps, - ) - - loss = losses.mean() + self.cpo_alpha * policy_nll_loss - reward_accuracies = (chosen_rewards > rejected_rewards).float() - - prefix = "eval_" if train_eval == "eval" else "" - 
metrics[f"{prefix}rewards/chosen"] = ( - self.accelerator.gather_for_metrics(chosen_rewards).mean().item() - ) - metrics[f"{prefix}rewards/rejected"] = ( - self.accelerator.gather_for_metrics(rejected_rewards).mean().item() - ) - metrics[f"{prefix}rewards/accuracies"] = ( - self.accelerator.gather_for_metrics(reward_accuracies).mean().item() - ) - metrics[f"{prefix}rewards/margins"] = ( - self.accelerator.gather_for_metrics(chosen_rewards - rejected_rewards) - .mean() - .item() - ) - metrics[f"{prefix}logps/rejected"] = ( - self.accelerator.gather_for_metrics(policy_rejected_logps) - .detach() - .mean() - .item() - ) - metrics[f"{prefix}logps/chosen"] = ( - self.accelerator.gather_for_metrics(policy_chosen_logps) - .detach() - .mean() - .item() - ) - metrics[f"{prefix}logits/rejected"] = ( - self.accelerator.gather_for_metrics(policy_rejected_logits.detach().mean()) - .mean() - .item() - ) - metrics[f"{prefix}logits/chosen"] = ( - self.accelerator.gather_for_metrics(policy_chosen_logits.detach().mean()) - .mean() - .item() - ) - metrics[f"{prefix}nll_loss"] = ( - self.accelerator.gather_for_metrics(policy_nll_loss).detach().mean().item() - ) - - if self.aux_loss_enabled: - loss += self.aux_loss_coef * aux_loss - - return loss, metrics - - -class AxolotlRewardTrainer(RngLoaderMixin, SchedulerMixin, RewardTrainer): +class AxolotlRewardTrainer( + RngLoaderMixin, + SchedulerMixin, + OptimizerMixin, + OptimizerInitMixin, + DistributedParallelMixin, + RewardTrainer, +): """ Extend the base RewardTrainer for axolotl helpers """ @@ -253,7 +73,14 @@ class AxolotlRewardTrainer(RngLoaderMixin, SchedulerMixin, RewardTrainer): tag_names = ["axolotl", "reward"] -class AxolotlPRMTrainer(RngLoaderMixin, SchedulerMixin, PRMTrainer): +class AxolotlPRMTrainer( + RngLoaderMixin, + SchedulerMixin, + OptimizerMixin, + OptimizerInitMixin, + DistributedParallelMixin, + PRMTrainer, +): """ Extend the base trl.PRMTrainer for axolotl helpers """ diff --git a/src/axolotl/core/training_args.py b/src/axolotl/core/training_args.py index 9c93f77c7..d5be9fc62 100644 --- a/src/axolotl/core/training_args.py +++ b/src/axolotl/core/training_args.py @@ -2,244 +2,17 @@ extra axolotl specific training args """ -from dataclasses import dataclass, field -from typing import Optional +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Optional, Type -from PIL.Image import Resampling from transformers import TrainingArguments from trl import CPOConfig, KTOConfig, ORPOConfig, PRMConfig, RewardConfig +from axolotl.integrations.config import merge_training_args -@dataclass -class AxolotlTrainingMixins: - """ - Mixin class for the Axolotl training args. - """ - - # pylint: disable=duplicate-code - model_type: Optional[str] = field( - default=None, metadata={"help": "HF model configuration model_type."} - ) - lr_quadratic_warmup: bool = field( - default=False, - metadata={"help": "Use quadratic warmup for cosine scheduling."}, - ) - pretraining: bool = field( - default=False, - metadata={ - "help": "Indicates to trainer whether we are doing continued pretraining." - }, - ) - sample_packing: bool = field( - default=False, - metadata={"help": "Use sample packing for efficient training."}, - ) - sample_packing_sequentially: bool = field( - default=False, - metadata={ - "help": "Use next-fit sample packing that preserves the order of samples coming from the sampler. Use in combination with curriculum_sampling for fully sequential packing." 
- }, - ) - multipack_real_batches: bool = field( - default=False, - metadata={"help": "Use real batches for efficient training."}, - ) - eval_sample_packing: Optional[bool] = field( - default=None, - metadata={"help": "Use sample packing for efficient evals."}, - ) - sample_packing_efficiency: float = field( - default=1.0, - metadata={"help": "Sample packing efficiency for calculating batch length."}, - ) - sample_packing_bin_size: int = field( - default=200, - metadata={ - "help": "The max number of samples that packed sample can contain after packing. Increase for better packing." - }, - ) - sample_packing_group_size: int = field( - default=100000, - metadata={ - "help": "The number of samples to group together for packing. Increase for better packing." - }, - ) - max_seq_length: int = field( - default=2048, - metadata={"help": "The maximum sequence length the model can handle"}, - ) - relora_steps: Optional[int] = field( - default=None, - metadata={"help": "how often to reset for ReLoRA"}, - ) - relora_warmup_steps: Optional[int] = field( - default=None, - metadata={"help": "how many warmup steps to take after reset for ReLoRA"}, - ) - relora_anneal_steps: Optional[int] = field( - default=None, - metadata={"help": "how many warmup steps to take after reset for ReLoRA"}, - ) - relora_prune_ratio: Optional[float] = field( - default=0.9, - metadata={"help": "prune ratio for magnitude pruning of the optimizer"}, - ) - bench_split: Optional[str] = field( - default="eval", metadata={"help": "The benchmark split to run on"} - ) - bench_dataset: Optional[str] = field( - default="pharaouk/dharma-1/dharma_1_mini.json", - metadata={ - "help": "Benchmark dataset to use: options are `mmlu-zs`, `mmlu-fs`, or the full path to the dataset file" - }, - ) - do_bench_eval: Optional[bool] = field( - default=False, metadata={"help": "Whether to run the Benchmark evaluation."} - ) - do_causal_lm_eval: Optional[bool] = field( - default=False, metadata={"help": "Whether to run the Causal LM evaluation."} - ) - max_bench_samples: Optional[int] = field( - default=None, - metadata={ - "help": "If set, only evaluates on `max_bench_samples` of the benchmark dataset." 
- }, - ) - bench_source_max_len: int = field( - default=2048, metadata={"help": "Maximum source sequence length for bench."} - ) - dataloader_prefetch_factor: Optional[int] = field( - default=None, - metadata={"help": "prefetch_factor argument to the dataloader"}, - ) - cosine_min_lr_ratio: Optional[float] = field( - default=None, - metadata={"help": "Minimum learning rate is min_lr_ratio * learning_rate"}, - ) - cosine_constant_lr_ratio: Optional[float] = field( - default=None, - metadata={ - "help": "Starting constant learning rate step is cosine_constant_lr_ratio * max_steps" - }, - ) - loraplus_lr_ratio: Optional[float] = field( - default=None, metadata={"help": "loraplus learning rate ratio lr_B / lr_A."} - ) - loraplus_lr_embedding: Optional[float] = field( - default=1e-6, - metadata={"help": "loraplus learning rate for lora embedding layers."}, - ) - embedding_lr_scale: Optional[float] = field( - default=None, - metadata={"help": "Scale the learning rate for the embedding layers."}, - ) - lr_groups: Optional[list[dict]] = field( - default=None, - metadata={"help": "Specify learning rate groups for with different LRs."}, - ) - embedding_lr: Optional[float] = field( - default=None, - metadata={"help": "absolute learning rate for the embedding layers."}, - ) - qlora: bool = field( - default=False, - metadata={"help": "whether this is a qlora training"}, - ) - orpo_alpha: Optional[float] = field( - default=None, - ) - lisa_n_layers: Optional[int] = field( - default=None, - metadata={"help": "the number of activate layers in LISA"}, - ) - lisa_step_interval: Optional[int] = field( - default=None, - metadata={"help": "how often to switch layers in LISA"}, - ) - lisa_layers_attribute: Optional[str] = field( - default=None, - metadata={"help": "path under the model to access the layers"}, - ) - curriculum_sampling: Optional[bool] = field( - default=None, - metadata={"help": "whether to use sequential sampling for curriculum learning"}, - ) - alternate_optimizer: Optional[str] = field( - default=None, - metadata={ - "help": "workaround to pass an alternate optimizer to the HF trainer" - }, - ) - alternate_lr_scheduler_type: Optional[str] = field( - default=None, - metadata={ - "help": "workaround to pass an alternate lr scheduler to the HF trainer" - }, - ) - chat_template: Optional[str] = field( - default=None, - metadata={"help": "Chat template converting chat messages to text"}, - ) - - kd_ce_alpha: Optional[float] = field( - default=None, - metadata={ - "help": "The alpha scaling parameter for SFT cross entropy loss when using KD" - }, - ) - - kd_alpha: Optional[float] = field( - default=1.0, - metadata={"help": "The alpha scaling parameter for KD loss"}, - ) - - kd_temperature: Optional[float] = field( - default=1.0, - metadata={ - "help": "the temperature parameter for KL divergence loss when using KD" - }, - ) - - kd_zscore_base_temp: Optional[float] = field( - default=None, - metadata={ - "help": "the base temperature parameter for KL divergence with z-score when using KD" - }, - ) - - kd_top_k_before_softmax: Optional[bool] = field( - default=None, - metadata={ - "help": "Whether to apply top_k_before_softmax to the logits when using KD" - }, - ) - - adam_beta3: Optional[float] = field( - default=None, - metadata={ - "help": "The beta3 hyperparameter used in some optimizers such as CAME" - }, - ) - adam_epsilon2: Optional[float] = field( - default=None, - metadata={ - "help": "The epsilon2 hyperparameter used in some optimizers such as CAME" - }, - ) - - # multi-modal section - - 
image_size: int | tuple[int, int] | None = field( - default=None, - metadata={"help": "The size of the image to resize to"}, - ) - - image_resize_algorithm: Resampling | None = field( - default=None, - metadata={"help": "The algorithm to use for image resizing"}, - ) - - # end of multi-modal section +AxolotlTrainingMixins: Type = merge_training_args() @dataclass diff --git a/src/axolotl/core/training_args_base.py b/src/axolotl/core/training_args_base.py new file mode 100644 index 000000000..fd0859ae9 --- /dev/null +++ b/src/axolotl/core/training_args_base.py @@ -0,0 +1,260 @@ +""" +Base Axolotl Training Mixins shared across various trainer configs +""" + +from dataclasses import dataclass, field +from typing import Optional + +from PIL.Image import Resampling + + +@dataclass +class AxolotlTrainingMixins: + """ + Mixin class for the Axolotl training args. + """ + + # pylint: disable=duplicate-code + model_type: Optional[str] = field( + default=None, metadata={"help": "HF model configuration model_type."} + ) + lr_quadratic_warmup: bool = field( + default=False, + metadata={"help": "Use quadratic warmup for cosine scheduling."}, + ) + pretraining: bool = field( + default=False, + metadata={ + "help": "Indicates to trainer whether we are doing continued pretraining." + }, + ) + sample_packing: bool = field( + default=False, + metadata={"help": "Use sample packing for efficient training."}, + ) + sample_packing_sequentially: bool = field( + default=False, + metadata={ + "help": "Use next-fit sample packing that preserves the order of samples coming from the sampler. Use in combination with curriculum_sampling for fully sequential packing." + }, + ) + sample_packing_mp_start_method: str | None = field( + default=None, + metadata={"help": "The multiprocessing start method to use."}, + ) + sample_packing_drop_attention_mask: bool = field( + default=False, + metadata={"help": "Drop attention mask from inputs when using packing."}, + ) + multipack_real_batches: bool = field( + default=False, + metadata={"help": "Use real batches for efficient training."}, + ) + eval_sample_packing: Optional[bool] = field( + default=None, + metadata={"help": "Use sample packing for efficient evals."}, + ) + sample_packing_efficiency: float = field( + default=1.0, + metadata={"help": "Sample packing efficiency for calculating batch length."}, + ) + sample_packing_bin_size: int = field( + default=200, + metadata={ + "help": "The max number of samples that packed sample can contain after packing. Increase for better packing." + }, + ) + sample_packing_group_size: int = field( + default=100000, + metadata={ + "help": "The number of samples to group together for packing. Increase for better packing." 
+ }, + ) + max_seq_length: int = field( + default=2048, + metadata={"help": "The maximum sequence length the model can handle"}, + ) + dataset_num_proc: int | None = field( + default=None, + metadata={"help": "The number of processes to use for data processing"}, + ) + relora_steps: Optional[int] = field( + default=None, + metadata={"help": "how often to reset for ReLoRA"}, + ) + relora_prune_ratio: Optional[float] = field( + default=0.9, + metadata={"help": "prune ratio for magnitude pruning of the optimizer"}, + ) + jagged_restart_steps: Optional[int] = field( + default=None, + metadata={"help": "how often to reset for jagged restarts"}, + ) + jagged_restart_warmup_steps: Optional[int] = field( + default=None, + metadata={ + "help": "how many warmup steps to take after reset for jagged restarts" + }, + ) + jagged_restart_anneal_steps: Optional[int] = field( + default=None, + metadata={ + "help": "how many anneal steps to take before reset for jagged restarts" + }, + ) + bench_split: Optional[str] = field( + default="eval", metadata={"help": "The benchmark split to run on"} + ) + bench_dataset: Optional[str] = field( + default="pharaouk/dharma-1/dharma_1_mini.json", + metadata={ + "help": "Benchmark dataset to use: options are `mmlu-zs`, `mmlu-fs`, or the full path to the dataset file" + }, + ) + do_bench_eval: Optional[bool] = field( + default=False, metadata={"help": "Whether to run the Benchmark evaluation."} + ) + do_causal_lm_eval: Optional[bool] = field( + default=False, metadata={"help": "Whether to run the Causal LM evaluation."} + ) + max_bench_samples: Optional[int] = field( + default=None, + metadata={ + "help": "If set, only evaluates on `max_bench_samples` of the benchmark dataset." + }, + ) + bench_source_max_len: int = field( + default=2048, metadata={"help": "Maximum source sequence length for bench."} + ) + dataloader_prefetch_factor: Optional[int] = field( + default=None, + metadata={"help": "prefetch_factor argument to the dataloader"}, + ) + cosine_min_lr_ratio: Optional[float] = field( + default=None, + metadata={"help": "Minimum learning rate is min_lr_ratio * learning_rate"}, + ) + cosine_constant_lr_ratio: Optional[float] = field( + default=None, + metadata={ + "help": "Starting constant learning rate step is cosine_constant_lr_ratio * max_steps" + }, + ) + loraplus_lr_ratio: Optional[float] = field( + default=None, metadata={"help": "loraplus learning rate ratio lr_B / lr_A."} + ) + loraplus_lr_embedding: Optional[float] = field( + default=1e-6, + metadata={"help": "loraplus learning rate for lora embedding layers."}, + ) + embedding_lr_scale: Optional[float] = field( + default=None, + metadata={"help": "Scale the learning rate for the embedding layers."}, + ) + lr_groups: Optional[list[dict]] = field( + default=None, + metadata={"help": "Specify learning rate groups for with different LRs."}, + ) + embedding_lr: Optional[float] = field( + default=None, + metadata={"help": "absolute learning rate for the embedding layers."}, + ) + qlora: bool = field( + default=False, + metadata={"help": "whether this is a qlora training"}, + ) + orpo_alpha: Optional[float] = field( + default=None, + ) + lisa_n_layers: Optional[int] = field( + default=None, + metadata={"help": "the number of activate layers in LISA"}, + ) + lisa_step_interval: Optional[int] = field( + default=None, + metadata={"help": "how often to switch layers in LISA"}, + ) + lisa_layers_attribute: Optional[str] = field( + default=None, + metadata={"help": "path under the model to access the layers"}, + ) + 
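For orientation only (not part of this patch): the mixin above is intended to be combined with the Hugging Face `TrainingArguments` through plain dataclass inheritance, which is how the merged `AxolotlTrainingMixins` returned by `merge_training_args()` is consumed. A minimal, hedged sketch follows; the `ComposedTrainingArguments` name is hypothetical and chosen only for illustration.

```python
from dataclasses import dataclass

from transformers import TrainingArguments

from axolotl.core.training_args_base import AxolotlTrainingMixins


@dataclass
class ComposedTrainingArguments(AxolotlTrainingMixins, TrainingArguments):
    """Hypothetical composition; the real class is assembled elsewhere in axolotl."""


# Mixin fields and stock TrainingArguments fields are set side by side.
args = ComposedTrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=2,   # from TrainingArguments
    sample_packing=True,             # from AxolotlTrainingMixins
    max_seq_length=4096,             # from AxolotlTrainingMixins
)
```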
curriculum_sampling: Optional[bool] = field( + default=None, + metadata={"help": "whether to use sequential sampling for curriculum learning"}, + ) + alternate_lr_scheduler_type: Optional[str] = field( + default=None, + metadata={ + "help": "workaround to pass an alternate lr scheduler to the HF trainer" + }, + ) + chat_template: Optional[str] = field( + default=None, + metadata={"help": "Chat template converting chat messages to text"}, + ) + + # kd_ce_alpha: Optional[float] = field( + # default=None, + # metadata={ + # "help": "The alpha scaling parameter for SFT cross entropy loss when using KD" + # }, + # ) + # + # kd_alpha: Optional[float] = field( + # default=1.0, + # metadata={"help": "The alpha scaling parameter for KD loss"}, + # ) + # + # kd_temperature: Optional[float] = field( + # default=1.0, + # metadata={ + # "help": "the temperature parameter for KL divergence loss when using KD" + # }, + # ) + + adam_beta3: Optional[float] = field( + default=None, + metadata={ + "help": "The beta3 hyperparameter used in some optimizers such as CAME" + }, + ) + adam_epsilon2: Optional[float] = field( + default=None, + metadata={ + "help": "The epsilon2 hyperparameter used in some optimizers such as CAME" + }, + ) + + activation_offloading: bool | None = field( + default=None, + metadata={"help": "Use activation offloading with CUDA streams for training."}, + ) + + # multi-modal section + + image_size: int | tuple[int, int] | None = field( + default=None, + metadata={"help": "The size of the image to resize to"}, + ) + + image_resize_algorithm: Resampling | None = field( + default=None, + metadata={"help": "The algorithm to use for image resizing"}, + ) + + # end of multi-modal section + + dion_learning_rate: float | None = field( + default=None, + metadata={"help": "The learning rate for Dion"}, + ) + dion_momentum: float | None = field( + default=None, + metadata={"help": "The momentum for Dion"}, + ) + dion_rank_fraction: float | None = field( + default=None, + ) + dion_rank_multiple_of: int | None = field( + default=None, + ) diff --git a/src/axolotl/datasets.py b/src/axolotl/datasets.py index 143928019..c9d006ac8 100644 --- a/src/axolotl/datasets.py +++ b/src/axolotl/datasets.py @@ -1,12 +1,10 @@ """Module containing Dataset functionality""" -import logging -import os -from typing import List, Optional, Union - import torch from datasets import Dataset, IterableDataset +from axolotl.utils.logging import get_logger + from .prompt_tokenizers import PromptTokenizingStrategy # We want this to be a wrapper for an existing dataset that we have loaded @@ -15,25 +13,25 @@ from .prompt_tokenizers import PromptTokenizingStrategy # let's check to ensure we don't truncate an item in the middle, we'll use # the collators later on to pad the datasets -LOG = logging.getLogger("axolotl") +LOG = get_logger(__name__) class TokenizedPromptDataset(Dataset): - """ - Dataset that returns tokenized prompts from a stream of text files. - Args: - prompt_tokenizer (PromptTokenizingStrategy): The prompt tokenizing method for processing the data. - dataset (dataset.Dataset): Dataset with text files. - process_count (int): Number of processes to use for tokenizing. - keep_in_memory (bool): Whether to keep the tokenized dataset in memory. + """Dataset that returns tokenized prompts from a stream of text files. + + Args: + prompt_tokenizer: The prompt tokenizing method for processing the data. + dataset: Dataset with text files. + process_count: Number of processes to use for tokenizing. 
+ keep_in_memory: Whether to keep the tokenized dataset in memory. """ def __init__( # pylint: disable=super-init-not-called self, prompt_tokenizer: PromptTokenizingStrategy, dataset: Dataset, - process_count: Optional[int] = None, - keep_in_memory: Optional[bool] = False, + process_count: int | None = None, + keep_in_memory: bool | None = False, **kwargs, ): self.prompt_tokenizer = prompt_tokenizer @@ -46,7 +44,6 @@ class TokenizedPromptDataset(Dataset): def process(self, dataset): features = dataset.features.keys() - num_proc = min(64, self.process_count if self.process_count else os.cpu_count()) map_kwargs = {} if self.prompt_tokenizer.supports_batched: @@ -59,13 +56,13 @@ class TokenizedPromptDataset(Dataset): ): dataset = dataset.filter( self.prompt_tokenizer.filter_rows, - num_proc=num_proc, + num_proc=self.process_count, desc="Strategy Filtering Rows", ) return dataset.map( self.prompt_tokenizer.tokenize_prompt, - num_proc=num_proc, + num_proc=self.process_count, remove_columns=features, keep_in_memory=self.keep_in_memory, desc="Tokenizing Prompts", @@ -75,14 +72,14 @@ class TokenizedPromptDataset(Dataset): def wrap_dataset_for_tokenized_prompt( prompt_tokenizer: PromptTokenizingStrategy, - dataset: Union[Dataset, IterableDataset], + dataset: Dataset | IterableDataset, **kwargs, ): if isinstance(dataset, IterableDataset): map_kwargs = {} if prompt_tokenizer.supports_batched: map_kwargs["batched"] = True - features = dataset.features.keys() + features = list(dataset.features.keys()) return dataset.map( prompt_tokenizer.tokenize_prompt, remove_columns=features, @@ -93,12 +90,13 @@ def wrap_dataset_for_tokenized_prompt( # TODO this isn't the best since it can't interleave datasets class ConstantLengthDataset(IterableDataset): - """ - Iterable dataset that returns constant length chunks of tokens from stream of text files. - Args: - tokenizer (Tokenizer): The processor used for processing the data. - dataset (dataset.Dataset): Dataset with text files. - seq_length (int): Length of token sequences to return. + """Iterable dataset that returns constant length chunks of tokens from stream of + text files. + + Args: + tokenizer: The processor used for processing the data. + dataset: Dataset with text files. + seq_length: Length of token sequences to return. 
""" def __init__( # pylint: disable=super-init-not-called @@ -109,7 +107,7 @@ class ConstantLengthDataset(IterableDataset): ): self.tokenizer = tokenizer self.concat_token_id = tokenizer.eos_token_id - self.datasets: List[IterableDataset] = datasets + self.datasets: list[IterableDataset] = datasets self.seq_length = seq_length vocab_size = len(tokenizer.get_vocab()) @@ -173,7 +171,10 @@ class ConstantLengthDataset(IterableDataset): } else: LOG.warning( - f"dropping batch due to tensor size mismatch input_ids: {input_ids.size()}, labels: {labels.size()}, attention_mask: {attention_mask.size()}" + "Dropping batch due to tensor size mismatch " + f"input_ids: {input_ids.size()}, " + f"labels: {labels.size()}, " + f"attention_mask: {attention_mask.size()}" ) buffer = { "input_ids": [], diff --git a/src/axolotl/evaluate.py b/src/axolotl/evaluate.py index 6d6813730..2b5869939 100644 --- a/src/axolotl/evaluate.py +++ b/src/axolotl/evaluate.py @@ -7,7 +7,6 @@ from pathlib import Path from typing import Dict, Optional import torch -from accelerate.logging import get_logger from datasets import Dataset from transformers.trainer import Trainer @@ -17,6 +16,7 @@ from axolotl.train import ( ) from axolotl.utils.dict import DictDefault from axolotl.utils.distributed import cleanup_distributed +from axolotl.utils.logging import get_logger from axolotl.utils.trainer import setup_trainer project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) diff --git a/src/axolotl/integrations/base.py b/src/axolotl/integrations/base.py index eb2b29cbe..94ee8d4b1 100644 --- a/src/axolotl/integrations/base.py +++ b/src/axolotl/integrations/base.py @@ -22,15 +22,20 @@ from __future__ import annotations import collections import importlib -import logging +import traceback from typing import TYPE_CHECKING, Callable, OrderedDict, Union from peft import PeftModel +from torch import nn from torch.optim import Optimizer from torch.optim.lr_scheduler import LRScheduler from transformers import PreTrainedModel, Trainer +from transformers.trainer_pt_utils import get_parameter_names from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) if TYPE_CHECKING: from axolotl.common.datasets import TrainDatasetMeta @@ -71,8 +76,8 @@ class BasePlugin: def __init__(self): """Initializes the BasePlugin.""" - def register(self, cfg: DictDefault): # pylint: disable=unused-argument - """Registers the plugin with the given configuration. + def register(self, cfg: dict): # pylint: disable=unused-argument + """Registers the plugin with the given configuration as an unparsed dict. Args: cfg: The configuration for the plugin. @@ -81,6 +86,11 @@ class BasePlugin: def get_input_args(self) -> str | None: """Returns a pydantic model for the plugin's input arguments.""" + def get_training_args_mixin(self) -> str | None: + """ + Returns a dataclass model for the plugin's training arguments. + """ + def load_datasets( self, cfg: DictDefault, preprocess: bool = False ) -> Union["TrainDatasetMeta", None]: @@ -156,6 +166,31 @@ class BasePlugin: trainer: The trainer object for training. """ + def get_training_args(self, cfg: DictDefault): # pylint: disable=unused-argument): + """ + Returns custom training arguments to set on TrainingArgs. + + Args: + cfg: The global axolotl configuration. + + Returns: + object: dict containing the training arguments. 
+ """ + + def get_collator_cls_and_kwargs( + self, cfg: DictDefault, is_eval: bool = False + ): # pylint: disable=unused-argument): + """ + Returns a custom class for the collator. + + Args: + cfg: The global axolotl configuration. + is_eval: Whether this is an eval split. + + Returns: + class: The class for the collator. + """ + # pylint: disable=unused-argument def create_optimizer(self, cfg: DictDefault, trainer: Trainer) -> Optimizer | None: """Creates and returns an optimizer for training. @@ -276,7 +311,7 @@ def load_plugin(plugin_name: str) -> BasePlugin: return plugin -class PluginManager: +class PluginManager: # pylint: disable=too-many-public-methods """The `PluginManager` class is responsible for loading and managing plugins. It should be a singleton so it can be accessed from anywhere in the codebase. @@ -331,12 +366,15 @@ class PluginManager: ImportError: If the plugin module cannot be imported. """ try: - logging.info(f"Attempting to load plugin: {plugin_name}") + LOG.info(f"Attempting to load plugin: {plugin_name}") plugin = load_plugin(plugin_name) self.plugins[plugin_name] = plugin - logging.info(f"Plugin loaded successfully: {plugin_name}") - except ImportError: - logging.error(f"Failed to load plugin: {plugin_name}") + LOG.info(f"Plugin loaded successfully: {plugin_name}") + except ImportError as exc: + LOG.error(f"Failed to load plugin: {plugin_name}") + # print stacktrace + traceback.print_exc() + print(f"Error: {exc}") def get_input_args(self) -> list[str]: """Returns a list of Pydantic classes for all registered plugins' input arguments.' @@ -351,6 +389,20 @@ class PluginManager: input_args.append(input_args_from_plugin) return input_args + def get_training_args_mixin(self): + """ + Returns a list of dataclasses for all registered plugins' training args mixins' + + Returns: + list[str]: A list of dataclsses + """ + training_args = [] + for plugin in self.plugins.values(): + training_args_from_plugin = plugin.get_training_args_mixin() + if training_args_from_plugin is not None: + training_args.append(training_args_from_plugin) + return training_args + def load_datasets( self, cfg: DictDefault, preprocess: bool = False ) -> Union["TrainDatasetMeta", None]: @@ -440,6 +492,42 @@ class PluginManager: return trainer_cls return None + def get_training_args(self, cfg): + """ + Calls the get_training_args method of all registered plugins and returns the combined training arguments. + + Parameters: + cfg (dict): The configuration for the plugins. + + Returns: + object: The training arguments + """ + training_args_kwargs = {} + for plugin in self.plugins.values(): + training_args = plugin.get_training_args(cfg) + if training_args is not None: + training_args_kwargs.update(training_args) + + return training_args_kwargs + + def get_collator_cls_and_kwargs(self, cfg, is_eval=False): + """ + Calls the get_collator_cls_and_kwargs method of all registered plugins and returns the first non-None collator class. + + Parameters: + cfg (dict): The configuration for the plugins. + is_eval (bool): Whether this is an eval split. + + Returns: + object: The collator class, or None if none was found. + """ + for plugin in self.plugins.values(): + collator = plugin.get_collator_cls_and_kwargs(cfg, is_eval=is_eval) + if collator is not None: + collator_cls, collator_kwargs = collator + return collator_cls, collator_kwargs + return None + def post_trainer_create(self, cfg: DictDefault, trainer: Trainer): """Calls the `post_trainer_create` method of all registered plugins. 
@@ -555,3 +643,24 @@ class BaseOptimizerFactory: self, opt_model, training_args, **optimizer_kwargs ) -> Optimizer | None: pass + + # duplicated from transformers + def get_decay_parameter_names(self, model) -> list[str]: + """ + Get all parameter names that weight decay will be applied to. + + This function filters out parameters in two ways: + 1. By layer type (instances of layers specified in ALL_LAYERNORM_LAYERS) + 2. By parameter name patterns (containing 'bias', or variation of 'norm') + """ + forbidden_name_patterns = [ + r"bias", + r"layernorm", + r"rmsnorm", + r"(?:^|\.)norm(?:$|\.)", + r"_norm(?:$|\.)", + ] + decay_parameters = get_parameter_names( + model, [nn.LayerNorm], forbidden_name_patterns + ) + return decay_parameters diff --git a/src/axolotl/integrations/config.py b/src/axolotl/integrations/config.py index b443f228e..f5fc07e9e 100644 --- a/src/axolotl/integrations/config.py +++ b/src/axolotl/integrations/config.py @@ -16,7 +16,7 @@ Module to handle merging the plugins' input arguments with the base configuratio This was moved here to prevent circular imports. """ -from typing import Any, Dict, List +from typing import Any, Dict, List, Type from axolotl.utils.schemas.config import ( AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase, @@ -61,3 +61,43 @@ def merge_input_args(): ] return AxolotlConfigWCapabilities, AxolotlInputConfig return AxolotlConfigWCapabilitiesBase, AxolotlInputConfigBase + + +def merge_training_args() -> Type: + """ + Merges training arguments from registered plugins with the base TrainingArguments. + + This function retrieves the training arguments from registered plugins using the PluginManager. + It then dynamically creates new classes, AxolotlTrainingMixins, + that inherit from the base configurations and include the training arguments from the plugins. + + Returns: + tuple: A tuple containing the newly created classes, AxolotlTrainingMixins. 
+ """ + # pylint: disable=duplicate-code + from axolotl.core.training_args_base import ( + AxolotlTrainingMixins as AxolotlTrainingMixinsBase, + ) + from axolotl.integrations.base import PluginManager + + plugin_manager = PluginManager.get_instance() + training_args_mixins: List[str] = plugin_manager.get_training_args_mixin() + mixin_classes = [] + dynamic_input = "" + for plugin_args in training_args_mixins: + plugin_module, plugin_cls = plugin_args.rsplit(".", 1) + dynamic_input += f"from {plugin_module} import {plugin_cls}\n" + mixin_classes.append(plugin_cls) + if dynamic_input: + dynamic_input += f"class AxolotlTrainingMixins(AxolotlTrainingMixinsBase, {', '.join(mixin_classes)}):\n pass\n" + + namespace: Dict[Any, Any] = {} + local_vars = {"AxolotlTrainingMixinsBase": AxolotlTrainingMixinsBase} + exec( # pylint: disable=exec-used # nosec B102 + dynamic_input, {**globals(), **local_vars}, namespace + ) + AxolotlTrainingMixins = namespace[ # pylint: disable=invalid-name + "AxolotlTrainingMixins" + ] + return AxolotlTrainingMixins + return AxolotlTrainingMixinsBase diff --git a/src/axolotl/integrations/cut_cross_entropy/README.md b/src/axolotl/integrations/cut_cross_entropy/README.md index 627ebd935..02e4e6686 100644 --- a/src/axolotl/integrations/cut_cross_entropy/README.md +++ b/src/axolotl/integrations/cut_cross_entropy/README.md @@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh - If you are installing from pip ```bash -pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@bad6f7b49c75fdec69471abb71b4cddd0f0c6438" +pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0ee9ee8" ``` ## Usage @@ -31,27 +31,40 @@ plugins: ## Supported Models -- llama -- llama4 -- llama4_text -- mllama -- phi3 +- arcee +- cohere +- cohere2 - gemma - gemma2 - gemma3 - gemma3_text +- gemma3n +- gemma3n_text +- glm +- glm4 +- gpt_oss +- granite +- granitemoe +- hunyuan_v1_dense +- hunyuan_v1_moe +- llama +- llama4 +- llama4_text - mistral - mistral3 +- mixtral +- mllama +- phi +- phi3 +- phi4_multimodal - qwen2 -- qwen2_moe - qwen2_vl +- qwen2_moe - qwen2_5_vl - qwen3 - qwen3_moe -- cohere -- cohere2 -- glm -- glm4 +- smollm3 +- voxtral ## Citation diff --git a/src/axolotl/integrations/cut_cross_entropy/__init__.py b/src/axolotl/integrations/cut_cross_entropy/__init__.py index 7420674fa..4689cc9a8 100644 --- a/src/axolotl/integrations/cut_cross_entropy/__init__.py +++ b/src/axolotl/integrations/cut_cross_entropy/__init__.py @@ -19,21 +19,22 @@ Cut Cross Entropy is an optimized implementation of cross entropy loss from Apple's ML team. """ import importlib -import logging +from functools import partial import torch from axolotl.integrations.base import BasePlugin from axolotl.utils import get_pytorch_version -from axolotl.utils.distributed import is_main_process +from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix +from axolotl.utils.logging import get_logger from .args import CutCrossEntropyArgs # pylint: disable=unused-import. 
# noqa: F401 -LOG = logging.getLogger("axolotl.integrations.cut_cross_entropy") +LOG = get_logger(__name__) _CCE_INSTALL_MESSAGE = ( - "Please install cut_cross_entropy with transformers support using " - '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@bad6f7b49c75fdec69471abb71b4cddd0f0c6438"`' + "Please install Axolotl's fork of cut_cross_entropy with transformers support using " + '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0ee9ee8"`' ) @@ -65,21 +66,78 @@ class CutCrossEntropyPlugin(BasePlugin): "cut_cross_entropy.transformers" ) if cce_spec_transformers is None: - raise ImportError(_CCE_INSTALL_MESSAGE) + raise ImportError( + "Transformers support is not installed. " + _CCE_INSTALL_MESSAGE + ) + + # Check if Axolotl's cce fork is installed + try: + from cut_cross_entropy.transformers.patch import AXOLOTL_CCE_FORK + + if not AXOLOTL_CCE_FORK: + raise ImportError + except ImportError as e: + raise ImportError( + "Axolotl's fork of cut_cross_entropy is not installed. " + + _CCE_INSTALL_MESSAGE + ) from e def pre_model_load(self, cfg): """Apply cut cross entropy before model loading if enabled.""" if cfg.cut_cross_entropy: self._check_requirements() + self.patch_llama_like(cfg.model_config_type) - from axolotl.integrations.cut_cross_entropy.monkeypatch.patch import ( - cce_patch, + from cut_cross_entropy.transformers.patch import cce_patch + + LOG.info( + f"Applying Cut Cross Entropy to model type: {cfg.model_config_type}" ) - if is_main_process(use_environ=True): - LOG.info( - f"Applying Cut Cross Entropy to model type: {cfg.model_config_type}" - ) - # The patch checks model_type internally cce_patch(cfg.model_config_type) + + def patch_llama_like( + self, + model_type: str, + ) -> None: + """ + Generic patch for model architectures with causal lm similar to llama + """ + from cut_cross_entropy.transformers.patch import PATCH_FNS + + def patch_generic( + maybe_model, patch_options, model_type: str + ): # pylint: disable=unused-argument + import cut_cross_entropy.transformers.llama + from cut_cross_entropy.transformers.llama import cce_forward + + try: + # Dynamically import the module and CausalLM class + module_path = f"transformers.models.{model_type}.modeling_{model_type}" + model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type) + module = __import__( + module_path, fromlist=[f"{model_cls_prefix}ForCausalLM"] + ) + model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM") + + cut_cross_entropy.transformers.llama._PATCH_OPTS = ( # pylint: disable=protected-access + patch_options + ) + + model_cls.forward = cce_forward + # pylint: disable=duplicate-code + except (ImportError, AttributeError) as e: + raise RuntimeError( + f"Could not import ForCausalLM class for model_type: {model_type}. " + f"Error: {str(e)}" + ) from e + + if model_type not in PATCH_FNS: + LOG.warning_once( + "Setting up generic cce patch for model type: %s", model_type + ) + LOG.warning_once( + f"Generic Cut Cross Entropy + {model_type} support is experimental and may not work as expected." 
+ ) + PATCH_FNS[model_type] = partial(patch_generic, model_type=model_type) diff --git a/src/axolotl/integrations/cut_cross_entropy/args.py b/src/axolotl/integrations/cut_cross_entropy/args.py index da1db7397..22852479a 100644 --- a/src/axolotl/integrations/cut_cross_entropy/args.py +++ b/src/axolotl/integrations/cut_cross_entropy/args.py @@ -15,12 +15,13 @@ """ Module for handling Cut Cross Entropy input arguments. """ -import logging from typing import Optional from pydantic import BaseModel, model_validator -LOG = logging.getLogger("axolotl.integrations.cut_cross_entropy.args") +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) class CutCrossEntropyArgs(BaseModel): @@ -40,3 +41,13 @@ class CutCrossEntropyArgs(BaseModel): ) return data + + @model_validator(mode="before") + @classmethod + def check_chunked_cross_entropy_not_set(cls, data): + if data.get("chunked_cross_entropy"): + raise ValueError( + "Cut Cross Entropy does not support chunked cross entropy. " + "Please set `chunked_cross_entropy` to `False` or disable Cut Cross Entropy." + ) + return data diff --git a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/cohere.py b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/cohere.py deleted file mode 100644 index ea9e10724..000000000 --- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/cohere.py +++ /dev/null @@ -1,191 +0,0 @@ -"""Cohere and Cohere2 CCE patch.""" - -# This patch is based off transformers 4.50.0. -# It patches the forward function for CohereForCausalLM and Cohere2ForCausalLM. -# It scales the hidden states by the logit scale in advance instead of the logits as the -# operation is done internally and should be mathematically equivalent. - -# pylint: disable=duplicate-code - -from types import MethodType -from typing import Optional, Tuple, Union - -import torch -import transformers -from cut_cross_entropy.transformers.utils import ( - PatchOptions, - TransformersModelT, - apply_lce, -) -from transformers.cache_utils import Cache -from transformers.modeling_outputs import CausalLMOutputWithPast -from transformers.models.cohere.modeling_cohere import ( - KwargsForCausalLM, -) -from transformers.processing_utils import Unpack -from transformers.utils.deprecation import deprecate_kwarg - -_PATCH_OPTS: PatchOptions | None = None - - -@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") -def cce_forward( - self, - input_ids: torch.LongTensor | None = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - logits_to_keep: Union[int, torch.Tensor] = 0, - **kwargs: Unpack[KwargsForCausalLM], -) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
- - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. - This is useful when using packed tensor format (single dimension for batch and sequence length). - - Returns: - - Example: - - ```python - >> from transformers import AutoTokenizer, CohereForCausalLM - - >> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01") - >> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01") - - >> prompt = "Hey, are you conscious? Can you talk to me?" - >> inputs = tokenizer(prompt, return_tensors="pt") - - >> # Generate - >> generate_ids = model.generate(inputs.input_ids, max_length=30) - >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - **kwargs, - ) - - hidden_states = outputs[0] - loss = None - logits = None - - # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - slice_indices = ( - slice(-logits_to_keep, None) - if isinstance(logits_to_keep, int) - else logits_to_keep - ) - - if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training): - assert labels is not None - # scale hidden_states by logit_scale in-place of logits - loss = apply_lce( - hidden_states[:, slice_indices, :] * self.logit_scale, - self.lm_head.weight, - labels, - _PATCH_OPTS, - **kwargs, - ) - else: - logits = self.lm_head(hidden_states[:, slice_indices, :]) - logits = logits * self.logit_scale # main diff from Llama - - if labels is not None: - loss = self.loss_function( - logits=logits, - labels=labels, - vocab_size=self.config.vocab_size, - **kwargs, - ) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -def patch_cohere( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - global _PATCH_OPTS # pylint: disable=global-statement - from transformers.models.cohere import modeling_cohere - - _PATCH_OPTS = patch_options - - if isinstance(maybe_model, 
transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_cohere.CohereForCausalLM - ), f"Expected a CohereForCausalLM model. Got {type(maybe_model)}." - maybe_model.forward = MethodType(cce_forward, maybe_model) - return maybe_model - - modeling_cohere.CohereForCausalLM.forward = cce_forward - return None - - -def patch_cohere2( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - global _PATCH_OPTS # pylint: disable=global-statement - from transformers.models.cohere2 import modeling_cohere2 - - _PATCH_OPTS = patch_options - - if isinstance(maybe_model, transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_cohere2.Cohere2ForCausalLM - ), f"Expected a Cohere2ForCausalLM model. Got {type(maybe_model)}." - maybe_model.forward = MethodType(cce_forward, maybe_model) - return maybe_model - - modeling_cohere2.Cohere2ForCausalLM.forward = cce_forward - return None diff --git a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/gemma.py b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/gemma.py deleted file mode 100644 index ae3d8c6ef..000000000 --- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/gemma.py +++ /dev/null @@ -1,165 +0,0 @@ -"""Gemma CCE patch""" - -# This patch is based off transformers 4.50.0. - -# pylint: disable=duplicate-code - -from types import MethodType -from typing import Optional, Tuple, Union - -import torch -import transformers -from cut_cross_entropy.transformers.utils import ( - PatchOptions, - TransformersModelT, - apply_lce, -) -from transformers.cache_utils import Cache -from transformers.modeling_outputs import CausalLMOutputWithPast -from transformers.models.gemma.modeling_gemma import ( - KwargsForCausalLM, -) -from transformers.processing_utils import Unpack -from transformers.utils.deprecation import deprecate_kwarg - -_PATCH_OPTS: PatchOptions | None = None - - -@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") -def cce_forward( - self, - input_ids: torch.LongTensor | None = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - logits_to_keep: Union[int, torch.Tensor] = 0, - **kwargs: Unpack[KwargsForCausalLM], -) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. 
- If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. - This is useful when using packed tensor format (single dimension for batch and sequence length). - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, GemmaForCausalLM - - >>> model = GemmaForCausalLM.from_pretrained("google/gemma-7b") - >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b") - - >>> prompt = "What is your favorite condiment?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "What is your favorite condiment?" - ```""" - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - **kwargs, - ) - - hidden_states = outputs[0] - loss = None - logits = None - - # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - slice_indices = ( - slice(-logits_to_keep, None) - if isinstance(logits_to_keep, int) - else logits_to_keep - ) - - if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training): - assert labels is not None - loss = apply_lce( - hidden_states[:, slice_indices, :], - self.lm_head.weight, - labels, - _PATCH_OPTS, - **kwargs, - ) - else: - logits = self.lm_head(hidden_states[:, slice_indices, :]) - if labels is not None: - loss = self.loss_function( - logits=logits, - labels=labels, - vocab_size=self.config.vocab_size, - **kwargs, - ) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -def patch_gemma( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - global _PATCH_OPTS # pylint: disable=global-statement - from transformers.models.gemma import modeling_gemma - - _PATCH_OPTS = patch_options - - if isinstance(maybe_model, transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_gemma.GemmaForCausalLM - ), f"Expected a GemmaForCausalLM model. Got {type(maybe_model)}." 
- maybe_model.forward = MethodType(cce_forward, maybe_model) - return maybe_model - - modeling_gemma.GemmaForCausalLM.forward = cce_forward - return None diff --git a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/gemma3.py b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/gemma3.py deleted file mode 100644 index 644e5cce7..000000000 --- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/gemma3.py +++ /dev/null @@ -1,447 +0,0 @@ -"""Gemma2 and Gemma3 (text and multimodal) CCE patch.""" - -# Implementation originally adapted from https://github.com/apple/ml-cross-entropy/pull/29 -# and updated for transformers 4.50.0. -# This is a modified version of the patch that allows for deferred logits calculation for gemma3 and works -# with both gemma3 (text and multimodal) models. - -# pylint: disable=duplicate-code - -from types import MethodType -from typing import Optional, Tuple, Union - -import torch -import transformers -from cut_cross_entropy.transformers.utils import ( - PatchOptions, - TransformersModelT, -) -from torch import nn -from transformers.cache_utils import Cache, HybridCache -from transformers.modeling_outputs import CausalLMOutputWithPast -from transformers.models.gemma3.modeling_gemma3 import ( - Gemma3CausalLMOutputWithPast, - logger, -) -from transformers.utils import ( - is_torchdynamo_compiling, -) -from transformers.utils.deprecation import deprecate_kwarg - -from axolotl.integrations.cut_cross_entropy.monkeypatch.utils import apply_lce - -_PATCH_OPTS: PatchOptions | None = None - - -@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") -def cce_forward( - self, - input_ids: torch.LongTensor | None = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[HybridCache] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - logits_to_keep: Union[int, torch.Tensor] = 0, - defer_logits_calculation: bool = False, - **loss_kwargs, -) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. - This is useful when using packed tensor format (single dimension for batch and sequence length). - - defer_logits_calculation (`bool`, *optional*): - If `True`, defer logits calculation to the ConditionalGeneration forward. This is used to avoid the - memory overhead of calculating logits using regular lm_head forward pass and to use CCE. 
- - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, Gemma3ForCausalLM - - >>> model = Gemma3ForCausalLM.from_pretrained("google/gemma-2-9b") - >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b") - - >>> prompt = "What is your favorite condiment?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "What is your favorite condiment?" - ```""" - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - **loss_kwargs, - ) - - hidden_states = outputs[0] - loss = None - logits = None - - # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - slice_indices = ( - slice(-logits_to_keep, None) - if isinstance(logits_to_keep, int) - else logits_to_keep - ) - - if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training): - assert labels is not None - loss = apply_lce( - hidden_states[:, slice_indices, :], - self.lm_head.weight, - labels, - _PATCH_OPTS, - softcap=getattr(self.config, "final_logit_softcapping", None), - **loss_kwargs, - ) - elif _PATCH_OPTS is not None and defer_logits_calculation: - # defer logits calculation to the ConditionalGeneration forward - logits = hidden_states[:, slice_indices, :] - else: - logits = self.lm_head(hidden_states[:, slice_indices, :]) - if self.config.final_logit_softcapping is not None: - logits = logits / self.config.final_logit_softcapping - logits = torch.tanh(logits) - logits = logits * self.config.final_logit_softcapping - - if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") -def cce_forward_multimodal( - self, - input_ids: torch.LongTensor | None = None, - pixel_values: torch.FloatTensor | None = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None, - token_type_ids: Optional[torch.LongTensor] = None, - cache_position: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: 
Optional[bool] = None, - logits_to_keep: Union[int, torch.Tensor] = 0, - **lm_kwargs, -) -> Union[Tuple, Gemma3CausalLMOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`. - - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. - This is useful when using packed tensor format (single dimension for batch and sequence length). - - Returns: - - Example: - - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration - - >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/Gemma3-test-224px-hf") - >>> processor = AutoProcessor.from_pretrained("google/Gemma3-test-224px-hf") - - >>> prompt = "answer en Where is the cow standing?" - >>> url = "https://huggingface.co/gv-hf/Gemma3-test-224px-hf/resolve/main/cow_beach_1.png" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor(images=image, text=prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(**inputs, max_length=30) - >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "answer en Where is the cow standing?\nbeach" - ```""" - - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - is_training = token_type_ids is not None and labels is not None - - # Replace image id woth PAD if the image token if OOV, to avoid index-errors - if input_ids is not None and self.config.image_token_index >= self.vocab_size: - special_image_mask = input_ids == self.config.image_token_index - llm_input_ids = input_ids.clone() - llm_input_ids[special_image_mask] = 0 - else: - llm_input_ids = input_ids # type: ignore - - if inputs_embeds is None: - inputs_embeds = self.get_input_embeddings()(llm_input_ids) - - if cache_position is None: - past_seen_tokens = ( - past_key_values.get_seq_length() if past_key_values is not None else 0 # type: ignore - ) - cache_position = torch.arange( # type: ignore - past_seen_tokens, - past_seen_tokens + inputs_embeds.shape[1], - device=inputs_embeds.device, - ) - - # Merge text and images - if pixel_values is not None: - image_features = self.get_image_features(pixel_values) - - if input_ids is None: - special_image_mask = inputs_embeds == self.get_input_embeddings()( - torch.tensor( - 
self.config.image_token_index, - dtype=torch.long, - device=inputs_embeds.device, - ) - ) - else: - special_image_mask = (input_ids == self.config.image_token_index).unsqueeze( - -1 - ) - special_image_mask = special_image_mask.expand_as(inputs_embeds).to( - inputs_embeds.device - ) - - if ( - not is_torchdynamo_compiling() - and inputs_embeds[special_image_mask].numel() != image_features.numel() - ): - image_tokens_in_text = (special_image_mask).sum(dim=1).sum(dim=0)[0] - raise ValueError( - f"Number of images does not match number of special image tokens in the input text. " - f"Got {image_tokens_in_text} image tokens in the text but {image_features.shape[0] * image_features.shape[1]} " - "tokens from image embeddings." - ) - image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) # type: ignore - - # mask out pad-token-ids in labels for BC - if labels is not None and self.pad_token_id in labels: - logger.warning_once( - "`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. " - "You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46.", - ) - labels = torch.where( # type: ignore - input_ids == self.pad_token_id, self.config.ignore_index, labels - ) - - causal_mask = self._update_causal_mask( # pylint: disable=protected-access - attention_mask, - token_type_ids, - past_key_values, - cache_position, - inputs_embeds, - is_training, - ) - outputs = self.language_model( - attention_mask=causal_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - logits_to_keep=logits_to_keep, - defer_logits_calculation=True, # enable deferred logits calculation - **lm_kwargs, - ) - - hidden_states = outputs[0] - loss = None - logits = None - - if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training): - assert labels is not None - loss = apply_lce( - hidden_states, - self.language_model.lm_head.weight, - labels, - _PATCH_OPTS, - softcap=getattr(self.config, "final_logit_softcapping", None), - **lm_kwargs, - ) - else: - logits = hidden_states - if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - shift_logits = logits[..., :-1, :] - shift_labels = labels[..., 1:] - if attention_mask is not None: - # we use the input attention mask to shift the logits and labels, because it is 2D. 
- # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft - shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to( - logits.device - ) - shift_logits = shift_logits[ - shift_attention_mask.to(logits.device) != 0 - ].contiguous() - shift_labels = shift_labels[ - shift_attention_mask.to(shift_labels.device) != 0 - ].contiguous() - else: - shift_logits = shift_logits.contiguous() - shift_labels = shift_labels.contiguous() - # Flatten the tokens - loss_fct = nn.CrossEntropyLoss() - - flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size) - flat_labels = shift_labels.view(-1).to(shift_logits.device) - loss = loss_fct(flat_logits, flat_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return Gemma3CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - image_hidden_states=image_features if pixel_values is not None else None, - ) - - -def patch_gemma2( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - global _PATCH_OPTS # pylint: disable=global-statement - from transformers.models.gemma2 import modeling_gemma2 - - _PATCH_OPTS = patch_options - - if isinstance(maybe_model, transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_gemma2.Gemma2ForCausalLM - ), f"Expected a Gemma2ForCausalLM model. Got {type(maybe_model)}." - maybe_model.forward = MethodType(cce_forward, maybe_model) - return maybe_model - - modeling_gemma2.Gemma2ForCausalLM.forward = cce_forward - return None - - -def patch_gemma3_text( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - global _PATCH_OPTS # pylint: disable=global-statement - from transformers.models.gemma3 import modeling_gemma3 - - _PATCH_OPTS = patch_options - - if isinstance(maybe_model, transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_gemma3.Gemma3ForCausalLM - ), f"Expected a Gemma3ForCausalLM model. Got {type(maybe_model)}." - maybe_model.forward = MethodType(cce_forward, maybe_model) - return maybe_model - - modeling_gemma3.Gemma3ForCausalLM.forward = cce_forward - return None - - -def patch_gemma3( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - global _PATCH_OPTS # pylint: disable=global-statement - from transformers.models.gemma3 import modeling_gemma3 - - _PATCH_OPTS = patch_options - - if isinstance(maybe_model, transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_gemma3.Gemma3ForConditionalGeneration - ), f"Expected a Gemma3ForConditionalGeneration model. Got {type(maybe_model)}." 
- maybe_model.forward = MethodType(cce_forward_multimodal, maybe_model) - - # patch the causal model to enable deferred logits calculation - maybe_model.language_model.forward = MethodType( - cce_forward, maybe_model.language_model - ) - return maybe_model - - modeling_gemma3.Gemma3ForConditionalGeneration.forward = cce_forward_multimodal - # patch the causal model to enable deferred logits calculation - modeling_gemma3.Gemma3ForCausalLM.forward = cce_forward - return None diff --git a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/glm4.py b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/glm4.py deleted file mode 100644 index 3df909f88..000000000 --- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/glm4.py +++ /dev/null @@ -1,57 +0,0 @@ -"""GLM 4 patch. GLM family inherits from Llama.""" - -from types import MethodType - -import transformers -from cut_cross_entropy.transformers.utils import ( - PatchOptions, - TransformersModelT, -) - - -def patch_glm( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - - # Set the _PATCH_OPTS in the llama patch file - import cut_cross_entropy.transformers.llama as llama_patch - - llama_patch._PATCH_OPTS = patch_options # pylint: disable=protected-access - - from cut_cross_entropy.transformers.llama import cce_forward - from transformers.models.glm import modeling_glm - - if isinstance(maybe_model, transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_glm.GlmForCausalLM - ), f"Expected a GlmForCausalLM model. Got {type(maybe_model)}." - maybe_model.forward = MethodType(cce_forward, maybe_model) - return maybe_model - - modeling_glm.GlmForCausalLM.forward = cce_forward - return None - - -def patch_glm4( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - - # Set the _PATCH_OPTS in the llama patch file - import cut_cross_entropy.transformers.llama as llama_patch - - llama_patch._PATCH_OPTS = patch_options # pylint: disable=protected-access - - from cut_cross_entropy.transformers.llama import cce_forward - from transformers.models.glm4 import modeling_glm4 - - if isinstance(maybe_model, transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_glm4.Glm4ForCausalLM - ), f"Expected a Glm4ForCausalLM model. Got {type(maybe_model)}." - maybe_model.forward = MethodType(cce_forward, maybe_model) - return maybe_model - - modeling_glm4.Glm4ForCausalLM.forward = cce_forward - return None diff --git a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/llama.py b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/llama.py deleted file mode 100644 index bed411ace..000000000 --- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/llama.py +++ /dev/null @@ -1,164 +0,0 @@ -"""Llama CCE patch. 
Adapted from transformers v4.51.2""" - -# pylint: disable=duplicate-code - - -from types import MethodType -from typing import Optional, Union - -import torch -import transformers -from cut_cross_entropy.transformers.utils import ( - PatchOptions, - TransformersModelT, - apply_lce, -) -from transformers.cache_utils import Cache -from transformers.modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, -) -from transformers.models.llama.modeling_llama import ( - KwargsForCausalLM, -) -from transformers.processing_utils import Unpack -from transformers.utils.deprecation import deprecate_kwarg -from transformers.utils.generic import can_return_tuple - -_PATCH_OPTS: PatchOptions | None = None - - -@can_return_tuple -@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") -def cce_forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Cache] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - logits_to_keep: Union[int, torch.Tensor] = 0, - **kwargs: Unpack[KwargsForCausalLM], -) -> CausalLMOutputWithPast: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. - This is useful when using packed tensor format (single dimension for batch and sequence length). - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, LlamaForCausalLM - - >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") - >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
- ```""" - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs: BaseModelOutputWithPast = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - cache_position=cache_position, - **kwargs, - ) - - hidden_states = outputs.last_hidden_state - if hidden_states is None: - raise ValueError("hidden_states is None") - - loss = None - logits = None - - # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - slice_indices = ( - slice(-logits_to_keep, None) - if isinstance(logits_to_keep, int) - else logits_to_keep - ) - if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training): - assert labels is not None - loss = apply_lce( - hidden_states[:, slice_indices, :], - self.lm_head.weight, - labels, - _PATCH_OPTS, - **kwargs, - ) - else: - logits = self.lm_head(hidden_states[:, slice_indices, :]) - - if labels is not None: - loss = self.loss_function( - logits=logits, - labels=labels, - vocab_size=self.config.vocab_size, - **kwargs, - ) - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -def patch_llama( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - """Patch Llama for CCE.""" - global _PATCH_OPTS # pylint: disable=global-statement - from transformers.models.llama import modeling_llama - - _PATCH_OPTS = patch_options - - if isinstance(maybe_model, transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_llama.LlamaForCausalLM - ), f"Expected a LlamaForCausalLM model. Got {type(maybe_model)}." - maybe_model.forward = MethodType(cce_forward, maybe_model) - return maybe_model - - modeling_llama.LlamaForCausalLM.forward = cce_forward - return None diff --git a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/llama4.py b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/llama4.py deleted file mode 100644 index 3143e9c8d..000000000 --- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/llama4.py +++ /dev/null @@ -1,401 +0,0 @@ -"""Llama4 CCE patch. 
Adapted from transformers 4.51.0.""" - -# pylint: disable=duplicate-code - -from types import MethodType -from typing import Optional, Tuple, Union - -import torch -import transformers -from cut_cross_entropy.transformers.utils import ( - PatchOptions, - TransformersModelT, - apply_lce, -) -from torch import nn -from transformers.cache_utils import Cache -from transformers.modeling_outputs import CausalLMOutputWithPast -from transformers.models.llama4.modeling_llama4 import ( - Llama4CausalLMOutputWithPast, -) - -_PATCH_OPTS: PatchOptions | None = None - - -def cce_forward( - self, - input_ids: torch.LongTensor | None = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - logits_to_keep: Union[int, torch.Tensor] = 0, - defer_logits_calculation: bool = False, - **kwargs, -) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. - This is useful when using packed tensor format (single dimension for batch and sequence length). - - defer_logits_calculation (`bool`, *optional*, defaults to `False`): - If `True`, defer logits calculation to the ConditionalGeneration forward. This is used to avoid the - memory overhead of calculating logits using regular lm_head forward pass and to use CCE. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, Llama4ForCausalLM - - >>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf") - >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
- ```""" - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - **kwargs, - ) - - hidden_states = outputs[0] - loss = None - logits = None - - # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - slice_indices = ( - slice(-logits_to_keep, None) - if isinstance(logits_to_keep, int) - else logits_to_keep - ) - if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training): - assert labels is not None - loss = apply_lce( - hidden_states[:, slice_indices, :], - self.lm_head.weight, - labels, - _PATCH_OPTS, - **kwargs, - ) - elif _PATCH_OPTS is not None and defer_logits_calculation: - # defer logits calculation to the ConditionalGeneration forward - logits = hidden_states[:, slice_indices, :] - else: - logits = self.lm_head(hidden_states[:, slice_indices, :]) - - if labels is not None: - loss = self.loss_function( - logits=logits, - labels=labels, - vocab_size=self.config.vocab_size, - **kwargs, - ) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -def cce_forward_multimodal( - self, - input_ids: torch.LongTensor | None = None, # type: ignore - pixel_values: torch.FloatTensor | None = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[list[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - vision_feature_layer: Optional[Union[int, list[int]]] = None, - vision_feature_select_strategy: Optional[str] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - logits_to_keep: Union[int, torch.Tensor] = 0, - image_sizes: torch.Tensor | None = None, - **lm_kwargs, -) -> Union[Tuple, Llama4CausalLMOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). 
Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. - This is useful when using packed tensor format (single dimension for batch and sequence length). - - - Returns: - - Example: - - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, LlavaForConditionalGeneration - - >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf") - >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") - - >>> prompt = "USER: \nWhat's the content of the image? ASSISTANT:" - >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor(images=image, text=prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(**inputs, max_new_tokens=15) - >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "USER: \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed" - ```""" - - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - vision_feature_layer = ( - vision_feature_layer - if vision_feature_layer is not None - else self.config.vision_config.vision_feature_layer - ) - vision_feature_select_strategy = ( - vision_feature_select_strategy - if vision_feature_select_strategy is not None - else self.config.vision_config.vision_feature_select_strategy - ) - - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - - if pixel_values is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" - ) - - if inputs_embeds is None: - inputs_embeds = self.get_input_embeddings()(input_ids) # type: ignore - - if pixel_values is not None: - image_features = self.get_image_features( - pixel_values=pixel_values, - vision_feature_layer=vision_feature_layer, - vision_feature_select_strategy=vision_feature_select_strategy, - image_sizes=image_sizes, - ) - original_inputs_embeds_shape = inputs_embeds.shape # type: ignore - - vision_flat = image_features.view(-1, image_features.size(-1)) - projected_vision_flat = self.multi_modal_projector(vision_flat) - - special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1) - final_mask = special_image_mask.to(inputs_embeds.device) # type: ignore - inputs_embeds = inputs_embeds.view(-1, inputs_embeds.size(-1)) # type: ignore - - final_mask_1d = final_mask[..., 0].reshape(-1) - num_tokens_to_fill = final_mask_1d.sum() - - if num_tokens_to_fill != projected_vision_flat.size(0): - raise ValueError( - f"Mismatch: final_mask wants {num_tokens_to_fill} embeddings, " - f"but multi_modal_projector returned {projected_vision_flat.size(0)}" - ) - - expanded_mask = final_mask_1d.unsqueeze(-1).expand(-1, inputs_embeds.size(-1)) - inputs_embeds = 
inputs_embeds.masked_scatter( - expanded_mask, projected_vision_flat - ) # type: ignore - inputs_embeds = inputs_embeds.view(original_inputs_embeds_shape) # type: ignore - - outputs = self.language_model( - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - logits_to_keep=logits_to_keep, - defer_logits_calculation=True, # enable deferred logits calculation - **lm_kwargs, - ) - - hidden_states = outputs[0] - loss = None - logits = None - - if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training): - assert labels is not None - # TODO: check if need to handle attention_mask - loss = apply_lce( - hidden_states, - self.language_model.lm_head.weight, - labels, - _PATCH_OPTS, - **lm_kwargs, - ) - else: - logits = hidden_states - if labels is not None: - # Shift so that tokens < n predict n - if attention_mask is not None: - # we use the input attention mask to shift the logits and labels, because it is 2D. - # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft - shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to( - logits.device - ) - shift_logits = logits[..., :-1, :][ - shift_attention_mask.to(logits.device) != 0 - ].contiguous() - shift_labels = labels[..., 1:][ - shift_attention_mask.to(labels.device) != 0 - ].contiguous() - else: - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct( - shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1).to(shift_logits.device), - ) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return Llama4CausalLMOutputWithPast( - loss=loss, - logits=logits, # type: ignore # TODO: check if need to create dummy logits - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - image_hidden_states=image_features if pixel_values is not None else None, - ) - - -def patch_llama4_text( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - global _PATCH_OPTS # pylint: disable=global-statement - from transformers.models.llama4 import modeling_llama4 - - _PATCH_OPTS = patch_options - - if isinstance(maybe_model, transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_llama4.Llama4ForCausalLM - ), f"Expected a Llama4ForCausalLM model. Got {type(maybe_model)}." - maybe_model.forward = MethodType(cce_forward, maybe_model) - - return maybe_model - - setattr( - modeling_llama4.Llama4ForCausalLM, - "forward", - cce_forward, - ) - return None - - -def patch_llama4( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - - global _PATCH_OPTS # pylint: disable=global-statement - from transformers.models.llama4 import modeling_llama4 - - _PATCH_OPTS = patch_options - - if isinstance(maybe_model, transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_llama4.Llama4ForConditionalGeneration - ), f"Expected a Llama4ForConditionalGeneration model. Got {type(maybe_model)}." 
- maybe_model.forward = MethodType(cce_forward_multimodal, maybe_model) - - # patch the language model - maybe_model.language_model.forward = MethodType( - cce_forward, maybe_model.language_model - ) - return maybe_model - - setattr( - modeling_llama4.Llama4ForConditionalGeneration, - "forward", - cce_forward_multimodal, - ) - - # patch the causal language model - setattr(modeling_llama4.Llama4ForCausalLM, "forward", cce_forward) - return None diff --git a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/mistral3.py b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/mistral3.py deleted file mode 100644 index aa252701e..000000000 --- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/mistral3.py +++ /dev/null @@ -1,384 +0,0 @@ -"""Mistral and Mistral3 CCE patch.""" - -# pylint: disable=duplicate-code - -from types import MethodType -from typing import Optional, Tuple, Union - -import torch -import transformers -from cut_cross_entropy.transformers.utils import ( - PatchOptions, - TransformersModelT, - apply_lce, -) -from torch import nn -from transformers.cache_utils import Cache -from transformers.modeling_outputs import CausalLMOutputWithPast -from transformers.models.mistral3.modeling_mistral3 import ( - Mistral3CausalLMOutputWithPast, -) -from transformers.models.mistral.modeling_mistral import ( - KwargsForCausalLM, -) -from transformers.processing_utils import Unpack -from transformers.utils import ( - is_torchdynamo_compiling, -) -from transformers.utils.deprecation import deprecate_kwarg - -_PATCH_OPTS: PatchOptions | None = None - - -@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") -def cce_forward( - self, - input_ids: torch.LongTensor | None = None, - attention_mask: Optional[torch.Tensor] | None = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - logits_to_keep: Union[int, torch.Tensor] = 0, - defer_logits_calculation: bool = False, - **kwargs: Unpack[KwargsForCausalLM], -) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. - This is useful when using packed tensor format (single dimension for batch and sequence length). - - defer_logits_calculation (`bool`, *optional*): - If `True`, defer logits calculation to the ConditionalGeneration forward. 
This is used to avoid the - memory overhead of calculating logits using regular lm_head forward pass and to use CCE. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, MistralForCausalLM - - >>> model = MistralForCausalLM.from_pretrained("meta-mistral/Mistral-2-7b-hf") - >>> tokenizer = AutoTokenizer.from_pretrained("meta-mistral/Mistral-2-7b-hf") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - **kwargs, - ) - - hidden_states = outputs[0] - loss = None - logits = None - - # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - slice_indices = ( - slice(-logits_to_keep, None) - if isinstance(logits_to_keep, int) - else logits_to_keep - ) - - if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training): - assert labels is not None - loss = apply_lce( - hidden_states[:, slice_indices, :], - self.lm_head.weight, - labels, - _PATCH_OPTS, - **kwargs, - ) - elif _PATCH_OPTS is not None and defer_logits_calculation: - # defer logits calculation to the ConditionalGeneration forward - logits = hidden_states[:, slice_indices, :] - else: - logits = self.lm_head(hidden_states[:, slice_indices, :]) - if labels is not None: - loss = self.loss_function( - logits=logits, - labels=labels, - vocab_size=self.config.vocab_size, - **kwargs, - ) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -def cce_forward_multimodal( - self, - input_ids: torch.LongTensor | None = None, - pixel_values: torch.FloatTensor | None = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[list[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - vision_feature_layer: Optional[Union[int, list[int]]] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - logits_to_keep: Union[int, torch.Tensor] = 0, - image_sizes: torch.Tensor | None = None, - 
**lm_kwargs, -) -> Union[Tuple, Mistral3CausalLMOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. - This is useful when using packed tensor format (single dimension for batch and sequence length). - - - Returns: - - Example: - - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, Mistral3ForConditionalGeneration - - >>> model = Mistral3ForConditionalGeneration.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503") - >>> processor = AutoProcessor.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503") - - >>> prompt = "[INST][IMG]What is the image?[/INST]" - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor(images=image, text=prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(**inputs, max_new_tokens=15) - >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "What is the image?The image depicts two cats lying on a pink blanket." 
- ```""" - - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - vision_feature_layer = ( - vision_feature_layer - if vision_feature_layer is not None - else self.config.vision_feature_layer - ) - - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - - if pixel_values is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" - ) - - if inputs_embeds is None: - inputs_embeds = self.get_input_embeddings()(input_ids) - - if pixel_values is not None: - image_features = self.get_image_features( - pixel_values=pixel_values, - vision_feature_layer=vision_feature_layer, - image_sizes=image_sizes, - ) - - special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1) - special_image_mask = special_image_mask.expand_as(inputs_embeds).to( - inputs_embeds.device - ) - if ( - not is_torchdynamo_compiling() - and inputs_embeds[special_image_mask].numel() != image_features.numel() - ): - n_image_tokens = (input_ids == self.config.image_token_index).sum() - n_image_features = image_features.shape[0] * image_features.shape[1] - raise ValueError( - f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" - ) - image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) # type: ignore - - outputs = self.language_model( - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - logits_to_keep=logits_to_keep, - defer_logits_calculation=True, # enable deferred logits calculation - **lm_kwargs, - ) - - hidden_states = outputs[0] - loss = None - logits = None - - if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training): - assert labels is not None - loss = apply_lce( - hidden_states, - self.language_model.lm_head.weight, - labels, - _PATCH_OPTS, - **lm_kwargs, - ) - else: - logits = hidden_states - if labels is not None: - # Shift so that tokens < n predict n - if attention_mask is not None: - # we use the input attention mask to shift the logits and labels, because it is 2D. 
- # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft - shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to( - logits.device - ) - shift_logits = logits[..., :-1, :][ - shift_attention_mask.to(logits.device) != 0 - ].contiguous() - shift_labels = labels[..., 1:][ - shift_attention_mask.to(labels.device) != 0 - ].contiguous() - else: - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct( - shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1).to(shift_logits.device), - ) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return Mistral3CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - image_hidden_states=image_features if pixel_values is not None else None, - ) - - -def patch_mistral( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - global _PATCH_OPTS # pylint: disable=global-statement - from transformers.models.mistral import modeling_mistral - - _PATCH_OPTS = patch_options - - if isinstance(maybe_model, transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_mistral.MistralForCausalLM - ), f"Expected a MistralForCausalLM model. Got {type(maybe_model)}." - maybe_model.forward = MethodType(cce_forward, maybe_model) - return maybe_model - - modeling_mistral.MistralForCausalLM.forward = cce_forward - return None - - -def patch_mistral3( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - global _PATCH_OPTS # pylint: disable=global-statement - from transformers.models.mistral import modeling_mistral - from transformers.models.mistral3 import modeling_mistral3 - - _PATCH_OPTS = patch_options - - if isinstance(maybe_model, transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_mistral3.Mistral3ForConditionalGeneration - ), f"Expected a Mistral3ForConditionalGeneration model. Got {type(maybe_model)}." 
- maybe_model.forward = MethodType(cce_forward_multimodal, maybe_model) - - # patch the causal model to enable deferred logits calculation - maybe_model.language_model.forward = MethodType( - cce_forward, maybe_model.language_model - ) - return maybe_model - - modeling_mistral3.Mistral3ForConditionalGeneration.forward = cce_forward_multimodal - # patch the causal model to enable deferred logits calculation - modeling_mistral.MistralForCausalLM.forward = cce_forward - return None diff --git a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/mllama.py b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/mllama.py deleted file mode 100644 index 850764e10..000000000 --- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/mllama.py +++ /dev/null @@ -1,379 +0,0 @@ -"""Mllama CCE patch.""" - -# pylint: disable=duplicate-code - -from types import MethodType -from typing import Optional, Tuple, Union - -import torch -import transformers -from cut_cross_entropy.transformers.utils import ( - PatchOptions, - TransformersModelT, - apply_lce, -) -from transformers.cache_utils import Cache -from transformers.modeling_outputs import CausalLMOutputWithPast -from transformers.models.mllama.modeling_mllama import ( - MLLAMA_INPUTS_DOCSTRING, - _prepare_cross_attention_mask, -) -from transformers.utils import ( - add_start_docstrings_to_model_forward, - replace_return_docstrings, -) -from transformers.utils.deprecation import deprecate_kwarg - -_PATCH_OPTS: PatchOptions | None = None - - -@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") -@add_start_docstrings_to_model_forward(MLLAMA_INPUTS_DOCSTRING) -@replace_return_docstrings( - output_type=CausalLMOutputWithPast, config_class="MllamaTextConfig" -) -def cce_forward( - self, - input_ids: torch.LongTensor | None = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - cross_attention_states: Optional[torch.LongTensor] = None, - cross_attention_mask: Optional[torch.LongTensor] = None, - full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - logits_to_keep: Union[int, torch.Tensor] = 0, - defer_logits_calculation: bool = False, - **loss_kwargs, -) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. 
- This is useful when using packed tensor format (single dimension for batch and sequence length). - - defer_logits_calculation (`bool`, *optional*): - If `True`, defer logits calculation to the ConditionalGeneration forward. This is used to avoid the - memory overhead of calculating logits using regular lm_head forward pass and to use CCE. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, MllamaForCausalLM - - >>> model = MllamaForCausalLM.from_pretrained("Llama-3.2-11B-Vision") - >>> tokenizer = AutoTokenizer.from_pretrained("Llama-3.2-11B-Vision") - - >>> prompt = "If I had to write a haiku, it would be:" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=40, do_sample=True, temperature=0.6) - >>> result = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - >>> print(result) - If I had to write a haiku, it would be: "Snowflakes gently fall" - simple, yet peaceful. - I love the idea of snowflakes gently falling, each one - ``` - """ - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - cross_attention_states=cross_attention_states, - attention_mask=attention_mask, - position_ids=position_ids, - cross_attention_mask=cross_attention_mask, - full_text_row_masked_out_mask=full_text_row_masked_out_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - ) - - hidden_states = outputs[0] - loss = None - logits = None - - slice_indices = ( - slice(-logits_to_keep, None) - if isinstance(logits_to_keep, int) - else logits_to_keep - ) - - if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training): - assert labels is not None - loss = apply_lce( - hidden_states[:, slice_indices, :], - self.lm_head.weight, - labels, - _PATCH_OPTS, - **loss_kwargs, - ) - elif _PATCH_OPTS is not None and defer_logits_calculation: - # defer logits calculation to the ConditionalGeneration forward - logits = hidden_states[:, slice_indices, :] - else: - logits = self.lm_head(hidden_states[:, slice_indices, :]).float() - - loss = None - if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") -@add_start_docstrings_to_model_forward(MLLAMA_INPUTS_DOCSTRING) -@replace_return_docstrings( - output_type=CausalLMOutputWithPast, config_class="MllamaConfig" -) -def cce_forward_multimodal( - self, - input_ids: Optional[torch.LongTensor] = None, - pixel_values: Optional[torch.FloatTensor] = None, - aspect_ratio_mask: 
Optional[torch.Tensor] = None, - aspect_ratio_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - cross_attention_mask: Optional[torch.Tensor] = None, - cross_attention_states: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[list[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - logits_to_keep: Union[int, torch.Tensor] = 0, - **loss_kwargs, -) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. - This is useful when using packed tensor format (single dimension for batch and sequence length). - - - Returns: - - Example: - - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, MllamaForConditionalGeneration - - >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision" - >>> model = MllamaForConditionalGeneration.from_pretrained(checkpoint) - >>> processor = AutoProcessor.from_pretrained(checkpoint) - - >>> prompt = "<|image|>If I had to write a haiku for this one" - >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor(text=prompt, images=image, return_tensors="pt") - - >>> # Generate - >>> output = model.generate(**inputs, max_new_tokens=15) - - >>> prompt_len = inputs.input_ids.shape[-1] - >>> generated_ids = output[:, prompt_len:] - >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) - >>> print(generated_text) - [', it would be:.\\nA stop sign in Chinatown.\\n'] - ``` - """ - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - - if pixel_values is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" - ) - - if pixel_values is not None and cross_attention_states is not None: - raise ValueError( - "`pixel_values` 
and `cross_attention_states` cannot be provided simultaneously" - ) - - if pixel_values is not None: - if aspect_ratio_ids is None: - raise ValueError( - "`aspect_ratio_ids` must be provided if `pixel_values` is provided" - ) - # get vision tokens from vision model - vision_outputs = self.vision_model( - pixel_values=pixel_values, - aspect_ratio_ids=aspect_ratio_ids, - aspect_ratio_mask=aspect_ratio_mask, - output_hidden_states=output_hidden_states, - output_attentions=output_attentions, - return_dict=return_dict, - ) - cross_attention_states = vision_outputs[0] - cross_attention_states = self.multi_modal_projector( - cross_attention_states - ).reshape( - -1, cross_attention_states.shape[-2], self.hidden_size # type: ignore - ) - - if cross_attention_mask is not None: - cross_attention_mask, full_text_row_masked_out_mask = ( - _prepare_cross_attention_mask( - cross_attention_mask, - num_vision_tokens=self.vision_model.num_patches, - dtype=self.dtype, - ) - ) - else: - full_text_row_masked_out_mask = None - - if cross_attention_mask is not None and cache_position is not None: - cross_attention_mask = cross_attention_mask[:, :, cache_position] - full_text_row_masked_out_mask = full_text_row_masked_out_mask[ - :, :, cache_position - ] - - outputs = self.language_model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - cross_attention_states=cross_attention_states, - cross_attention_mask=cross_attention_mask, - full_text_row_masked_out_mask=full_text_row_masked_out_mask, - past_key_values=past_key_values, - use_cache=use_cache, - inputs_embeds=inputs_embeds, - output_hidden_states=output_hidden_states, - output_attentions=output_attentions, - return_dict=return_dict, - cache_position=cache_position, - logits_to_keep=logits_to_keep, - defer_logits_calculation=True, # enable deferred logits calculation - **loss_kwargs, - ) - - hidden_states = outputs[0] - loss = None - logits = None - - if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training): - assert labels is not None - loss = apply_lce( - hidden_states, - self.language_model.lm_head.weight, - labels, - _PATCH_OPTS, - **loss_kwargs, - ) - else: - # Temporary fix to calculate the loss in main class, as the model's vocab size may be resized - logits = hidden_states - - if labels is not None: - loss = self.loss_function( - logits, labels, self.config.get_text_config().vocab_size, **loss_kwargs - ) - - if not return_dict: - return (loss,) + outputs if loss is not None else outputs - - return CausalLMOutputWithPast( - loss=loss, - logits=outputs.logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -def patch_mllama( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - - global _PATCH_OPTS # pylint: disable=global-statement - from transformers.models.mllama import modeling_mllama - - _PATCH_OPTS = patch_options - - if isinstance(maybe_model, transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_mllama.MllamaForConditionalGeneration - ), f"Expected a MllamaForConditionalGeneration model. Got {type(maybe_model)}." 
- maybe_model.forward = MethodType(cce_forward_multimodal, maybe_model) - - # patch the language model - maybe_model.language_model.forward = MethodType( - cce_forward, maybe_model.language_model - ) - return maybe_model - - modeling_mllama.MllamaForConditionalGeneration.forward = cce_forward_multimodal - - # patch the causal language model - modeling_mllama.MllamaForCausalLM.forward = cce_forward - return None diff --git a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/patch.py b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/patch.py deleted file mode 100644 index 8176a1f0c..000000000 --- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/patch.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (C) 2024 Apple Inc. All Rights Reserved. - -"""Cut Cross Entropy patcher""" - -import transformers -from cut_cross_entropy.cce_utils import LinearCrossEntropyImpl -from cut_cross_entropy.linear_cross_entropy import LCE_IMPL_DEFAULT -from cut_cross_entropy.transformers.phi3 import patch_phi3 -from cut_cross_entropy.transformers.utils import PatchOptions, TransformersModelT - -from axolotl.integrations.cut_cross_entropy.monkeypatch.cohere import ( - patch_cohere, - patch_cohere2, -) -from axolotl.integrations.cut_cross_entropy.monkeypatch.gemma import patch_gemma -from axolotl.integrations.cut_cross_entropy.monkeypatch.gemma3 import ( - patch_gemma2, - patch_gemma3, - patch_gemma3_text, -) -from axolotl.integrations.cut_cross_entropy.monkeypatch.glm4 import ( - patch_glm, - patch_glm4, -) -from axolotl.integrations.cut_cross_entropy.monkeypatch.llama import ( - patch_llama, -) -from axolotl.integrations.cut_cross_entropy.monkeypatch.llama4 import ( - patch_llama4, - patch_llama4_text, -) -from axolotl.integrations.cut_cross_entropy.monkeypatch.mistral3 import ( - patch_mistral, - patch_mistral3, -) -from axolotl.integrations.cut_cross_entropy.monkeypatch.mllama import patch_mllama -from axolotl.integrations.cut_cross_entropy.monkeypatch.qwen2 import ( - patch_qwen2, -) -from axolotl.integrations.cut_cross_entropy.monkeypatch.qwen2_5_vl import ( - patch_qwen2_5_vl, -) -from axolotl.integrations.cut_cross_entropy.monkeypatch.qwen2_moe import ( - patch_qwen2_moe, -) -from axolotl.integrations.cut_cross_entropy.monkeypatch.qwen2_vl import ( - patch_qwen2_vl, -) -from axolotl.integrations.cut_cross_entropy.monkeypatch.qwen3 import patch_qwen3 -from axolotl.integrations.cut_cross_entropy.monkeypatch.qwen3_moe import ( - patch_qwen3_moe, -) - -CUT_CROSS_ENTROPY_MODEL_MAPPING = { - "llama": patch_llama, - "llama4": patch_llama4, - "llama4_text": patch_llama4_text, - "mllama": patch_mllama, - "phi3": patch_phi3, - "gemma": patch_gemma, - "gemma2": patch_gemma2, - "gemma3": patch_gemma3, - "gemma3_text": patch_gemma3_text, - "mistral": patch_mistral, - "mistral3": patch_mistral3, - "qwen2": patch_qwen2, - "qwen2_moe": patch_qwen2_moe, - "qwen2_vl": patch_qwen2_vl, - "qwen2_5_vl": patch_qwen2_5_vl, - "qwen3": patch_qwen3, - "qwen3_moe": patch_qwen3_moe, - "cohere": patch_cohere, - "cohere2": patch_cohere2, - "glm": patch_glm, - "glm4": patch_glm4, -} - - -def cce_patch( - model_type_or_model: str | TransformersModelT | transformers.PretrainedConfig, - impl: str | LinearCrossEntropyImpl = LCE_IMPL_DEFAULT, - reduction: str = "mean", - filter_eps: float | str | None = "auto", - accum_e_fp32: bool = False, - accum_c_fp32: bool = False, - filter_e_grad: bool = True, - filter_c_grad: bool = True, - train_only: bool = False, -) -> TransformersModelT | None: - if isinstance(impl, 
LinearCrossEntropyImpl): - impl = impl.name.lower() - - if impl not in (v.name.lower() for v in LinearCrossEntropyImpl): - raise ValueError(f"Unknown {impl=}") - - if isinstance(model_type_or_model, transformers.PreTrainedModel): - if hasattr(model_type_or_model, "config"): - model_type = getattr( - getattr(model_type_or_model, "config", None), "model_type", None - ) - else: - raise ValueError( - "model_type_or_model is a PreTrainedModel but does not have a config attribute" - ) - elif isinstance(model_type_or_model, transformers.PretrainedConfig): - model_type = model_type_or_model.model_type - else: - model_type = model_type_or_model - - patch_options = PatchOptions( - impl=impl, - reduction=reduction, - filter_eps=filter_eps, - accum_e_fp32=accum_e_fp32, - accum_c_fp32=accum_c_fp32, - filter_e_grad=filter_e_grad, - filter_c_grad=filter_c_grad, - train_only=train_only, - ) - - if model_type in CUT_CROSS_ENTROPY_MODEL_MAPPING: - return CUT_CROSS_ENTROPY_MODEL_MAPPING[model_type]( - model_type_or_model, patch_options - ) - - raise RuntimeError(f"Unknown model type {model_type}") diff --git a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2.py b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2.py deleted file mode 100644 index 3f6d2b3e9..000000000 --- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2.py +++ /dev/null @@ -1,37 +0,0 @@ -"""Qwen2 CCE patch. The model inherits Llama's modeling code and uses the same forward method.""" - -# pylint: disable=duplicate-code - -from types import MethodType - -import transformers -from cut_cross_entropy.transformers.utils import ( - PatchOptions, - TransformersModelT, -) - - -def patch_qwen2( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - from transformers.models.qwen2 import modeling_qwen2 - - # Set the _PATCH_OPTS in the llama patch file - import axolotl.integrations.cut_cross_entropy.monkeypatch.llama as llama_patch - - llama_patch._PATCH_OPTS = patch_options # pylint: disable=protected-access - - from axolotl.integrations.cut_cross_entropy.monkeypatch.llama import ( - cce_forward, - ) - - if isinstance(maybe_model, transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_qwen2.Qwen2ForCausalLM - ), f"Expected a Qwen2ForCausalLM model. Got {type(maybe_model)}." - maybe_model.forward = MethodType(cce_forward, maybe_model) - return maybe_model - - modeling_qwen2.Qwen2ForCausalLM.forward = cce_forward - return None diff --git a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2_5_vl.py b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2_5_vl.py deleted file mode 100644 index 16206006f..000000000 --- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2_5_vl.py +++ /dev/null @@ -1,246 +0,0 @@ -"""Qwen2.5 VL CCE patch. 
Adapted from transformers v4.51.2""" - -# pylint: disable=duplicate-code - - -from types import MethodType -from typing import Optional, Tuple, Union - -import torch -import transformers -from cut_cross_entropy.transformers.utils import ( - PatchOptions, - TransformersModelT, - apply_lce, -) -from torch.nn import CrossEntropyLoss -from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import ( - Qwen2_5_VLCausalLMOutputWithPast, -) - -_PATCH_OPTS: PatchOptions | None = None - - -def cce_forward_multimodal( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[list[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - pixel_values: Optional[torch.Tensor] = None, - pixel_values_videos: Optional[torch.FloatTensor] = None, - image_grid_thw: Optional[torch.LongTensor] = None, - video_grid_thw: Optional[torch.LongTensor] = None, - rope_deltas: Optional[torch.LongTensor] = None, - cache_position: Optional[torch.LongTensor] = None, - second_per_grid_ts: Optional[torch.Tensor] = None, -) -> Union[Tuple, Qwen2_5_VLCausalLMOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration - - >>> model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct") - >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct") - - >>> messages = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - ] - >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos]) - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..." 
- ```""" - - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - if inputs_embeds is None: - inputs_embeds = self.model.embed_tokens(input_ids) - if pixel_values is not None: - pixel_values = pixel_values.type(self.visual.dtype) - image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw) - n_image_tokens = (input_ids == self.config.image_token_id).sum().item() - n_image_features = image_embeds.shape[0] - if n_image_tokens != n_image_features: - raise ValueError( - f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" - ) - - mask = input_ids == self.config.image_token_id - mask_unsqueezed = mask.unsqueeze(-1) - mask_expanded = mask_unsqueezed.expand_as(inputs_embeds) - image_mask = mask_expanded.to(inputs_embeds.device) - - image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds) # type: ignore - - if pixel_values_videos is not None: - pixel_values_videos = pixel_values_videos.type(self.visual.dtype) - video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw) - n_video_tokens = (input_ids == self.config.video_token_id).sum().item() - n_video_features = video_embeds.shape[0] - if n_video_tokens != n_video_features: - raise ValueError( - f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}" - ) - - mask = input_ids == self.config.video_token_id - mask_unsqueezed = mask.unsqueeze(-1) - mask_expanded = mask_unsqueezed.expand_as(inputs_embeds) - video_mask = mask_expanded.to(inputs_embeds.device) - - video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds) # type: ignore - - if attention_mask is not None: - attention_mask = attention_mask.to(inputs_embeds.device) - - # if we get 4D attention mask we cannot calculate rope deltas anymore. 
TODO @raushan fixme - if position_ids is None and (attention_mask is None or attention_mask.ndim == 2): - # calculate RoPE index once per generation in the pre-fill stage only - if ( - (cache_position is not None and cache_position[0] == 0) - or self.rope_deltas is None - or (past_key_values is None or past_key_values.get_seq_length() == 0) # type: ignore - ): - position_ids, rope_deltas = self.get_rope_index( - input_ids, - image_grid_thw, - video_grid_thw, - second_per_grid_ts, - attention_mask, - ) - self.rope_deltas = rope_deltas - # then use the prev pre-calculated rope-deltas to get the correct position ids - else: - batch_size, seq_length, _ = inputs_embeds.shape - delta = ( - (cache_position[0] + self.rope_deltas).to(inputs_embeds.device) - if cache_position is not None - else 0 - ) - position_ids = torch.arange(seq_length, device=inputs_embeds.device) # type: ignore - position_ids = position_ids.view(1, -1).expand(batch_size, -1) # type: ignore - if cache_position is not None: # otherwise `deltas` is an int `0` - delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0) # type: ignore - position_ids = position_ids.add(delta) # type: ignore - position_ids = position_ids.unsqueeze(0).expand(3, -1, -1) # type: ignore - - outputs = self.model( - input_ids=None, - position_ids=position_ids, - attention_mask=attention_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - ) - - hidden_states = outputs[0] - logits = None - loss = None - - if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training): - assert labels is not None - loss = apply_lce( - hidden_states, - self.lm_head.weight, - labels, - _PATCH_OPTS, - ) - else: - logits = self.lm_head(hidden_states) - - if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return Qwen2_5_VLCausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - rope_deltas=self.rope_deltas, - ) - - -def patch_qwen2_5_vl( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - global _PATCH_OPTS # pylint: disable=global-statement - - from transformers.models.qwen2_5_vl import modeling_qwen2_5_vl - - _PATCH_OPTS = patch_options - - if isinstance(maybe_model, transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_qwen2_5_vl.Qwen2_5_VLForConditionalGeneration - ), f"Expected a Qwen2_5_VLForConditionalGeneration model. Got {type(maybe_model)}." 
- maybe_model.forward = MethodType(cce_forward_multimodal, maybe_model) - - return maybe_model - - modeling_qwen2_5_vl.Qwen2_5_VLForConditionalGeneration.forward = ( - cce_forward_multimodal - ) - return None diff --git a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2_moe.py b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2_moe.py deleted file mode 100644 index afe56266e..000000000 --- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2_moe.py +++ /dev/null @@ -1,178 +0,0 @@ -"""Qwen2 MoE CCE patch. Adapted from transformers v4.51.2""" - -# pylint: disable=duplicate-code - -from types import MethodType -from typing import Optional, Union - -import torch -import transformers -from cut_cross_entropy.transformers.utils import ( - PatchOptions, - TransformersModelT, - apply_lce, -) -from transformers.models.qwen2_moe.modeling_qwen2_moe import ( - MoeCausalLMOutputWithPast, - MoeModelOutputWithPast, - load_balancing_loss_func, -) -from transformers.utils.deprecation import deprecate_kwarg -from transformers.utils.generic import can_return_tuple - -_PATCH_OPTS: PatchOptions | None = None - - -@can_return_tuple -@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") -def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[list[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_router_logits: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - logits_to_keep: Union[int, torch.Tensor] = 0, - **loss_kwargs, -) -> MoeCausalLMOutputWithPast: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. - This is useful when using packed tensor format (single dimension for batch and sequence length). - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, Qwen2MoeForCausalLM - - >>> model = Qwen2MoeForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
- ```""" - - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_router_logits = ( - output_router_logits - if output_router_logits is not None - else self.config.output_router_logits - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs: MoeModelOutputWithPast = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - output_router_logits=output_router_logits, - cache_position=cache_position, - ) - - hidden_states = outputs.last_hidden_state - loss = None - logits = None - - if hidden_states is None: - raise ValueError("hidden_states is None") - - # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - slice_indices = ( - slice(-logits_to_keep, None) - if isinstance(logits_to_keep, int) - else logits_to_keep - ) - - if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training): - assert labels is not None - loss = apply_lce( - hidden_states[:, slice_indices, :], - self.lm_head.weight, - labels, - _PATCH_OPTS, - **loss_kwargs, - ) - else: - logits = self.lm_head(hidden_states[:, slice_indices, :]) - - if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) - - aux_loss = None - if output_router_logits: - aux_loss = load_balancing_loss_func( - outputs.router_logits, - self.num_experts, - self.num_experts_per_tok, - attention_mask, - ) - if labels is not None: - loss += self.router_aux_loss_coef * aux_loss.to( # type: ignore - loss.device # type: ignore - ) # make sure to reside in the same device - - return MoeCausalLMOutputWithPast( - loss=loss, - aux_loss=aux_loss, # type: ignore - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - router_logits=outputs.router_logits, - ) - - -def patch_qwen2_moe( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - global _PATCH_OPTS # pylint: disable=global-statement - - from transformers.models.qwen2_moe import modeling_qwen2_moe - - _PATCH_OPTS = patch_options - - if isinstance(maybe_model, transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_qwen2_moe.Qwen2MoeForCausalLM - ), f"Expected a Qwen3MoeForCausalLM model. Got {type(maybe_model)}." - maybe_model.forward = MethodType(forward, maybe_model) - - return maybe_model - - modeling_qwen2_moe.Qwen2MoeForCausalLM.forward = forward - return None diff --git a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2_vl.py b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2_vl.py deleted file mode 100644 index 79af01cfa..000000000 --- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2_vl.py +++ /dev/null @@ -1,239 +0,0 @@ -"""Qwen2 VL CCE patch. 
Adapted from transformers v4.51.2""" - -# pylint: disable=duplicate-code - -from types import MethodType -from typing import Optional, Tuple, Union - -import torch -import transformers -from cut_cross_entropy.transformers.utils import ( - PatchOptions, - TransformersModelT, - apply_lce, -) -from torch.nn import CrossEntropyLoss -from transformers.models.qwen2_vl.modeling_qwen2_vl import ( - Qwen2VLCausalLMOutputWithPast, -) - -_PATCH_OPTS: PatchOptions | None = None - - -def cce_forward_multimodal( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[list[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - pixel_values: Optional[torch.Tensor] = None, - pixel_values_videos: Optional[torch.FloatTensor] = None, - image_grid_thw: Optional[torch.LongTensor] = None, - video_grid_thw: Optional[torch.LongTensor] = None, - rope_deltas: Optional[torch.LongTensor] = None, - cache_position: Optional[torch.LongTensor] = None, -) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, Qwen2VLForConditionalGeneration - - >>> model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct") - >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct") - - >>> messages = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "What is shown in this image?"}, - ], - }, - ] - >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos]) - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..." 
- ```""" - - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - if inputs_embeds is None: - inputs_embeds = self.model.embed_tokens(input_ids) - if pixel_values is not None: - pixel_values = pixel_values.type(self.visual.get_dtype()) - image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw) - n_image_tokens = (input_ids == self.config.image_token_id).sum().item() - n_image_features = image_embeds.shape[0] - if n_image_tokens != n_image_features: - raise ValueError( - f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" - ) - image_mask = ( - (input_ids == self.config.image_token_id) - .unsqueeze(-1) - .expand_as(inputs_embeds) - .to(inputs_embeds.device) - ) - image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds) # type: ignore - - if pixel_values_videos is not None: - pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype()) - video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw) - n_video_tokens = (input_ids == self.config.video_token_id).sum().item() - n_video_features = video_embeds.shape[0] - if n_video_tokens != n_video_features: - raise ValueError( - f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}" - ) - video_mask = ( - (input_ids == self.config.video_token_id) - .unsqueeze(-1) - .expand_as(inputs_embeds) - .to(inputs_embeds.device) - ) - video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds) # type: ignore - - if attention_mask is not None: - attention_mask = attention_mask.to(inputs_embeds.device) - - # if we get 4D attention mask we cannot calculate rope deltas anymore. 
TODO @raushan fixme - if position_ids is None and (attention_mask is None or attention_mask.ndim == 2): - # calculate RoPE index once per generation in the pre-fill stage only - if ( - (cache_position is not None and cache_position[0] == 0) - or self.rope_deltas is None - or (past_key_values is None or past_key_values.get_seq_length() == 0) # type: ignore - ): - position_ids, rope_deltas = self.get_rope_index( - input_ids, image_grid_thw, video_grid_thw, attention_mask - ) - self.rope_deltas = rope_deltas - # then use the prev pre-calculated rope-deltas to get the correct position ids - else: - batch_size, seq_length, _ = inputs_embeds.shape - delta = ( - cache_position[0] + self.rope_deltas - if cache_position is not None - else 0 - ) - position_ids = torch.arange(seq_length, device=inputs_embeds.device) # type: ignore - position_ids = position_ids.view(1, -1).expand(batch_size, -1) # type: ignore - if cache_position is not None: # otherwise `deltas` is an int `0` - delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0) # type: ignore - delta = delta.to(position_ids.device) # type: ignore - position_ids = position_ids.add(delta) # type: ignore - position_ids = position_ids.unsqueeze(0).expand(3, -1, -1) # type: ignore - - outputs = self.model( - input_ids=None, - position_ids=position_ids, - attention_mask=attention_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - ) - - hidden_states = outputs[0] - logits = None - loss = None - - if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training): - assert labels is not None - loss = apply_lce( - hidden_states, - self.lm_head.weight, - labels, - _PATCH_OPTS, - ) - else: - logits = self.lm_head(hidden_states) - - if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return Qwen2VLCausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - rope_deltas=self.rope_deltas, - ) - - -def patch_qwen2_vl( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - global _PATCH_OPTS # pylint: disable=global-statement - - from transformers.models.qwen2_vl import modeling_qwen2_vl - - _PATCH_OPTS = patch_options - - if isinstance(maybe_model, transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_qwen2_vl.Qwen2VLForConditionalGeneration - ), f"Expected a Qwen2VLForConditionalGeneration model. Got {type(maybe_model)}." 
- maybe_model.forward = MethodType(cce_forward_multimodal, maybe_model) - - return maybe_model - - modeling_qwen2_vl.Qwen2VLForConditionalGeneration.forward = cce_forward_multimodal - return None diff --git a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen3.py b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen3.py deleted file mode 100644 index 799a4f357..000000000 --- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen3.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Qwen3 CCE patch. The model inherits Llama's modeling code and uses the same forward method.""" - -# pylint: disable=duplicate-code - -from types import MethodType - -import transformers -from cut_cross_entropy.transformers.utils import ( - PatchOptions, - TransformersModelT, -) - - -def patch_qwen3( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - from transformers.models.qwen3 import modeling_qwen3 - - # Set the _PATCH_OPTS in the llama patch file - import axolotl.integrations.cut_cross_entropy.monkeypatch.llama as llama_patch - - llama_patch._PATCH_OPTS = patch_options # pylint: disable=protected-access - - from axolotl.integrations.cut_cross_entropy.monkeypatch.llama import cce_forward - - if isinstance(maybe_model, transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_qwen3.Qwen3ForCausalLM - ), f"Expected a Qwen3ForCausalLM model. Got {type(maybe_model)}." - maybe_model.forward = MethodType(cce_forward, maybe_model) - return maybe_model - - modeling_qwen3.Qwen3ForCausalLM.forward = cce_forward - return None diff --git a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen3_moe.py b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen3_moe.py deleted file mode 100644 index 90466e64b..000000000 --- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen3_moe.py +++ /dev/null @@ -1,183 +0,0 @@ -"""Qwen3 MoE CCE patch. 
Adapted from transformers v4.51.2""" - -# pylint: disable=duplicate-code - -from types import MethodType -from typing import Optional, Union - -import torch -import transformers -from cut_cross_entropy.transformers.utils import ( - PatchOptions, - TransformersModelT, - apply_lce, -) -from transformers.models.qwen3_moe.modeling_qwen3_moe import ( - KwargsForCausalLM, - MoeCausalLMOutputWithPast, - MoeModelOutputWithPast, - load_balancing_loss_func, -) -from transformers.processing_utils import Unpack -from transformers.utils.deprecation import deprecate_kwarg -from transformers.utils.generic import can_return_tuple - -_PATCH_OPTS: PatchOptions | None = None - - -@can_return_tuple -@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") -def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[list[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_router_logits: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - logits_to_keep: Union[int, torch.Tensor] = 0, - **kwargs: Unpack[KwargsForCausalLM], -) -> MoeCausalLMOutputWithPast: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. - This is useful when using packed tensor format (single dimension for batch and sequence length). - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, Qwen3MoeForCausalLM - - >>> model = Qwen3MoeForCausalLM.from_pretrained("Qwen/Qwen3-MoE-15B-A2B") - >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-MoE-15B-A2B") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
- ```""" - - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_router_logits = ( - output_router_logits - if output_router_logits is not None - else self.config.output_router_logits - ) - - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs: MoeModelOutputWithPast = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - output_router_logits=output_router_logits, - cache_position=cache_position, - **kwargs, - ) - - hidden_states = outputs.last_hidden_state - - if hidden_states is None: - raise ValueError("hidden_states is None") - - loss = None - logits = None - - # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - slice_indices = ( - slice(-logits_to_keep, None) - if isinstance(logits_to_keep, int) - else logits_to_keep - ) - - if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training): - assert labels is not None - loss = apply_lce( - hidden_states[:, slice_indices, :], - self.lm_head.weight, - labels, - _PATCH_OPTS, - **kwargs, - ) - else: - logits = self.lm_head(hidden_states[:, slice_indices, :]) - - if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size, **kwargs) - - aux_loss = None - if output_router_logits: - aux_loss = load_balancing_loss_func( - outputs.router_logits, - self.num_experts, - self.num_experts_per_tok, - attention_mask, - ) - if labels is not None: - loss += self.router_aux_loss_coef * aux_loss.to( # type: ignore - loss.device # type: ignore - ) # make sure to reside in the same device - - return MoeCausalLMOutputWithPast( - loss=loss, - aux_loss=aux_loss, # type: ignore - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - router_logits=outputs.router_logits, - ) - - -def patch_qwen3_moe( - maybe_model: TransformersModelT | str | transformers.PretrainedConfig, - patch_options: PatchOptions, -) -> TransformersModelT | None: - global _PATCH_OPTS # pylint: disable=global-statement - - from transformers.models.qwen3_moe import modeling_qwen3_moe - - _PATCH_OPTS = patch_options - - if isinstance(maybe_model, transformers.PreTrainedModel): - assert isinstance( - maybe_model, modeling_qwen3_moe.Qwen3MoeForCausalLM - ), f"Expected a Qwen3MoeForCausalLM model. Got {type(maybe_model)}." - maybe_model.forward = MethodType(forward, maybe_model) - - return maybe_model - - modeling_qwen3_moe.Qwen3MoeForCausalLM.forward = forward - return None diff --git a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/utils.py b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/utils.py deleted file mode 100644 index b808b9f0d..000000000 --- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/utils.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (C) 2024 Apple Inc. All Rights Reserved. 
- -"""Monkeypatch for apply_lce to add softcap.""" - -import torch -from cut_cross_entropy import linear_cross_entropy -from cut_cross_entropy.transformers.utils import PatchOptions - - -def apply_lce( - e: torch.Tensor, - c: torch.Tensor, - labels: torch.Tensor, - opts: PatchOptions, - bias: torch.Tensor | None = None, - softcap: float | None = None, - **loss_kwargs, -) -> torch.Tensor: - """Monkey patch for apply_lce to support softcap kwarg.""" - num_items_in_batch = loss_kwargs.get("num_items_in_batch", None) - cce_kwargs = opts.to_kwargs() - if num_items_in_batch is not None and cce_kwargs["reduction"] == "mean": - cce_kwargs["reduction"] = "sum" - else: - num_items_in_batch = None - - loss = linear_cross_entropy( - e, - c, - labels.to(e.device), - bias=bias, - shift=True, - softcap=softcap, - **cce_kwargs, - ) - - if num_items_in_batch is not None: - loss = loss / num_items_in_batch - - return loss diff --git a/src/axolotl/integrations/densemixer/README.md b/src/axolotl/integrations/densemixer/README.md new file mode 100644 index 000000000..62da1bb07 --- /dev/null +++ b/src/axolotl/integrations/densemixer/README.md @@ -0,0 +1,12 @@ +# DenseMixer + +See [DenseMixer](https://github.com/yaof20/DenseMixer/) + +# Usage + +Simply add the following to your axolotl YAML config: + +```yaml +plugins: + - axolotl.integrations.densemixer.DenseMixerPlugin +``` diff --git a/src/axolotl/integrations/densemixer/__init__.py b/src/axolotl/integrations/densemixer/__init__.py new file mode 100644 index 000000000..901bdc1c1 --- /dev/null +++ b/src/axolotl/integrations/densemixer/__init__.py @@ -0,0 +1,5 @@ +"""Integration entry point for the DenseMixer plugin.""" + +from .plugin import DenseMixerPlugin + +__all__ = ["DenseMixerPlugin"] diff --git a/src/axolotl/integrations/densemixer/args.py b/src/axolotl/integrations/densemixer/args.py new file mode 100644 index 000000000..c8bf54931 --- /dev/null +++ b/src/axolotl/integrations/densemixer/args.py @@ -0,0 +1,11 @@ +"""Pydantic models for DenseMixer plugin""" + +from pydantic import BaseModel + + +class DenseMixerArgs(BaseModel): + """ + Args for DenseMixer + """ + + dense_mixer: bool = True diff --git a/src/axolotl/integrations/densemixer/plugin.py b/src/axolotl/integrations/densemixer/plugin.py new file mode 100644 index 000000000..2d0bf32cd --- /dev/null +++ b/src/axolotl/integrations/densemixer/plugin.py @@ -0,0 +1,42 @@ +"""DenseMixer plugin for Axolotl""" + +import importlib + +from axolotl.integrations.base import BasePlugin +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) + + +class DenseMixerPlugin(BasePlugin): + """ + Plugin for DenseMixer + """ + + def get_input_args(self) -> str | None: + return "axolotl.integrations.densemixer.args.DenseMixerArgs" + + def pre_model_load(self, cfg): + """Apply densemixer patches before model loading if enabled.""" + if cfg.dense_mixer: + if not importlib.util.find_spec("densemixer"): + raise RuntimeError( + "DenseMixer is not installed. 
Install it with `pip install densemixer`" + ) + + from densemixer.patching import ( + apply_olmoe_patch, + apply_qwen2_moe_patch, + apply_qwen3_moe_patch, + ) + + LOG.info( + f"Applying DenseMixer patches for model type: {cfg.model_config_type}" + ) + + if cfg.model_config_type == "olmoe": + apply_olmoe_patch() + if cfg.model_config_type == "qwen2_moe": + apply_qwen2_moe_patch() + if cfg.model_config_type == "qwen3_moe": + apply_qwen3_moe_patch() diff --git a/src/axolotl/integrations/grokfast/__init__.py b/src/axolotl/integrations/grokfast/__init__.py index c8c352bbe..234d27226 100644 --- a/src/axolotl/integrations/grokfast/__init__.py +++ b/src/axolotl/integrations/grokfast/__init__.py @@ -2,15 +2,15 @@ Grokfast plugin for Axolotl """ -import logging - from transformers.trainer_callback import TrainerCallback +from axolotl.utils.logging import get_logger + from ..base import BasePlugin from .args import GrokfastArgs # pylint: disable=unused-import. # noqa: F401 from .optimizer import gradfilter_ema -LOG = logging.getLogger("axolotl.integrations.grokfast") +LOG = get_logger(__name__) class GrokfastCallbackHandler(TrainerCallback): diff --git a/src/axolotl/integrations/kd/README.md b/src/axolotl/integrations/kd/README.md index 4b15ad31d..5e35cf3d7 100644 --- a/src/axolotl/integrations/kd/README.md +++ b/src/axolotl/integrations/kd/README.md @@ -11,7 +11,7 @@ kd_ce_alpha: 0.1 kd_alpha: 0.9 kd_temperature: 1.0 -torch_compile: True # torch>=2.5.1, recommended to reduce vram +torch_compile: True # torch>=2.6.0, recommended to reduce vram datasets: - path: ... diff --git a/src/axolotl/integrations/kd/__init__.py b/src/axolotl/integrations/kd/__init__.py index 8a6e3eda1..4c8535a0a 100644 --- a/src/axolotl/integrations/kd/__init__.py +++ b/src/axolotl/integrations/kd/__init__.py @@ -15,7 +15,12 @@ """ Plugin init to add KD support to Axolotl. """ +from typing import Any + +from transformers import Trainer + from axolotl.integrations.base import BasePlugin +from axolotl.integrations.kd.callbacks import KDTemperatureSchedulerCallback + from .args import KDArgs # pylint: disable=unused-import.
# noqa: F401 @@ -28,9 +33,75 @@ class KDPlugin(BasePlugin): def get_input_args(self): return "axolotl.integrations.kd.KDArgs" + def get_training_args_mixin(self): + return "axolotl.integrations.kd.args.KDTrainingArgsMixin" + def get_trainer_cls(self, cfg): if cfg.kd_trainer: from .trainer import AxolotlKDTrainer return AxolotlKDTrainer return None + + def get_training_args(self, cfg): + return { + "kd_ce_alpha": cfg.kd_ce_alpha, + "kd_alpha": cfg.kd_alpha, + "kd_temperature": cfg.kd_temperature, + "kd_beta": cfg.kd_beta, + "kd_normalize_topk": cfg.kd_normalize_topk, + } + + def get_collator_cls_and_kwargs(self, cfg, is_eval=False): + if not cfg.kd_trainer: + return None, None + + from .collator import DataCollatorForKD, KDBatchSamplerDataCollatorForSeq2Seq + + use_batch_sampler_collator = False + if is_eval is False and cfg.sample_packing: + use_batch_sampler_collator = True + if cfg.eval_sample_packing and is_eval: + use_batch_sampler_collator = True + + if cfg.kd_online_server_base_url: + from .collator_online_teacher import OnlineTeacherCollator + + return OnlineTeacherCollator, { + "kd_online_server_base_url": cfg.kd_online_server_base_url, + "kd_online_topk": cfg.kd_online_topk, + "kd_temperature": cfg.kd_temperature, + "kd_online_server": cfg.kd_online_server, + "kd_online_timeout": cfg.kd_online_timeout, + "kd_normalize_topk": cfg.kd_normalize_topk, + } + + if use_batch_sampler_collator: + return KDBatchSamplerDataCollatorForSeq2Seq, {} + return DataCollatorForKD, {} + + def pre_model_load(self, cfg): + from .kernels.models import apply_kernel + + apply_kernel(cfg.model_config_type) + + def add_callbacks_post_trainer(self, cfg: Any, trainer: Trainer) -> list: + """ + Adds temp scheduler callback to the Trainer instance. + + Args: + cfg (Any): Configuration object containing the sparse recipe. + trainer (Trainer): Huggingface Trainer instance. + + Returns: + list: List containing the configured callback instances. + """ + if cfg.kd_temperature_min is not None and cfg.kd_online_server_base_url: + callback = KDTemperatureSchedulerCallback( + cfg.kd_temperature, + cfg.kd_temperature_min, + trainer, + ) + return [callback] + + return [] diff --git a/src/axolotl/integrations/kd/args.py b/src/axolotl/integrations/kd/args.py index 2fbba2c6a..758bc8917 100644 --- a/src/axolotl/integrations/kd/args.py +++ b/src/axolotl/integrations/kd/args.py @@ -15,9 +15,19 @@ """ Plugin args for KD support. """ -from typing import Optional +from dataclasses import dataclass +from enum import Enum -from pydantic import BaseModel +from pydantic import BaseModel, Field + + +class InferenceServerType(str, Enum): + """ + Online inferences server types to handle different request args + """ + + vllm = "vllm" # pylint: disable=invalid-name + sglang = "sglang" # pylint: disable=invalid-name class KDArgs(BaseModel): @@ -25,13 +35,41 @@ class KDArgs(BaseModel): Input args for knowledge distillation. 
""" - kd_trainer: Optional[bool] = None # whether to use KD trainer - kd_ce_alpha: Optional[float] = ( + kd_trainer: float | None = None # whether to use KD trainer + kd_ce_alpha: float | None = ( None # loss coefficient for cross-entropy loss during KD ) - kd_alpha: Optional[float] = None # loss coefficient for KD loss - kd_temperature: Optional[float] = None # temperature for sampling during KD - kd_zscore_base_temp: Optional[float] = None # base temperature for zscore scaling - kd_top_k_before_softmax: Optional[bool] = ( - None # whether to sample top k before softmax during KD + kd_alpha: float | None = None # loss coefficient for KD loss + kd_temperature: float | None = None # temperature for sampling during KD + kd_beta: float | None = 0.0 # beta coefficient for ratio of fwd and reverse KL + kd_normalize_topk: bool | None = ( + None # whether to normalize student logits during KD + ) + + # TODO online kd + kd_online_server_base_url: str | None = None + kd_online_topk: int | None = None + kd_online_server: InferenceServerType | None = Field( + default_factory=lambda: InferenceServerType.vllm + ) + kd_online_timeout: int | None = 120 + kd_temperature_min: float | None = ( + None # kd temperature scheduling during online kd + ) + + +@dataclass +class KDTrainingArgsMixin: + """ + Additional args for KD training. + """ + + kd_ce_alpha: float | None = ( + None # loss coefficient for cross-entropy loss during KD + ) + kd_alpha: float | None = None # loss coefficient for KD loss + kd_temperature: float | None = None # temperature for sampling during KD + kd_beta: float | None = None # beta coefficient for ratio of fwd and reverse KL + kd_normalize_topk: float | None = ( + None # whether to normalize student logits during KD ) diff --git a/src/axolotl/integrations/kd/callbacks.py b/src/axolotl/integrations/kd/callbacks.py new file mode 100644 index 000000000..911c3d517 --- /dev/null +++ b/src/axolotl/integrations/kd/callbacks.py @@ -0,0 +1,36 @@ +""" +Transformers trainer callbacks to schedule the KD temperature during training +""" + +import math + +from transformers.trainer_callback import TrainerCallback + + +class KDTemperatureSchedulerCallback(TrainerCallback): + """ + KD temperature scheduler callback for the trainer. 
+ """ + + def __init__(self, temperature_start, temperature_min, trainer): + self.temperature_start = temperature_start + self.temperature_min = temperature_min + self.temperature = temperature_start + + self.trainer = trainer + + def on_step_end( + self, args, state, control, **kwargs + ): # pylint: disable=unused-argument + # cosine decay temperature over the max steps + + progress = state.global_step / state.max_steps + # Cosine decay factor: 0.5 * (1 + cos(pi * progress)) + # This factor goes from 1 (at progress=0) to 0 (at progress=1) + decay_factor = 0.5 * (1.0 + math.cos(math.pi * progress)) + self.temperature = self.temperature_start - ( + (self.temperature_start - self.temperature_min) * (1.0 - decay_factor) + ) + + if hasattr(self.trainer.data_collator, "kd_temperature"): + self.trainer.data_collator.kd_temperature = self.temperature diff --git a/src/axolotl/integrations/kd/chat_template.py b/src/axolotl/integrations/kd/chat_template.py index eb067cd04..6376ecb09 100644 --- a/src/axolotl/integrations/kd/chat_template.py +++ b/src/axolotl/integrations/kd/chat_template.py @@ -15,12 +15,15 @@ """ Chat template prompt strategy loader with KD support """ +import logging from typing import Any, Dict import torch from axolotl.prompt_strategies.chat_template import ChatTemplateStrategy, StrategyLoader +LOG = logging.getLogger(__name__) + class ChatTemplateStrategyWithKD(ChatTemplateStrategy): """ @@ -101,10 +104,8 @@ class ChatTemplateStrategyWithKD(ChatTemplateStrategy): # fill with -inf for padding_len tokens for top_k tokens # extend target_logprobs with a padding_len x top_k 2D list filled with -inf - # for causal models, if we start the range at 1, then we don't need to shift in the trainer - # otherwise, we need to shift in the trainer - shift = 0 - for _ in range(shift, input_padding_len): + # we shift for causal models in the trainer, so start the range from 0 + for _ in range(0, input_padding_len): target_logprobs.append([-float("inf")] * top_k) target_token_ids.append(list(range(top_k))) target_mask.append([0] * top_k) @@ -143,6 +144,10 @@ class ChatTemplateStrategyWithKD(ChatTemplateStrategy): # # Convert from log to probability teacher_probs_t1 = position_logprobs_tensor.exp() + # normalize probabilities to sum to 1 in case they aren't already + teacher_probs_t1_sum = teacher_probs_t1.sum(dim=0, keepdim=True) + if teacher_probs_t1_sum > 1e-9: + teacher_probs_t1 = teacher_probs_t1 / teacher_probs_t1_sum if self.kd_temperature != self.gen_temperature: # Exponentiate by factor (T1 / T2) exponent = self.gen_temperature / self.kd_temperature @@ -162,12 +167,6 @@ class ChatTemplateStrategyWithKD(ChatTemplateStrategy): target_logprobs.append(position_logprobs_scaled) target_token_ids.append(position_token_ids) - if shift == 1: - # since we started at index 1 for causal, we need one more padding token - target_logprobs.append([-float("inf")] * top_k) - target_token_ids.append(list(range(top_k))) - target_mask.append([0] * top_k) - # Update sample with transformed logprobs sample["target_logprobs"] = target_logprobs sample["target_token_ids"] = target_token_ids @@ -184,12 +183,123 @@ class ChatTemplateStrategyWithKD(ChatTemplateStrategy): return tokenized_prompt +class ChatTemplateStrategyWithKDv2(ChatTemplateStrategyWithKD): + """ + Strat for datasets with complete structured KD logprob data + """ + + def transform_logprobs(self, sample): + """ + Transform logprobs to target format for KD training + """ + # pylint: disable=duplicate-code + + logprobs = sample.pop(self.logprobs_field) 
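        # Illustrative note (hypothetical values, not taken from any real dataset):
        # each entry of `logprobs` holds the teacher's top-k logprobs for one position
        # and may be None where the teacher returned nothing, e.g.
        #   logprobs = [None, [-0.05, -3.2, -4.6], [-0.21, -1.9, -5.0], ...]
        # The code below derives a usable top_k from the non-empty rows, left-pads the
        # student-only input positions with -inf rows, and masks positions whose label
        # is -100 so they do not contribute to the KD loss.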
+ target_seq_len = len(logprobs) + input_seq_len = len(sample["input_ids"]) + input_padding_len = input_seq_len - target_seq_len + # get non-zero top-k (prune None logprobs from vllm data step) + top_k_vals = [ + len(logprobs[i]) + for i in range(len(logprobs)) + if logprobs[i] is not None and len(logprobs[i]) + ] + max_top_k = max(set(top_k_vals), key=top_k_vals.count) + min_top_k = min(set(top_k_vals), key=top_k_vals.count) + top_k = min(max_top_k, min_top_k) + if top_k == 0: + raise ValueError("No non-zero top-k logprobs found.") + + target_logprobs = [] + target_token_ids = [] + target_mask = [] + + if input_padding_len < 0: + # logprobs is longer than target_seq_len, + # so we need to slice from the left/beginning of logprobs + logprobs = logprobs[:-input_seq_len] + input_padding_len = 0 + # target_seq_len = input_seq_len + + # truncate the second dimension of the logprobs to top_k + logprobs = [row[:top_k] for row in logprobs] + + # fill with -inf for padding_len tokens for top_k tokens + # extend target_logprobs with a padding_len x top_k 2D list filled with -inf + + # we shift for causal models in the trainer, so start the range from 0 + for _ in range(0, input_padding_len): + target_logprobs.append([-float("inf")] * top_k) + target_token_ids.append(list(range(top_k))) + target_mask.append([0] * top_k) + + for position in range(input_padding_len, input_seq_len): + if sample["labels"][position] == -100: + target_mask.append([0] * top_k) + else: + target_mask.append([1] * top_k) + + for token_pos_logprobs, pos_target_token_ids in zip( + logprobs, sample["target_token_ids"] + ): + # Convert to a tensor for easier manipulation + position_logprobs_tensor = torch.tensor( + token_pos_logprobs, dtype=torch.float + ) + + # Now we have distribution at T1 in log form, i.e. log p_{T1}(k). + # Next, re-scale to T2 = self.kd_temperature via exponent-based trick + # p_{T2}(k) = [p_{T1}(k)]^(T1 / T2) / Z + # + # Convert from log to probability + teacher_probs_t1 = position_logprobs_tensor.exp() + # normalize probabilities to sum to 1 in case they aren't already + teacher_probs_t1_sum = teacher_probs_t1.sum(dim=0, keepdim=True) + if teacher_probs_t1_sum > 1e-9: + teacher_probs_t1 = teacher_probs_t1 / teacher_probs_t1_sum + if self.kd_temperature != self.gen_temperature: + # Exponentiate by factor (T1 / T2) + exponent = self.gen_temperature / self.kd_temperature + teacher_probs_t2 = teacher_probs_t1**exponent + else: + teacher_probs_t2 = teacher_probs_t1 + # Re-normalize + teacher_probs_t2 = teacher_probs_t2 / teacher_probs_t2.sum( + dim=0, keepdim=True + ) + # Convert back to log + position_logprobs_tensor = torch.log(teacher_probs_t2) + + # Now we have log p_{teacher, T2}(k) stored in position_logprobs_tensor + position_logprobs_scaled = position_logprobs_tensor.tolist() + + target_logprobs.append(position_logprobs_scaled) + target_token_ids.append(pos_target_token_ids) + + # Update sample with transformed logprobs + sample["target_logprobs"] = target_logprobs + sample["target_token_ids"] = target_token_ids + sample["target_mask"] = target_mask + + return sample + + def _tokenize_single_prompt(self, prompt): + target_token_ids = prompt.get("target_token_ids", None) + + tokenized_prompt = super()._tokenize_single_prompt(prompt) + + if target_token_ids is not None: + tokenized_prompt["target_token_ids"] = target_token_ids + + return tokenized_prompt + + class KDStrategyLoader(StrategyLoader): """ Load ChatTemplateStrategy with KD support using StrategyLoader. 
""" - def _get_strategy_cls(self): + def _get_strategy_cls(self, cfg): # pylint: disable=unused-argument return ChatTemplateStrategyWithKD def _get_strategy_params(self, cfg, ds_cfg: Dict[str, Any]): @@ -204,4 +314,14 @@ class KDStrategyLoader(StrategyLoader): return strategy_params -load = KDStrategyLoader() +class KDStrategyLoaderV2(KDStrategyLoader): + """ + Load KD chat template datasets with pre-tokenized logprob data + """ + + def _get_strategy_cls(self, cfg): # pylint: disable=unused-argument + return ChatTemplateStrategyWithKDv2 + + +load_legacy = KDStrategyLoader() +load = KDStrategyLoaderV2() diff --git a/src/axolotl/integrations/kd/collator.py b/src/axolotl/integrations/kd/collator.py index de63869c7..0cc745b78 100644 --- a/src/axolotl/integrations/kd/collator.py +++ b/src/axolotl/integrations/kd/collator.py @@ -47,11 +47,16 @@ class DataCollatorForKD(DataCollatorForSeq2Seq): position_pad_token_id: int = 0 return_tensors: str = "pt" + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True + def __call__(self, features, return_tensors=None): if return_tensors is None: return_tensors = self.return_tensors padding_side = self.tokenizer.padding_side + max_len = 0 # Pad labels and position_ids first for feature_name, pad_token_id in [ @@ -102,7 +107,9 @@ class DataCollatorForKD(DataCollatorForSeq2Seq): target_mask_list.append(f.pop("target_mask")) # Determine max lengths - max_teacher_seq_len = max(len(seq) for seq in target_logprobs_list) + max_teacher_seq_len = max_len or max( + len(seq) for seq in target_logprobs_list + ) max_k = max(len(seq_k) for seq in target_logprobs_list for seq_k in seq) padded_target_logprobs = [] @@ -209,7 +216,9 @@ class KDBatchSamplerDataCollatorForSeq2Seq(DataCollatorForKD): # We want to produce a single "merged" feature dict for each sub-batch. out_features = [{} for _ in features] - for i, sub_features in enumerate(features): + for i, sub_features in enumerate( # pylint: disable=too-many-nested-blocks + features + ): # sub_features is a list of dicts, each dict = one sequence’s features # We'll merge them into out_features[i]. # @@ -243,10 +252,17 @@ class KDBatchSamplerDataCollatorForSeq2Seq(DataCollatorForKD): # For example, input_ids or labels are often arrays. arrays = [] for feat in sub_features: - if field_name in feat: + if field_name in feat and isinstance( + feat[field_name], (list, torch.Tensor) + ): + if isinstance( + feat[field_name][0], (dict, str) + ): # pylint: disable=too-many-nested-blocks + continue arr = np.array(feat[field_name]) arrays.append(arr) - out_features[i][field_name] = np.concatenate(arrays) + if arrays: + out_features[i][field_name] = np.concatenate(arrays) # 3) Now call the parent collator, which will do: # - padding of labels/position_ids diff --git a/src/axolotl/integrations/kd/collator_online_teacher.py b/src/axolotl/integrations/kd/collator_online_teacher.py new file mode 100644 index 000000000..584ace481 --- /dev/null +++ b/src/axolotl/integrations/kd/collator_online_teacher.py @@ -0,0 +1,561 @@ +""" +Packed data loader for online teacher training supporting vllm and sglang. 
+""" + +import hashlib +import hmac +import logging +from typing import Any, Dict, List, Optional + +import requests +import torch +from orjson import orjson + +from axolotl.integrations.kd.collator import KDBatchSamplerDataCollatorForSeq2Seq +from axolotl.integrations.kd.utils import normalize_logprobs +from axolotl.utils.data.utils import retry_on_request_exceptions + +LOG = logging.getLogger(__name__) + + +def hmac_sha_from_int_list(int_list, key, hash_func=hashlib.sha256): + """ + Create HMAC-SHA hash from a list of integers + + Args: + int_list: List of integers + key: Secret key (string or bytes) + hash_func: Hash function (default: sha256) + + Returns: + HMAC digest as hex string + """ + # Convert key to bytes if it's a string + if isinstance(key, str): + key = key.encode("utf-8") + + # Convert list of ints to bytes + # Method 1: Convert each int to bytes and concatenate + data = b"".join(i.to_bytes(4, byteorder="big") for i in int_list) + + # Create HMAC + h = hmac.new(key, data, hash_func) + return h.hexdigest() + + +class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq): + """ + Collator for online teacher training. + """ + + DEFAULT_LABEL_PAD_TOKEN_ID: int = -100 + + def __init__( + self, + *args: Any, + kd_online_server_base_url: Optional[str] = None, + kd_online_topk: Optional[int] = None, + kd_temperature: Optional[float] = 1.0, + kd_online_server: Optional[str] = "vllm", + kd_online_timeout: Optional[int] = 120, + kd_cache_dir: Optional[str] = None, + kd_normalize_topk: Optional[bool] = True, + **kwargs: Any, + ): + super().__init__(*args, **kwargs) + + if kd_online_server_base_url is None: + raise ValueError( + "kd_online_server_base_url must be provided for OnlineTeacherDataloader" + ) + if kd_online_topk is None or kd_online_topk <= 0: + raise ValueError( + "kd_online_topk must be a positive integer for OnlineTeacherDataloader" + ) + + self.kd_online_server_base_url = kd_online_server_base_url.rstrip("/") + self.kd_online_topk = kd_online_topk + self.kd_temperature = kd_temperature + self.kd_online_server = kd_online_server + self.http_session = requests.Session() + self.kd_online_timeout = kd_online_timeout + self.kd_cache_dir = kd_cache_dir + self.kd_normalize_topk = kd_normalize_topk + + def _normalize_logprobs(self, raw_logprobs: List[float]) -> List[float]: + """ + Re-normalizes top-k raw logprobs as probabilities, and converts back to logprobs. + """ + if not raw_logprobs or self.kd_online_topk == 0: + return ( + [-float("inf")] * self.kd_online_topk if self.kd_online_topk > 0 else [] + ) + + raw_logprobs_tensor = torch.tensor(raw_logprobs, dtype=torch.float32) + return normalize_logprobs(raw_logprobs_tensor, self.kd_online_topk).tolist() + + @retry_on_request_exceptions(max_retries=10, delay=5) + def fetch_online_logprobs_sglang( + self, batch_input_ids: List[List[int]], labels: List[List[int]] + ): + """ + Fetches logprobs from an online teacher served by sglang for a batch of input_ids. + Assumes API returns token IDs as strings in logprob dictionary keys. + """ + api_endpoint = f"{self.kd_online_server_base_url}/generate" + + payload = { + "input_ids": batch_input_ids, + "return_logprob": True, + "top_logprobs_num": self.kd_online_topk, + "logprob_start_len": 0, + "return_text_in_logprobs": True, + "echo": True, + "sampling_params": { + "max_new_tokens": 0, + "temperature": self.kd_temperature, + "skip_special_tokens": False, + }, + } + + # Initialize with empty lists, so if API call fails, these are returned. 
+ ret_data_target_token_ids: List[List[List[int]]] = [] + ret_data_target_logprobs: List[List[List[float]]] = [] + ret_data_target_mask: List[List[List[int]]] = [] + + try: + response = self.http_session.post( + api_endpoint, json=payload, timeout=self.kd_online_timeout + ) + response.raise_for_status() + api_data: list[dict] = response.json() + + # Ensure api_data is a list, and its length matches batch_input_ids + if not isinstance(api_data, list) or len(api_data) != len(batch_input_ids): + LOG.error( + f"API response format error. Expected a list of {len(batch_input_ids)} " + f"items, got {type(api_data)} with length {len(api_data) if isinstance(api_data, list) else 'N/A'}." + ) + # Return empty data; items processed later will get default empty KD fields + return { + "target_token_ids": ret_data_target_token_ids, + "target_logprobs": ret_data_target_logprobs, + "target_mask": ret_data_target_mask, + } + + for sequence_data, seq_input_ids, seq_labels in zip( + api_data, batch_input_ids, labels + ): + current_target_logprobs = [] + current_target_token_ids = [] + current_target_mask = [] + + meta_info = sequence_data.pop("meta_info", {}) + # Ensure input_top_logprobs is a list + input_top_logprobs: Optional[list[None | list[tuple]]] = meta_info.pop( + "input_top_logprobs", [] + ) + if not isinstance(input_top_logprobs, list): + LOG.warning( + f"Received non-list input_top_logprobs: {input_top_logprobs}. Skipping sequence." + ) + input_top_logprobs = [] # Treat as empty + + # basic check that the logprob data len matches the input len, so no need to handle padding + assert len(seq_input_ids) == len(input_top_logprobs) + + for i, _, label in zip( + range(len(seq_input_ids)), seq_input_ids, seq_labels + ): + if i < len(input_top_logprobs) and input_top_logprobs[i] is None: + # this is always the case for the first token. + # there is never logprob data for the first token since that's a true input + # so we replace the None value with padding data + current_target_logprobs.append( + [-float("inf")] * self.kd_online_topk + ) + current_target_token_ids.append([0] * self.kd_online_topk) + current_target_mask.append([0] * self.kd_online_topk) + elif ( + i < len(input_top_logprobs) + and input_top_logprobs[i] is not None + ): + pos_top_logprobs_data = input_top_logprobs[i] + # Ensure pos_top_logprobs_data is a list of lists as expected + if not ( + isinstance(pos_top_logprobs_data, list) + and all( + isinstance(item, list) for item in pos_top_logprobs_data + ) + and len(pos_top_logprobs_data) > 0 + and len(pos_top_logprobs_data[0]) == 3 + ): # [logprob, token_id, token_str] + LOG.warning( + f"Malformed pos_top_logprobs_data: {pos_top_logprobs_data}. Padding this position." 
+ ) + current_target_logprobs.append( + [-float("inf")] * self.kd_online_topk + ) + current_target_token_ids.append([0] * self.kd_online_topk) + current_target_mask.append([0] * self.kd_online_topk) + continue + + # pos_top_logprobs: list of logprobs, pos_token_ids: list of token_ids + pos_logprobs_raw, pos_token_ids, _ = [ + list(row) for row in zip(*pos_top_logprobs_data) + ] + + # Ensure correct length (top_k) + if len(pos_logprobs_raw) < self.kd_online_topk: + pad_len = self.kd_online_topk - len(pos_logprobs_raw) + pos_logprobs_raw.extend([-float("inf")] * pad_len) + pos_token_ids.extend([0] * pad_len) # Pad with 0 token_id + + # truncate to top_k in case the response was longer + current_target_token_ids.append( + pos_token_ids[: self.kd_online_topk] + ) + + if self.kd_normalize_topk: + normalized_logprobs_for_position = self._normalize_logprobs( + pos_logprobs_raw[: self.kd_online_topk] + ) + current_target_logprobs.append( + normalized_logprobs_for_position + ) + else: + current_target_logprobs.append( + pos_logprobs_raw[: self.kd_online_topk] + ) + + # Mask depends on the corresponding label for the student + if label == self.DEFAULT_LABEL_PAD_TOKEN_ID: + current_target_mask.append([0] * self.kd_online_topk) + else: + current_target_mask.append([1] * self.kd_online_topk) + else: + # Pad if no logprobs for this position (either due to length mismatch or None entry) + current_target_logprobs.append( + [-float("inf")] * self.kd_online_topk + ) + current_target_token_ids.append([0] * self.kd_online_topk) + current_target_mask.append([0] * self.kd_online_topk) + + ret_data_target_token_ids.append(current_target_token_ids) + ret_data_target_logprobs.append(current_target_logprobs) + ret_data_target_mask.append(current_target_mask) + + except requests.exceptions.RequestException as e: + LOG.error(f"Error fetching logprobs from online teacher: {e}") + raise e + # ret_logprobs_data will be returned with empty lists, handled by the caller. + except Exception as e: # Catch other potential errors during processing + LOG.error( + f"Unexpected error processing API response in fetch_online_logprobs: {e}", + exc_info=True, + ) + raise e + + return { + "target_token_ids": ret_data_target_token_ids, + "target_logprobs": ret_data_target_logprobs, + "target_mask": ret_data_target_mask, + } + + @retry_on_request_exceptions(max_retries=10, delay=5) + def fetch_online_logprobs_vllm( + self, batch_input_ids: List[List[int]], labels: List[List[int]] + ): + """ + Fetches logprobs from an online teacher served by vllm for a batch of input_ids. + Assumes API returns token IDs as strings in logprob dictionary keys. + """ + api_endpoint = f"{self.kd_online_server_base_url}/v1/completions" + + payload = { + "prompt": batch_input_ids, + "echo": True, + "logprobs": True, + "prompt_logprobs": self.kd_online_topk, + "top_logprobs": self.kd_online_topk, + "max_new_tokens": 0, + "skip_special_tokens": False, + "temperature": self.kd_temperature, + "sampling_params": { + "max_tokens": 0, + }, + } + + # Initialize with empty lists, so if API call fails, these are returned. 
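        # Shape sketch (an assumption inferred from the parsing below, not an
        # authoritative vLLM schema): each choice is expected to expose
        # choices[j]["prompt_logprobs"], a per-position list whose entries are either
        # None or a dict mapping token-id strings to {"logprob": ...} objects; the
        # loop below converts those keys back to ints and pads/truncates each
        # position to kd_online_topk entries.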
+ ret_data_target_token_ids: List[List[List[int]]] = [] + ret_data_target_logprobs: List[List[List[float]]] = [] + ret_data_target_mask: List[List[List[int]]] = [] + + try: + headers = {"Accept-Encoding": "deflate, gzip, br, zstd"} + response = self.http_session.post( + api_endpoint, + json=payload, + headers=headers, + timeout=self.kd_online_timeout, + ) + response.raise_for_status() + api_data: dict = orjson.loads(response.content) + choices: list[dict] = api_data["choices"] + + # Ensure api_data is a list, and its length matches batch_input_ids + if not isinstance(choices, list) or len(choices) != len(batch_input_ids): + LOG.error( + f"API response format error. Expected a list of {len(batch_input_ids)} " + f"items, got {type(api_data)} with length {len(api_data) if isinstance(api_data, list) else 'N/A'}." + ) + # Return empty data; items processed later will get default empty KD fields + return { + "target_token_ids": ret_data_target_token_ids, + "target_logprobs": ret_data_target_logprobs, + "target_mask": ret_data_target_mask, + } + + for sequence_data, seq_input_ids, seq_labels in zip( + choices, batch_input_ids, labels + ): + # seq_input_ids: List[int] + # seq_labels: List[int] + + current_target_logprobs = [] + current_target_token_ids = [] + current_target_mask = [] + + # Ensure input_top_logprobs is a list + input_top_logprobs: Optional[list[None | dict[str, dict]]] = ( + sequence_data.pop("prompt_logprobs", []) + ) + + if not isinstance(input_top_logprobs, list): + LOG.warning( + f"Received non-list input_top_logprobs: {input_top_logprobs}. Skipping sequence." + ) + input_top_logprobs = [] # Treat as empty + + # basic check that the logprob data len matches the input len, so no need to handle padding + assert len(seq_input_ids) == len(input_top_logprobs) + + seq_len = len(seq_input_ids) + + for i, _, label in zip(range(seq_len), seq_input_ids, seq_labels): + if i < len(input_top_logprobs) and input_top_logprobs[i] is None: + # this is always the case for the first token. + # there is never logprob data for the first token since that's a true input + continue + if ( + i < len(input_top_logprobs) + and input_top_logprobs[i] is not None + ): + pos_top_logprobs_data: dict[str, dict] = input_top_logprobs[i] # type: ignore[assignment] + # Ensure pos_top_logprobs_data is a list of lists as expected + if not ( + isinstance(pos_top_logprobs_data, dict) + and all( + isinstance(item, dict) + for item in pos_top_logprobs_data.values() + ) + and len(pos_top_logprobs_data.keys()) > 0 + ): # [logprob, token_id, token_str] + LOG.warning( + f"Malformed pos_top_logprobs_data: {pos_top_logprobs_data}. Padding this position." + ) + current_target_logprobs.append( + [-float("inf")] * self.kd_online_topk + ) + current_target_token_ids.append( + list(range(self.kd_online_topk)) + ) + current_target_mask.append([0] * self.kd_online_topk) + continue + + # pos_top_logprobs: list of logprobs, pos_token_ids: list of token_ids + pos_token_ids_str = list(pos_top_logprobs_data.keys()) + pos_logprobs_dict = pos_top_logprobs_data.values() + pos_token_ids = [ + int(token_id) for token_id in pos_token_ids_str + ] + pos_logprobs_raw = [ + float(logprob.get("logprob", -float("inf"))) + for logprob in pos_logprobs_dict + ] + + # Ensure correct length (top_k) + if len(pos_logprobs_raw) < self.kd_online_topk: + pad_len = self.kd_online_topk - len(pos_logprobs_raw) + LOG.warning( + f"Padding position {i} with {pad_len} top-k tokens and logprobs." 
+ ) + pos_logprobs_raw.extend([-float("inf")] * pad_len) + pos_token_ids.extend([0] * pad_len) # Pad with 0 token_id + + # truncate to top_k in case the response was longer + current_target_token_ids.append( + pos_token_ids[: self.kd_online_topk] + ) + + if self.kd_normalize_topk: + normalized_logprobs_for_position = self._normalize_logprobs( + pos_logprobs_raw[: self.kd_online_topk] + ) + current_target_logprobs.append( + normalized_logprobs_for_position + ) + else: + current_target_logprobs.append( + pos_logprobs_raw[: self.kd_online_topk] + ) + + # Mask depends on the corresponding label for the student + if label == self.DEFAULT_LABEL_PAD_TOKEN_ID: + current_target_mask.append([0] * self.kd_online_topk) + else: + current_target_mask.append([1] * self.kd_online_topk) + else: + # Pad if no logprobs for this position (either due to length mismatch or None entry) + current_target_logprobs.append( + [-float("inf")] * self.kd_online_topk + ) + current_target_token_ids.append( + list(range(self.kd_online_topk)) + ) + current_target_mask.append([0] * self.kd_online_topk) + for i in range(max(0, seq_len - len(current_target_logprobs))): + current_target_logprobs.append( + [-float("inf")] * self.kd_online_topk + ) + current_target_token_ids.append(list(range(self.kd_online_topk))) + current_target_mask.append([0] * self.kd_online_topk) + + ret_data_target_token_ids.append(current_target_token_ids) + ret_data_target_logprobs.append(current_target_logprobs) + ret_data_target_mask.append(current_target_mask) + + # TODO save and load targets to disk for caching for next epoch + # generate a hmac SHA256 hash over the list seq_input_ids and convert it to an int + # if self.kd_cache_dir: + # hash_input_ids = hmac_sha_from_int_list( + # seq_input_ids, f"{self.kd_online_server_base_url}:{self.kd_online_topk}" + # ) + # with open(f"{self.kd_cache_dir}/{hash_input_ids}.parquet", "wb") as f: + # pd.DataFrame(ret_logprobs_data).to_parquet(f, index=False) + + except requests.exceptions.RequestException as e: + LOG.error(f"Error fetching logprobs from online teacher: {e}") + raise e + # ret_logprobs_data will be returned with empty lists, handled by the caller. 
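+        # NOTE: this method is wrapped with @retry_on_request_exceptions(max_retries=10, delay=5),
+        # so re-raising here triggers a retry of the whole request; the exception only
+        # propagates to the caller once the retries are exhausted.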
+ except Exception as e: # Catch other potential errors during processing + LOG.error( + f"Unexpected error processing API response in fetch_online_logprobs: {e}", + exc_info=True, + ) + raise e + + return { + "target_token_ids": ret_data_target_token_ids, + "target_logprobs": ret_data_target_logprobs, + "target_mask": ret_data_target_mask, + } + + def __call__( + self, features: List[List[Dict[str, Any]]], return_tensors: Optional[str] = None + ) -> Dict[str, Any]: + if not features: + return super().__call__(features, return_tensors=return_tensors) + + for ( + sub_batch_features + ) in features: # sub_batch_features is List[Dict[str, Any]] + if not sub_batch_features: + continue + + input_ids_for_api_call: List[List[int]] = [] + labels_for_api_call: List[List[int]] = [] + # Store references to the original item dictionaries to update them in-place + items_for_api_call: List[Dict[str, Any]] = [] + + for item_dict in sub_batch_features: + if not isinstance(item_dict, dict): + LOG.warning( + f"Skipping non-dict item in sub_batch_features: {item_dict}" + ) + continue + + current_input_ids = item_dict.get("input_ids") + current_labels = item_dict.get("labels") + + if current_input_ids is not None and current_labels is not None: + # Ensure input_ids and labels are lists of ints for JSON serialization + input_ids_list = ( + current_input_ids.tolist() + if hasattr(current_input_ids, "tolist") + else list(current_input_ids) + ) + labels_list = ( + current_labels.tolist() + if hasattr(current_labels, "tolist") + else list(current_labels) + ) + + input_ids_for_api_call.append(input_ids_list) + labels_for_api_call.append(labels_list) + items_for_api_call.append(item_dict) + else: + # This item will not get teacher logprobs from the API. + # Initialize KD fields to empty lists so downstream collators handle them uniformly. + item_dict.setdefault("target_token_ids", []) + item_dict.setdefault("target_logprobs", []) + item_dict.setdefault("target_mask", []) + + # print(items_for_api_call) + if items_for_api_call: # Only call API if there's something to process + if self.kd_online_server == "sglang": + api_responses_for_sub_batch = self.fetch_online_logprobs_sglang( + input_ids_for_api_call, labels_for_api_call + ) + else: + api_responses_for_sub_batch = self.fetch_online_logprobs_vllm( + input_ids_for_api_call, labels_for_api_call + ) + + # api_responses_for_sub_batch has keys: "target_token_ids", "target_logprobs", "target_mask" + # Each value is a list, corresponding to items_for_api_call + for i, item_to_update in enumerate(items_for_api_call): + # TODO make sure to figure out which input in sub_batch_features to update the batch in the original `features` object so the super class can handle it properly. + if api_responses_for_sub_batch and i < len( + api_responses_for_sub_batch["target_token_ids"] + ): # Check bounds + assert len( + api_responses_for_sub_batch["target_token_ids"][i] + ) == len(item_to_update["input_ids"]) + assert len( + api_responses_for_sub_batch["target_logprobs"][i] + ) == len(item_to_update["input_ids"]) + assert len( + api_responses_for_sub_batch["target_mask"][i] + ) == len(item_to_update["labels"]) + item_to_update["target_token_ids"] = ( + api_responses_for_sub_batch["target_token_ids"][i] + ) + item_to_update["target_logprobs"] = api_responses_for_sub_batch[ + "target_logprobs" + ][i] + item_to_update["target_mask"] = api_responses_for_sub_batch[ + "target_mask" + ][i] + else: + # API call failed for this item, or response was shorter than expected. 
+                        # Ensure KD fields are initialized as empty lists.
+                        LOG.warning(
+                            f"API call failed for this item (index {i}), or API response was too short. "
+                            f"API response keys: {list(api_responses_for_sub_batch.keys()) if api_responses_for_sub_batch else 'None'}"
+                        )
+                        item_to_update.setdefault("target_token_ids", [])
+                        item_to_update.setdefault("target_logprobs", [])
+                        item_to_update.setdefault("target_mask", [])
+
+        return super().__call__(features, return_tensors=return_tensors)
diff --git a/src/axolotl/integrations/kd/kernels/__init__.py b/src/axolotl/integrations/kd/kernels/__init__.py
index e69de29bb..3f1144a45 100644
--- a/src/axolotl/integrations/kd/kernels/__init__.py
+++ b/src/axolotl/integrations/kd/kernels/__init__.py
@@ -0,0 +1,8 @@
+"""
+Liger Chunked loss optimizations module
+"""
+
+from .liger import LigerFusedLinearKLTopKLogprobLoss
+from .models import apply_kernel
+
+__all__ = ["LigerFusedLinearKLTopKLogprobLoss", "apply_kernel"]
diff --git a/src/axolotl/integrations/kd/kernels/liger.py b/src/axolotl/integrations/kd/kernels/liger.py
new file mode 100644
index 000000000..6356643c2
--- /dev/null
+++ b/src/axolotl/integrations/kd/kernels/liger.py
@@ -0,0 +1,485 @@
+"""
+Liger Kernels for Chunked Top-K Log-Prob Distillation
+"""
+
+import torch
+import torch.nn.functional as F
+from liger_kernel.chunked_loss.fused_linear_distillation import (
+    LigerFusedLinearDistillationBase,
+)
+
+from axolotl.integrations.kd.utils import normalize_logprobs
+
+
+class LigerFusedLinearKLTopKLogprobFunction(LigerFusedLinearDistillationBase):
+    """
+    Chunked kl-div loss for top-k logprobs
+    """
+
+    @staticmethod
+    def distillation_loss_fn(
+        student_logits_temp_scaled: torch.Tensor,  # [chunk_size, vocab_size], already temp-scaled
+        target_token_ids_chunk: torch.Tensor,  # [chunk_size, top_k]
+        target_logprobs_chunk: torch.Tensor,  # [chunk_size, top_k], already temp-scaled and normalized logprobs
+        target_mask_chunk: torch.Tensor,  # [chunk_size, top_k]
+        beta: float = 0.0,
+        normalize_topk: bool = True,
+    ) -> torch.Tensor:
+        """
+        Compute Top-K KL divergence loss for a chunk.
+        Args:
+            student_logits_temp_scaled: Student logits, scaled by temperature. Shape: (N, V).
+            target_token_ids_chunk: Top-k teacher token IDs. Shape: (N, K).
+            target_logprobs_chunk: Top-k teacher log probabilities (temp-scaled, normalized). Shape: (N, K).
+            target_mask_chunk: Mask for valid top-k tokens. Shape: (N, K).
+            beta: Controls the type of KL divergence.
+                0.0 for Forward KL (P_teacher || P_student).
+                1.0 for Reverse KL (P_student || P_teacher).
+                Values in between use a generalized Jensen-Shannon divergence (0.5 is symmetric).
+            normalize_topk: Whether to normalize the log probabilities
+        Returns:
+            Sum of KL divergence losses for the chunk.
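+            The sum is not normalized here; the patched model forward divides the combined
+            loss by ``num_items_in_batch`` when it is provided.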
+ """ + topk = target_token_ids_chunk.shape[-1] + student_logits_temp_scaled = ( # [chunk_size, vocab_size] + student_logits_temp_scaled.float() + ) + target_logprobs_chunk = target_logprobs_chunk.float() + + # Gather student logits for the top-k teacher token IDs + # target_token_ids_chunk: [chunk_size, top_k] + # student_logits_topk_temp_scaled: [chunk_size, top_k] + student_logits_topk_temp_scaled = torch.gather( + student_logits_temp_scaled, dim=-1, index=target_token_ids_chunk + ) + + # Student log-probabilities for the gathered top-k tokens + student_lse = torch.logsumexp( + student_logits_temp_scaled, dim=-1, keepdim=True + ) # [chunk_size, 1] + student_logprobs_topk_temp_scaled = ( + student_logits_topk_temp_scaled - student_lse + ) + + # we have the top-k student logprobs, normalize them + if normalize_topk: + student_logprobs_topk_temp_scaled = normalize_logprobs( + student_logprobs_topk_temp_scaled, topk + ) + + valid_mask = target_mask_chunk.to(torch.bool) # [chunk_size, top_k] + + student_logprobs_topk_valid = student_logprobs_topk_temp_scaled[valid_mask] + teacher_logprobs_valid = target_logprobs_chunk[valid_mask] + + # Teacher probabilities P(y|x_teacher) from logprobs + # target_logprobs_valid are already normalized (log(softmax(teacher_logits/T))) + teacher_probs_valid = teacher_logprobs_valid.exp() + # Student probabilities P_student from log P_student + student_probs_topk_valid = student_logprobs_topk_valid.exp() + + # kd_loss_per_token = torch.zeros_like(target_logprobs_valid) + + # KL divergence: sum(P_teacher * (log P_teacher - log P_student)) + # = sum(P_teacher * log P_teacher) - sum(P_teacher * log P_student) + # The distillation loss is often formulated as -sum(P_teacher * log P_student) + # or as sum(P_teacher * (log_softmax_teacher - log_softmax_student)) + # Here, target_logprobs_valid are log_softmax_teacher. + # student_logprobs_topk_valid are log_softmax_student (for the selected K indices). + if beta == 0.0: # Contribution from Forward KL + fwd_kl_per_token = teacher_probs_valid * ( + teacher_logprobs_valid - student_logprobs_topk_valid + ) + kd_loss = fwd_kl_per_token.sum() + elif beta == 1.0: # Contribution from Reverse KL + rev_kl_per_token = student_probs_topk_valid * ( + student_logprobs_topk_valid - teacher_logprobs_valid + ) + kd_loss = rev_kl_per_token.sum() + else: + # JSD - Jensen-Shannon Divergence / Symmetric + mean_probs = ( + 1 - beta + ) * student_probs_topk_valid + beta * teacher_probs_valid + log_mean_probs = mean_probs.log() + student_kl = F.kl_div( + log_mean_probs, + student_logprobs_topk_valid, + reduction="sum", + log_target=True, + ) + teacher_kl = F.kl_div( + log_mean_probs, teacher_logprobs_valid, reduction="sum", log_target=True + ) + jsd_loss = beta * teacher_kl + (1 - beta) * student_kl + kd_loss = jsd_loss + + return kd_loss + + @staticmethod + def _compute_loss_kl_topk( + student_input_chunk: torch.Tensor, + student_weight: torch.Tensor, + # Args for student_bias, target_token_ids_chunk etc. are passed to the lambda wrapped by grad_and_value + # or through `partial`. Let's make them explicit here for clarity. 
+ target_token_ids_chunk: torch.Tensor, + target_logprobs_chunk: torch.Tensor, + target_mask_chunk: torch.Tensor, + target_chunk: torch.Tensor, # For hard loss (true labels) + student_bias: torch.Tensor = None, # This will be one of the grad targets + # Other params passed via `partial` from `forward` + distillation_loss_fn=None, + ignore_index: int = -100, + weight_hard_loss: float = 0.5, + weight_soft_loss: float = 0.5, + compute_ce_loss: bool = True, + temperature: float = 1.0, + beta: float = 0.0, + normalize_topk: bool = True, + ): + # Compute student logits for the chunk from hidden states and LM head + # student_input_chunk: [chunk_size, hidden_dim] + # student_lm_head_weight: [vocab_size, hidden_dim] + # student_logits_chunk: [chunk_size, vocab_size] + student_logits_chunk = F.linear( + student_input_chunk, student_weight, student_bias + ) + + ce_loss = torch.tensor( + 0.0, device=student_logits_chunk.device, dtype=student_logits_chunk.dtype + ) + if compute_ce_loss and weight_hard_loss > 0.0: + ce_loss = F.cross_entropy( + student_logits_chunk.view(-1, student_logits_chunk.shape[-1]), + target_chunk.view(-1), + reduction="sum", + ignore_index=ignore_index, + ) + + soft_loss = torch.tensor( + 0.0, device=student_logits_chunk.device, dtype=student_logits_chunk.dtype + ) + if weight_soft_loss > 0.0: + student_logits_chunk_temp_scaled = student_logits_chunk / temperature + + # Assuming student_weight.shape[0] (vocab_size) is adequate for target_token_ids_chunk.max() + # No explicit padding here; user must ensure vocab alignment or pre-pad student_weight. + + soft_loss = distillation_loss_fn( + student_logits_chunk_temp_scaled, + target_token_ids_chunk, + target_logprobs_chunk, + target_mask_chunk, + beta=beta, + normalize_topk=normalize_topk, + ) + + return soft_loss, ce_loss + + @classmethod + def forward( + cls, + ctx, + student_input: torch.Tensor, # [batch_size, seq_len, dim] + student_lm_head_weight: torch.Tensor, # [dim, vocab_size] + target_token_ids: torch.Tensor, # [batch_size, seq_len, top_k] + target_logprobs: torch.Tensor, # [batch_size, seq_len, top_k] + target_mask: torch.Tensor, # [batch_size, seq_len, top_k] + true_labels: torch.Tensor, # [batch_size, seq_len] + student_lm_head_bias: torch.Tensor = None, + weight_hard_loss: float = 0.5, + weight_soft_loss: float = 0.5, + ignore_index: int = -100, + temperature: float = 1.0, + beta: float = 0.0, + compiled: bool = False, + chunk_size: int = 1024, + compute_ce_loss: bool = True, + normalize_topk: bool = True, + ): + CHUNK_SIZE = chunk_size # pylint: disable=invalid-name + grad_weight_acc = torch.zeros_like(student_lm_head_weight) + grad_inputs_list = [] + grad_bias_acc = ( + torch.zeros_like(student_lm_head_bias) + if student_lm_head_bias is not None + else None + ) + kd_loss_acc = torch.zeros( + (), device=student_input.device, dtype=student_input.dtype + ) + ce_loss_acc = torch.zeros( + (), device=student_input.device, dtype=student_input.dtype + ) + + # This function will be what torch.func.grad_and_value differentiates. + # It takes student_input_chunk, student_weight (full), student_bias (full) as primals. + # Other necessary data (target_*, etc.) are passed as non-differentiable arguments. 
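+        # Overall strategy: flatten to (B*N) token rows, slice them into CHUNK_SIZE pieces,
+        # and use torch.func.grad_and_value per chunk to get gradients w.r.t. the chunk of
+        # hidden states and the full lm_head weight/bias. Weight/bias grads are accumulated
+        # across chunks; per-chunk input grads are stashed and returned in backward().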
+ def loss_fn_for_grad( + _student_input_chunk, + _student_lm_head_weight, # full weight + _student_lm_head_bias, # full bias + # Fixed arguments for a given chunk, not differentiated: + _target_token_ids_chunk, + _target_logprobs_chunk, + _target_mask_chunk, + _true_labels_chunk, + ): + return cls._compute_loss_kl_topk( + student_input_chunk=_student_input_chunk, + student_weight=_student_lm_head_weight, + target_token_ids_chunk=_target_token_ids_chunk, + target_logprobs_chunk=_target_logprobs_chunk, + target_mask_chunk=_target_mask_chunk, + target_chunk=_true_labels_chunk, + student_bias=_student_lm_head_bias, + distillation_loss_fn=cls.distillation_loss_fn, + ignore_index=ignore_index, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + compute_ce_loss=compute_ce_loss, + temperature=temperature, + beta=beta, + normalize_topk=normalize_topk, + ) + + def accumulate_chunk_grads( + student_input_chunk_ac, + target_token_ids_chunk_ac, + target_logprobs_chunk_ac, + target_mask_chunk_ac, + true_labels_chunk_ac, + ): + # student_weight and student_bias are closed over from the outer scope (full tensors) + if student_lm_head_bias is not None: + ( + (chunk_grad_input, chunk_grad_weight, chunk_grad_bias), + (chunk_kd_loss, chunk_ce_loss), + ) = torch.func.grad_and_value( + loss_fn_for_grad, argnums=(0, 1, 2), has_aux=True + )( + student_input_chunk_ac, + student_lm_head_weight, + student_lm_head_bias, # primals + target_token_ids_chunk_ac, + target_logprobs_chunk_ac, + target_mask_chunk_ac, + true_labels_chunk_ac, + ) # non-primals + grad_bias_acc.add_(chunk_grad_bias) + else: + argnums_for_grad = (0, 1) # Differentiate wrt input_chunk, weight + ( + (chunk_grad_input, chunk_grad_weight), # No grad for bias + (chunk_kd_loss, chunk_ce_loss), + ) = torch.func.grad_and_value( + loss_fn_for_grad, argnums=argnums_for_grad, has_aux=True + )( + student_input_chunk_ac, + student_lm_head_weight, + None, # Pass None for student_bias primal + target_token_ids_chunk_ac, + target_logprobs_chunk_ac, + target_mask_chunk_ac, + true_labels_chunk_ac, + ) + + grad_weight_acc.add_(chunk_grad_weight) + kd_loss_acc.add_(chunk_kd_loss) + ce_loss_acc.add_(chunk_ce_loss) + + return chunk_grad_input + + if compiled: + accumulate_chunk_grads_compiled = torch.compile( + accumulate_chunk_grads, dynamic=True, backend="inductor" + ) # dynamic=True often helpful + else: + accumulate_chunk_grads_compiled = accumulate_chunk_grads + + # Use the same chunking logic as LigerFusedLinearDistillationBase.forward + B, N, D = student_input.shape # pylint: disable=invalid-name + K = target_token_ids.shape[-1] # pylint: disable=invalid-name + + student_input_flat = student_input.reshape(-1, student_input.shape[-1]) + target_token_ids_flat = target_token_ids.reshape(-1, target_token_ids.shape[-1]) + target_logprobs_flat = target_logprobs.reshape(-1, target_logprobs.shape[-1]) + target_mask_flat = target_mask.reshape(-1, target_mask.shape[-1]) + # pad and shift for cross entropy loss + true_labels = torch.nn.functional.pad(true_labels, (0, 1), value=ignore_index) + true_labels_flat = true_labels[:, 1:].contiguous().view(-1) + + num_chunks = max(1, student_input_flat.shape[0] // CHUNK_SIZE) + + _student_input_chunks = torch.chunk( + student_input_flat, chunks=num_chunks, dim=0 + ) + _target_token_ids_chunks = torch.chunk( + target_token_ids_flat, chunks=num_chunks, dim=0 + ) + _target_logprobs_chunks = torch.chunk( + target_logprobs_flat, chunks=num_chunks, dim=0 + ) + _target_mask_chunks = torch.chunk(target_mask_flat, 
chunks=num_chunks, dim=0) + _true_labels_chunks = torch.chunk(true_labels_flat, chunks=num_chunks, dim=0) + + for i in range(num_chunks): + grad_input_chunk = accumulate_chunk_grads_compiled( + _student_input_chunks[i], + _target_token_ids_chunks[i], + _target_logprobs_chunks[i], + _target_mask_chunks[i], + _true_labels_chunks[i], + ) + grad_inputs_list.append(grad_input_chunk) + + grad_inputs_combined = torch.cat(grad_inputs_list, dim=0) + ctx.save_for_backward(grad_inputs_combined, grad_weight_acc, grad_bias_acc) + + # For matching None returns in backward for non-tensor/non-grad_requiring inputs + ctx.hyperparams_count = 9 # Corresponds to number of hyperparams after main tensors in fwd signature + ctx.bias_was_none = student_lm_head_bias is None + ctx.orig_dims = (B, N, D, K) + + # since this is packed, there is simply a single batch, so batchmean reduction of kl-div is simply the accumulated sum + # we still need to scale the kd_loss by the temp^2 + kd_loss_acc = kd_loss_acc * (temperature**2) + final_loss = weight_soft_loss * kd_loss_acc + weight_hard_loss * ce_loss_acc + + return final_loss + + @staticmethod + def backward(ctx, grad_output): + grad_input_flat, grad_weight, grad_bias_maybe = ( + ctx.saved_tensors + ) # grad_input_flat is (B*N, D) + + # Scale gradients by grad_output if it's not 1.0 + if not torch.equal( + grad_output, + torch.tensor(1.0, device=grad_output.device, dtype=grad_output.dtype), + ): + grad_input_flat = grad_input_flat * grad_output + grad_weight = grad_weight * grad_output + if grad_bias_maybe is not None: + grad_bias_maybe = grad_bias_maybe * grad_output + + # Reshape grad_input_flat to match original student_input shape (B, N, D) + # ctx.orig_dims stores (B, N, D, K) + # We need the first three dimensions for student_input's shape. 
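+        # e.g. grad_input_flat of shape (B*N, D) is viewed back to (B, N, D) below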
+ # Ensure that orig_dims are not (0,0,0,K) for empty inputs leading to view errors + if ( + ctx.orig_dims[0] * ctx.orig_dims[1] * ctx.orig_dims[2] == 0 + and grad_input_flat.numel() == 0 + ): + # If original input was empty, gradient should also be empty with correct shape + grad_input_reshaped = torch.zeros( + ctx.orig_dims[0], + ctx.orig_dims[1], + ctx.orig_dims[2], + dtype=grad_input_flat.dtype, + device=grad_input_flat.device, + ) + elif grad_input_flat.numel() == 0 and not ( + ctx.orig_dims[0] * ctx.orig_dims[1] * ctx.orig_dims[2] == 0 + ): + # This case should ideally not happen if forward path is correct (non-empty input -> non-empty flat grad) + # but as a safeguard: + grad_input_reshaped = torch.zeros( + ctx.orig_dims[0], + ctx.orig_dims[1], + ctx.orig_dims[2], + dtype=grad_input_flat.dtype, + device=grad_input_flat.device, + ) + else: + grad_input_reshaped = grad_input_flat.view( + ctx.orig_dims[0], ctx.orig_dims[1], ctx.orig_dims[2] + ) + + nones_for_hyperparams = [None] * ctx.hyperparams_count + grad_bias_return = grad_bias_maybe if not ctx.bias_was_none else None + + return ( + grad_input_reshaped, # Gradient for student_input (reshaped) + grad_weight, # Gradient for student_lm_head_weight + None, # Gradient for target_token_ids + None, # Gradient for target_logprobs + None, # Gradient for target_mask + None, # Gradient for true_labels + grad_bias_return, # Gradient for student_lm_head_bias + *nones_for_hyperparams, # Grads for weight_hard_loss, ..., compute_ce_loss + ) + + +class LigerFusedLinearKLTopKLogprobLoss(torch.nn.Module): + """ + wrapper for chunked top-k logprob kl-d + """ + + def __init__( + self, + weight_hard_loss: float = 0.5, + weight_soft_loss: float = 0.5, + temperature: float = 1.0, # This is the kd_temperature + beta: float = 1.0, + ignore_index: int = -100, + compiled: bool = True, + chunk_size: int = 1024, + compute_ce_loss: bool = True, + normalize_topk: bool = True, + ): + super().__init__() + if not (0.0 <= weight_hard_loss <= 1.0 and 0.0 <= weight_soft_loss <= 1.0): + raise ValueError("Loss weights must be between 0.0 and 1.0.") + if temperature <= 0: + raise ValueError("Temperature must be positive.") + + self.weight_hard_loss = weight_hard_loss + self.weight_soft_loss = weight_soft_loss + self.temperature = temperature + self.beta = beta + self.ignore_index = ignore_index + self.compiled = compiled + self.chunk_size = chunk_size + self.compute_ce_loss = compute_ce_loss + self.normalize_topk = normalize_topk + + if not self.compute_ce_loss and self.weight_hard_loss > 0.0: + print( + f"Warning: compute_ce_loss is False, but weight_hard_loss ({self.weight_hard_loss}) > 0. Hard loss will effectively be zero." + ) + # self.weight_hard_loss = 0.0 # Or let user manage this + if self.weight_soft_loss == 0.0: + print( + "Warning: weight_soft_loss is 0.0. Soft (KD) loss will not be computed." 
+ ) + + def forward( + self, + lm_head_weight: torch.Tensor, # Weights of the linear layer in the LM head + student_hidden_states: torch.Tensor, # student_hidden_states before the lm_head + target_token_ids: torch.Tensor, + target_logprobs: torch.Tensor, + target_mask: torch.Tensor, + true_labels: torch.Tensor, + student_bias: torch.Tensor = None, + ) -> torch.Tensor: + return LigerFusedLinearKLTopKLogprobFunction.apply( + student_hidden_states, + lm_head_weight, + target_token_ids, + target_logprobs, + target_mask, + true_labels, + student_bias, + self.weight_hard_loss, + self.weight_soft_loss, + self.ignore_index, + self.temperature, + self.beta, + self.compiled, + self.chunk_size, + self.compute_ce_loss, + self.normalize_topk, + ) diff --git a/src/axolotl/integrations/kd/kernels/models.py b/src/axolotl/integrations/kd/kernels/models.py new file mode 100644 index 000000000..4319f5f7d --- /dev/null +++ b/src/axolotl/integrations/kd/kernels/models.py @@ -0,0 +1,105 @@ +""" +model patcher for chunked top-k kl-div +""" + +from typing import Optional, Union, Unpack + +import torch +from transformers import Cache +from transformers.modeling_outputs import CausalLMOutputWithPast + +try: + from transformers.modeling_flash_attention_utils import FlashAttentionKwargs + from transformers.utils import LossKwargs + + class TransformersKwargs(FlashAttentionKwargs, LossKwargs): + """ + placeholder kwargs for hf model classes + """ + +except ImportError: + from transformers.utils.generic import ( # type: ignore[no-redef] + TransformersKwargs, + ) + +from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix + + +def kldiv_forward_llama_like( + self, + input_ids: Optional[torch.LongTensor] = None, + target_logprobs: Optional[torch.Tensor] = None, + target_token_ids: Optional[torch.LongTensor] = None, + target_mask: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, # pylint: disable=unused-argument + **kwargs: Unpack[TransformersKwargs], # type: ignore[misc] +) -> CausalLMOutputWithPast: + # pylint: disable=duplicate-code + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + # TODO, we can optimize this further by filtering hidden_states on sequence dimension using labels != -100 + # self.loss_function should be LigerFusedLinearKLTopKLogprobLoss + + loss = self.loss_function( + self.lm_head.weight, + 
hidden_states, + target_token_ids, + target_logprobs, + target_mask, + true_labels=labels, + ) + num_items_in_batch = kwargs.pop("num_items_in_batch", -1) + if num_items_in_batch is not None and num_items_in_batch > 0: + loss = loss / num_items_in_batch + + return CausalLMOutputWithPast( + loss=loss, + logits=None, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +def apply_kernel(model_type): + # Dynamically import the module and attention class + module_path = f"transformers.models.{model_type}.modeling_{model_type}" + model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type) + module = __import__(module_path, fromlist=[f"{model_cls_prefix}ForCausalLM"]) + model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM") + model_cls.forward = kldiv_forward_llama_like diff --git a/src/axolotl/integrations/kd/topk_logprob/forward_kl.py b/src/axolotl/integrations/kd/topk_logprob/forward_kl.py index 3c9515091..74184455f 100644 --- a/src/axolotl/integrations/kd/topk_logprob/forward_kl.py +++ b/src/axolotl/integrations/kd/topk_logprob/forward_kl.py @@ -16,40 +16,7 @@ loss for top_k KL divergence """ import torch - - -def zscore_standardize( - logits: torch.Tensor, - mask: torch.Tensor = None, - base_temperature: float = 1.0, - eps: float = 1e-9, -): - """ - Z-score standardize along the last dimension of `logits`. - i.e., for each [B, seq_len] row, across K entries: - z = (logits - mean) / std, - then scale by 1 / base_temperature if desired. - - mask can be broadcastable or None. If None, we standardize all elements. - """ - if mask is None: - # shape: [B, seq_len, K] - # Mean and std over dim=-1 - mean = logits.mean(dim=-1, keepdim=True) - var = logits.var(dim=-1, unbiased=False, keepdim=True) - else: - # If you have to exclude some tokens, multiply by mask, etc. - float_mask = mask.to(logits.dtype) - count = float_mask.sum(dim=-1, keepdim=True).clamp_min(1.0) - mean = (logits * float_mask).sum(dim=-1, keepdim=True) / count - var = (float_mask * (logits - mean) ** 2).sum(dim=-1, keepdim=True) / count - - std = torch.sqrt(var.clamp_min(eps)) - z = (logits - mean) / std - - # Scale by 1 / base_temperature - z = z / base_temperature - return z +from torch import nn @torch.jit.script @@ -60,7 +27,6 @@ def loss( target_mask: torch.Tensor, num_items_in_batch: int = -1, # Use -1 to indicate "None" kd_temperature: float = 1.0, - top_k_before_softmax: int = 0, ) -> torch.Tensor: """ A KD loss function that is TorchScript-friendly. @@ -77,8 +43,6 @@ def loss( num_items_in_batch (int, optional): The number of items in the batch. kd_temperature (float, optional): The temperature for KD. 
Default: 1.0 - top_k_before_softmax (int, optional): Flag of whether to apply softmax before gathering student top-k logits - Default: 0 """ target_logprobs = target_logprobs.float() @@ -88,46 +52,24 @@ def loss( # student_logits shape: [B, student_seq_len, vocab_size] teacher_seq_len = target_token_ids.shape[1] - if top_k_before_softmax: - # Slice student logits to match teacher-provided sequence length - student_logits_for_kd = student_logits[ - :, :teacher_seq_len, : - ] # [B, teacher_seq_len, vocab_size] + # Slice student logits to match teacher-provided sequence length + student_logits_for_kd = ( + student_logits[:, :teacher_seq_len, :] / kd_temperature + ) # [B, teacher_seq_len, vocab_size] - # Gather student logits for teacher's top-K tokens - student_logits_topk = torch.gather( - student_logits_for_kd, dim=-1, index=target_token_ids - ) # [B, teacher_seq_len, K] + # keep in full precision for numerical stability of loss + student_logits_for_kd = student_logits_for_kd.float() - student_logits_topk = student_logits_topk.float() + # Gather student logits for teacher's top-K tokens + student_logits_topk = torch.gather( + student_logits_for_kd, dim=-1, index=target_token_ids + ) # [B, teacher_seq_len, K] - # Apply KD temperature to student’s logits - if kd_temperature != 1.0: - student_logits_topk = student_logits_topk / kd_temperature + # Compute logsumexp across full vocabulary + student_lse = torch.logsumexp(student_logits_for_kd, dim=-1, keepdim=True) - # Convert student top-k logits to logprobs - student_logprobs_topk = student_logits_topk - torch.logsumexp( - student_logits_topk, dim=-1, keepdim=True - ) # [B, teacher_seq_len, K] - else: - # Slice student logits to match teacher-provided sequence length - student_logits_for_kd = ( - student_logits[:, :teacher_seq_len, :] / kd_temperature - ) # [B, teacher_seq_len, vocab_size] - - # keep in full precision for numerical stability of loss - student_logits_for_kd = student_logits_for_kd.float() - - # Gather student logits for teacher's top-K tokens - student_logits_topk = torch.gather( - student_logits_for_kd, dim=-1, index=target_token_ids - ) # [B, teacher_seq_len, K] - - # Compute logsumexp across full vocabulary - student_lse = torch.logsumexp(student_logits_for_kd, dim=-1, keepdim=True) - - # Convert just the top-k logits to logprobs - student_logprobs_topk = student_logits_topk - student_lse + # Convert just the top-k logits to logprobs + student_logprobs_topk = student_logits_topk - student_lse # Convert teacher_mask to boolean for indexing # In TorchScript, .bool() is sometimes unsupported, so we do: @@ -144,10 +86,6 @@ def loss( kd_loss_per_token = teacher_probs * (target_logprobs - student_logprobs_topk) kd_loss = kd_loss_per_token.sum() - # Multiply by T^2 (classical KD scaling) - if kd_temperature != 1.0: - kd_loss = kd_loss * (kd_temperature**2) - # Normalize by number of items (if provided) or by valid tokens if num_items_in_batch > 0: kd_loss = kd_loss / float(num_items_in_batch) @@ -158,80 +96,74 @@ def loss( return kd_loss -def topk_kd_loss_with_zscore( - student_logits: torch.Tensor, # [B, seq_len, vocab_size] - target_token_ids: torch.Tensor, # [B, seq_len, K] - target_logprobs: torch.Tensor, # [B, seq_len, K], sums to 1.0 in prob space - target_mask: torch.Tensor, # [B, seq_len, K] or [B, seq_len] - kd_temperature: float = 1.0, # classic KD temperature - zscore_base_temp: float = 1.0, # from the paper - num_items_in_batch: int = -1, -): +class ChunkedTopKKDLoss(nn.Module): """ - A variant of top_k KL divergence 
with Z-score scaling - from "Logit Standardization in Knowledge Distillation". + A wrapper that chunks (splits) the student and teacher outputs along the time dimension + to reduce peak memory usage when upcasting from bf16 to fp32, especially for large vocabularies. + + Usage is analogous to ForwardKLWithChunkedOutputLoss but adapted to top-K teacher logprobs. """ - target_logprobs = target_logprobs.float() + def __init__(self, num_output_chunks: int = 8, kd_temperature: float = 1.0): + super().__init__() + self.num_output_chunks = num_output_chunks + self.kd_temperature = kd_temperature - B, teacher_seq_len, K = target_logprobs.shape # pylint: disable=invalid-name - # 1) Gather the student's top-k logits to match teacher - student_logits_for_kd = student_logits[ - :, :teacher_seq_len, : - ] # [B, seq_len, vocab] - student_topk_logits = torch.gather( - student_logits_for_kd, dim=-1, index=target_token_ids - ) # [B, seq_len, K] + def forward( + self, + student_logits: torch.Tensor, # [B, seq_len, vocab_size] + target_token_ids: torch.Tensor, # [B, seq_len, K] + target_logprobs: torch.Tensor, # [B, seq_len, K] + target_mask: torch.Tensor, # [B, seq_len, K] + num_items_in_batch: int = -1, # optional batch size for normalization + ) -> torch.Tensor: - student_topk_logits = student_topk_logits.float() + # 1. Split along the "token" dimension (dim=1). + student_logits_chunks = student_logits.chunk(self.num_output_chunks, dim=1) + token_ids_chunks = target_token_ids.chunk(self.num_output_chunks, dim=1) + logprobs_chunks = target_logprobs.chunk(self.num_output_chunks, dim=1) + mask_chunks = target_mask.chunk(self.num_output_chunks, dim=1) - # 2) If you want to keep the "classical" T scaling, apply it first - if kd_temperature != 1.0: - student_topk_logits = student_topk_logits / kd_temperature + # We'll accumulate a global "sum of losses" and "sum of valid tokens" + # so that our final average is consistent with the entire sequence/batch. + total_loss = 0.0 + total_valid_tokens = 0 - # 3) Convert teacher logprobs -> treat them as “logits” for z-score - # (They differ by +some_constant from real logits, but in z-score - # that constant is subtracted out anyway.) - teacher_logits_for_zscore = target_logprobs # rename variable for clarity + # 2. Loop over each chunk and compute a chunk-specific loss. + for st_chunk, tid_chunk, lp_chunk, msk_chunk in zip( + student_logits_chunks, token_ids_chunks, logprobs_chunks, mask_chunks + ): + # We pass num_items_in_batch=-1 so that the kd_loss + # will average over *this chunk's* valid tokens only. + chunk_loss = loss( + student_logits=st_chunk, + target_token_ids=tid_chunk, + target_logprobs=lp_chunk, + target_mask=msk_chunk, + num_items_in_batch=-1, # ensure per-chunk averaging by valid tokens + kd_temperature=self.kd_temperature, + ) - # 4) Z-score teacher and student - # If target_mask is 2D, expand to 3D for the K dimension - if target_mask.dim() == 2 and target_mask.shape[:2] == (B, teacher_seq_len): - target_mask = target_mask.unsqueeze(-1).expand(-1, -1, K) + # kd_loss returns an average over the chunk's valid tokens. + # We want a global average in the end, so we need to re‐weight + # by the number of valid tokens in this chunk and keep track of the total. 
+ chunk_valid_mask = msk_chunk.to(torch.bool) + chunk_valid_count = chunk_valid_mask.sum() # scalar tensor - teacher_z = zscore_standardize( - teacher_logits_for_zscore, mask=target_mask, base_temperature=zscore_base_temp - ) - student_z = zscore_standardize( - student_topk_logits, mask=target_mask, base_temperature=zscore_base_temp - ) + # Re-scale "chunk average" back to "chunk sum" + chunk_loss_sum = chunk_loss * chunk_valid_count - # 5) Convert to log-probs for KL - teacher_logprobs_z = teacher_z - torch.logsumexp(teacher_z, dim=-1, keepdim=True) - student_logprobs_z = student_z - torch.logsumexp(student_z, dim=-1, keepdim=True) + total_loss += chunk_loss_sum + total_valid_tokens += chunk_valid_count - # 6) Restrict to valid tokens if needed - valid_mask = target_mask.bool() # shape [B, seq_len, K] - teacher_probs_z = teacher_logprobs_z.exp() - teacher_probs_z = teacher_probs_z[valid_mask] - teacher_logprobs_z = teacher_logprobs_z[valid_mask] - student_logprobs_z = student_logprobs_z[valid_mask] + # 3. Normalize *once* at the end. + if num_items_in_batch > 0: + # If the user gave us a manual denominator (e.g. total items in batch), + # we divide by it. Typically used if each item is of different length. + final_loss = total_loss / float(num_items_in_batch) + else: + # Otherwise, divide by total valid tokens across all chunks. + # to get the same result as a non-chunked approach. + final_loss = total_loss / float(total_valid_tokens) - # 7) forward KL: sum( p_teacher * [log(p_teacher) - log(p_student)] ) - kd_loss_per_token = teacher_probs_z * (teacher_logprobs_z - student_logprobs_z) - kd_loss = kd_loss_per_token.sum() - - # 8) If using classical KD scaling by T^2 - if kd_temperature != 1.0: - kd_loss = kd_loss * (kd_temperature**2) - - # Optionally scale by zscore_base_temp**2 if you want (paper might differ). - # kd_loss = kd_loss * (zscore_base_temp**2) - - # 9) Normalize - if num_items_in_batch is not None and num_items_in_batch > 0: - kd_loss = kd_loss / float(num_items_in_batch) - else: - kd_loss = kd_loss / float(kd_loss_per_token.size(0)) - - return kd_loss + return final_loss diff --git a/src/axolotl/integrations/kd/trainer.py b/src/axolotl/integrations/kd/trainer.py index f99f2ca28..c454b2a2c 100644 --- a/src/axolotl/integrations/kd/trainer.py +++ b/src/axolotl/integrations/kd/trainer.py @@ -18,15 +18,27 @@ KD trainer from axolotl.core.trainers.base import AxolotlTrainer -from .topk_logprob.forward_kl import loss as topk_kd_loss -from .topk_logprob.forward_kl import topk_kd_loss_with_zscore +from .kernels.liger import LigerFusedLinearKLTopKLogprobLoss +# pylint: disable=too-many-ancestors class AxolotlKDTrainer(AxolotlTrainer): """ Custom trainer subclass for Knowledge Distillation (KD) """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.model_accepts_loss_kwargs = True + self.model._loss_function = LigerFusedLinearKLTopKLogprobLoss( + self.args.kd_ce_alpha, # hard label loss + self.args.kd_alpha, # kd loss + self.args.kd_temperature, + self.args.kd_beta or 0.0, + compute_ce_loss=bool(self.args.kd_ce_alpha), + normalize_topk=self.args.kd_normalize_topk, + ) + def _set_signature_columns_if_needed(self): super()._set_signature_columns_if_needed() columns_to_add = [] @@ -52,12 +64,12 @@ class AxolotlKDTrainer(AxolotlTrainer): Subclass and override for custom behavior. 
""" - - target_logprobs = inputs.pop("target_logprobs") - target_token_ids = inputs.pop("target_token_ids") - target_mask = inputs.pop("target_mask") - - seq_len = target_token_ids.shape[1] + if ( + self.args.sample_packing + and hasattr(inputs, "attention_mask") + and hasattr(inputs, "position_ids") + ): + del inputs["attention_mask"] if self.model_accepts_loss_kwargs: loss_kwargs = {} @@ -65,49 +77,4 @@ class AxolotlKDTrainer(AxolotlTrainer): loss_kwargs["num_items_in_batch"] = num_items_in_batch inputs = {**inputs, **loss_kwargs} outputs = model(**inputs) - - # FIXME: account for tokenizer.padding_side - student_logits = outputs["logits"][:, : seq_len - 1, :].contiguous() - - shift_logits = student_logits.contiguous() - target_logprobs_for_loss = target_logprobs[..., 1:, :].contiguous() - target_token_ids_for_loss = target_token_ids[..., 1:, :].contiguous() - target_mask_for_loss = target_mask[..., 1:, :].contiguous() - - if self.args.kd_zscore_base_temp: - loss_kd = topk_kd_loss_with_zscore( - shift_logits, - target_token_ids_for_loss, - target_logprobs_for_loss, - target_mask_for_loss, - kd_temperature=self.args.kd_temperature, - zscore_base_temp=self.args.kd_zscore_base_temp, - num_items_in_batch=num_items_in_batch, - ) - else: - loss_kd = topk_kd_loss( - shift_logits, - target_token_ids_for_loss, - target_logprobs_for_loss, - target_mask_for_loss, - num_items_in_batch=num_items_in_batch, - kd_temperature=self.args.kd_temperature, - top_k_before_softmax=1 if self.args.kd_top_k_before_softmax else 0, - ) - - if self.args.kd_ce_alpha > 0: - kd_alpha = self.args.kd_alpha - loss = self.args.kd_ce_alpha * outputs["loss"] + kd_alpha * loss_kd - else: - loss = loss_kd - # Save past state if it exists - # TODO: this needs to be fixed and made cleaner later. - if self.args.past_index >= 0: - self._past = outputs[ # pylint: disable=attribute-defined-outside-init - self.args.past_index - ] - - if self.args.average_tokens_across_devices and self.model_accepts_loss_kwargs: - loss *= self.accelerator.num_processes - - return (loss, outputs) if return_outputs else loss + return outputs[0] diff --git a/src/axolotl/integrations/kd/utils.py b/src/axolotl/integrations/kd/utils.py new file mode 100644 index 000000000..ba60694a5 --- /dev/null +++ b/src/axolotl/integrations/kd/utils.py @@ -0,0 +1,100 @@ +"""Helper KD utils""" + +import math +from typing import List, Union + +import numpy as np +import torch +from torch import FloatTensor, Tensor + + +def normalize_logprobs(logprobs: FloatTensor, topk: int) -> FloatTensor: + """ + Re-normalizes top-k raw logprobs as probabilities, and converts back to logprobs. 
+ """ + # Ensure raw_logprobs matches kd_online_topk length for tensor operations + # This should ideally be handled by the caller ensuring correct padding/truncation first + if logprobs.shape[-1] != topk: + # pad last dimension of logprobs to match topk length with -inf + padding_len = topk - logprobs.shape[-1] + padding_tensor = torch.full( + ( + *logprobs.shape[:-1], + padding_len, + ), # Takes all dimensions of logprobs except the last, then appends padding_needed + float("-inf"), + dtype=logprobs.dtype, + device=logprobs.device, + ) + logprobs = torch.cat((logprobs, padding_tensor), dim=-1) + + # Convert logprobs at T_online to probabilities + # use log sum exp trick to avoid underflow + position_logprobs_lse = torch.logsumexp(logprobs, dim=-1, keepdim=True) + teacher_probs_t_online = torch.exp(logprobs - position_logprobs_lse) + + # Normalize probabilities (sum to 1) + # This is important if the top-k from server aren't a full distribution + teacher_probs_t_online_sum = teacher_probs_t_online.sum(dim=-1, keepdim=True) + teacher_probs_t_online = teacher_probs_t_online / teacher_probs_t_online_sum + + final_logprobs_tensor = torch.log(teacher_probs_t_online) + + return final_logprobs_tensor + + +def strided_chunk_views( + tensor: Union[np.ndarray, torch.Tensor], + chunks: int, + dim: int = 0, + stride: int = 1, + chunk_size: int | None = None, +) -> List[Union[np.ndarray, torch.Tensor]]: + """ + Split a tensor into chunks along a dimension with striding, prioritizing views over copies. + + Args: + tensor: Input tensor (numpy array or torch tensor) + chunks: Number of chunks to create + dim: Dimension along which to chunk (default: 0) + stride: Stride between chunk starting positions (default: 1) + chunk_size: Size of each chunk. If None, calculated automatically (default: None) + + Returns: + List of tensor chunks (views when possible, copies when necessary) + """ + + # Get the size of the specified dimension + dim_size = tensor.shape[dim] + + # Calculate chunk size if not provided + if chunk_size is None: + chunk_size = (dim_size + chunks - 1) // chunks # Ceiling division + + chunks_list = [] + + for i in range(chunks): + start_idx = i * stride + end_idx = min(start_idx + chunk_size, dim_size) + + # Break if we've gone beyond the tensor + if start_idx >= dim_size: + break + + # Create slice objects for all dimensions + slices = [slice(None)] * tensor.ndim + slices[dim] = slice(start_idx, end_idx) + + chunk = tensor[tuple(slices)] + chunks_list.append(chunk) + + return chunks_list + + +def chunk_overlap(input_tensor: Tensor, chunks: int, dim: int = 0, overlap: int = 1): + dim_size = input_tensor.shape[dim] + stride = math.ceil(dim_size / chunks) + + return strided_chunk_views( + input_tensor, chunks, dim, stride=stride, chunk_size=stride + overlap + ) diff --git a/src/axolotl/integrations/liger/__init__.py b/src/axolotl/integrations/liger/__init__.py index c7ac42372..86d56be80 100644 --- a/src/axolotl/integrations/liger/__init__.py +++ b/src/axolotl/integrations/liger/__init__.py @@ -18,174 +18,10 @@ Module for the Plugin for LIGER integraton with Axolotl. Liger Kernel is the collection of Triton-native kernels for LLM Training. It is designed to be performant, correct, and light-weight. """ -import inspect -import logging -import sys +from .args import LigerArgs +from .plugin import LigerPlugin -from axolotl.integrations.base import BasePlugin -from axolotl.utils.distributed import is_main_process - -from .args import LigerArgs # pylint: disable=unused-import. 
# noqa: F401 -from .utils import patch_with_compile_disable - -LOG = logging.getLogger("axolotl.integrations.liger") - - -class LigerPlugin(BasePlugin): - """ - Plugin for LIGER integraton with Axolotl. - """ - - def get_input_args(self): - return "axolotl.integrations.liger.LigerArgs" - - def pre_model_load(self, cfg): - if cfg.torch_compile: - # torch compile will unnecessarily attempt to optimize the triton kernel unless explicitly disabled - import liger_kernel.ops.fused_linear_cross_entropy - - patch_with_compile_disable( - liger_kernel.ops.fused_linear_cross_entropy, - "fused_linear_cross_entropy_forward", - ) - patch_with_compile_disable( - liger_kernel.ops.fused_linear_cross_entropy, - "fused_linear_cross_entropy_backward", - ) - from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss - from liger_kernel.transformers.functional import liger_cross_entropy - from liger_kernel.transformers.layer_norm import LigerLayerNorm - from liger_kernel.transformers.monkey_patch import MODEL_TYPE_TO_APPLY_LIGER_FN - from liger_kernel.transformers.rms_norm import LigerRMSNorm - from liger_kernel.transformers.rope import liger_rotary_pos_emb - from liger_kernel.transformers.swiglu import LigerSwiGLUMLP - - if cfg.liger_cross_entropy and cfg.liger_fused_linear_cross_entropy: - raise ValueError( - "Cannot have both `liger_cross_entropy` and `liger_fused_linear_cross_entropy` set." - ) - - if cfg.model_config_type in MODEL_TYPE_TO_APPLY_LIGER_FN: - apply_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN[cfg.model_config_type] - liger_fn_sig = inspect.signature(apply_liger_fn) - kwargs = {} - if "rope" in liger_fn_sig.parameters: - kwargs["rope"] = cfg.liger_rope - if "cross_entropy" in liger_fn_sig.parameters: - kwargs["cross_entropy"] = cfg.liger_cross_entropy - if "fused_linear_cross_entropy" in liger_fn_sig.parameters: - kwargs["fused_linear_cross_entropy"] = ( - cfg.liger_fused_linear_cross_entropy - ) - if "rms_norm" in liger_fn_sig.parameters: - kwargs["rms_norm"] = cfg.liger_rms_norm - if "layer_norm" in liger_fn_sig.parameters: - kwargs["layer_norm"] = cfg.liger_layer_norm - if "geglu" in liger_fn_sig.parameters: - kwargs["geglu"] = cfg.liger_glu_activation - elif "swiglu" in liger_fn_sig.parameters: - kwargs["swiglu"] = cfg.liger_glu_activation - if is_main_process(use_environ=True): - LOG.info( - f"Applying LIGER to {cfg.model_config_type} with kwargs: {kwargs}" - ) - apply_liger_fn(**kwargs) - elif cfg.model_config_type == "jamba": - from transformers.models.jamba import modeling_jamba - - from .models.jamba import lce_forward as jamba_lce_forward - - if cfg.liger_rope: - modeling_jamba.apply_rotary_pos_emb = liger_rotary_pos_emb - if cfg.liger_rms_norm: - modeling_jamba.JambaRMSNorm = LigerRMSNorm - if cfg.liger_glu_activation: - modeling_jamba.JambaMLP = LigerSwiGLUMLP - if cfg.liger_layer_norm: - modeling_jamba.nn.LayerNorm = LigerLayerNorm - if cfg.liger_cross_entropy: - from transformers.loss.loss_utils import nn - - nn.functional.cross_entropy = liger_cross_entropy - if cfg.liger_fused_linear_cross_entropy: - modeling_jamba.JambaForCausalLM.forward = jamba_lce_forward - elif cfg.model_config_type == "deepseek_v2": - from accelerate import init_empty_weights - from transformers import AutoModelForCausalLM - - with init_empty_weights(): - model = AutoModelForCausalLM.from_pretrained( - cfg.base_model, trust_remote_code=cfg.trust_remote_code or False - ) - modeling_mod = sys.modules[model.__class__.__module__] - - from .models.deepseekv2 import lce_forward as 
deepseekv2_lce_forward - - if cfg.liger_rope: - # The DeepseekV2 version of RoPE is different than upstream LLaMA. - # See https://github.com/linkedin/Liger-Kernel/issues/129#issuecomment-2313763528 - logging.warning("Fused liger_rope is not supported for DeepseekV2.") - if cfg.liger_glu_activation: - logging.warning("liger_glu_activation is not supported for DeepseekV2.") - if cfg.liger_rms_norm: - modeling_mod.DeepseekV2RMSNorm = LigerRMSNorm - if cfg.liger_glu_activation: - modeling_mod.DeepseekV2MLP.forward = LigerSwiGLUMLP.forward - if cfg.liger_layer_norm: - modeling_mod.DeepseekV2MLP.forward = LigerLayerNorm.forward - if cfg.liger_cross_entropy: - # We do not patch `nn.functional.cross_entropy` for DeepseekV2 as it still uses - # nn.CrossEntropyLoss in the forward method. - modeling_mod.CrossEntropyLoss = LigerCrossEntropyLoss - if cfg.liger_fused_linear_cross_entropy: - modeling_mod.DeepseekV2ForCausalLM.forward = deepseekv2_lce_forward - elif cfg.model_config_type == "llama4": - from axolotl.integrations.liger.models.llama4 import ( - apply_liger_kernel_to_llama4, - ) - - apply_liger_kernel_to_llama4( - cross_entropy=cfg.liger_cross_entropy, - fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy, - glu_activation=cfg.liger_glu_activation, - rms_norm=cfg.liger_rms_norm, - layer_norm=cfg.liger_layer_norm, - ) - elif cfg.model_config_type == "qwen3": - from axolotl.integrations.liger.models.qwen3 import ( - apply_liger_kernel_to_qwen3, - ) - - apply_liger_kernel_to_qwen3( - cross_entropy=cfg.liger_cross_entropy, - fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy, - glu_activation=cfg.liger_glu_activation, - rms_norm=cfg.liger_rms_norm, - layer_norm=cfg.liger_layer_norm, - ) - elif cfg.model_config_type == "qwen3_moe": - from axolotl.integrations.liger.models.qwen3_moe import ( - apply_liger_kernel_to_qwen3_moe, - ) - - apply_liger_kernel_to_qwen3_moe( - cross_entropy=cfg.liger_cross_entropy, - fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy, - glu_activation=cfg.liger_glu_activation, - rms_norm=cfg.liger_rms_norm, - layer_norm=cfg.liger_layer_norm, - ) - elif cfg.model_config_type == "granitemoe": - from liger_kernel.transformers import apply_liger_kernel_to_granite - - apply_liger_kernel_to_granite( - rope=cfg.liger_rope, - cross_entropy=cfg.liger_cross_entropy, - fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy, - rms_norm=cfg.liger_rms_norm, - swiglu=cfg.liger_glu_activation, - ) - else: - logging.warning( - f"Unsupported model config type: {cfg.model_config_type}. Liger not applied." - ) +__all__ = [ + "LigerArgs", + "LigerPlugin", +] diff --git a/src/axolotl/integrations/liger/args.py b/src/axolotl/integrations/liger/args.py index 02ece3143..d5bb10cfd 100644 --- a/src/axolotl/integrations/liger/args.py +++ b/src/axolotl/integrations/liger/args.py @@ -15,12 +15,12 @@ """ Module for handling LIGER input arguments. """ -import logging -from typing import Optional from pydantic import BaseModel, model_validator -LOG = logging.getLogger("axolotl.integrations.liger.args") +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) class LigerArgs(BaseModel): @@ -28,13 +28,13 @@ class LigerArgs(BaseModel): Input args for LIGER. 
""" - liger_rope: Optional[bool] = None - liger_rms_norm: Optional[bool] = None - liger_layer_norm: Optional[bool] = None - liger_swiglu: Optional[bool] = None - liger_glu_activation: Optional[bool] = None - liger_cross_entropy: Optional[bool] = None - liger_fused_linear_cross_entropy: Optional[bool] = None + liger_rope: bool | None = None + liger_rms_norm: bool | None = None + liger_layer_norm: bool | None = None + liger_swiglu: bool | None = None + liger_glu_activation: bool | None = None + liger_cross_entropy: bool | None = None + liger_fused_linear_cross_entropy: bool | None = None @model_validator(mode="before") @classmethod @@ -51,3 +51,33 @@ class LigerArgs(BaseModel): ) data["liger_glu_activation"] = data.pop("liger_swiglu") return data + + @model_validator(mode="before") + @classmethod + def check_tiled_mlp_conflict(cls, data): + if ( + data.get("liger_glu_activation") is True + and data.get("tiled_mlp") is True + and not data.get("tiled_mlp_use_original_mlp") + ): + raise ValueError( + "You cannot have both `liger_glu_activation` and `tiled_mlp` set without `tiled_mlp_use_original_mlp: true`." + ) + return data + + @model_validator(mode="before") + @classmethod + def check_liger_rms_norm_tensor_parallel(cls, data): + if data.get("liger_rms_norm") and data.get("tensor_parallel_size", 1) > 1: + raise ValueError( + "`liger_rms_norm` is incompatible with tensor parallelism, " + "see https://github.com/linkedin/Liger-Kernel/issues/826" + ) + return data + + @model_validator(mode="after") + def check_tensor_parallel_size_liger_fused_linear_cross_entropy(self): + # TODO @SalmanMohammadi this is a larger fix - investigate + if self.tensor_parallel_size > 1 and self.liger_fused_linear_cross_entropy: + raise ValueError("Tensor parallelism is not compatible with liger losses.") + return self diff --git a/src/axolotl/integrations/liger/models/base.py b/src/axolotl/integrations/liger/models/base.py new file mode 100644 index 000000000..f3cf4299a --- /dev/null +++ b/src/axolotl/integrations/liger/models/base.py @@ -0,0 +1,189 @@ +""" +Generic FLCE patch for untested models similar to Llama +""" + +from typing import Optional, Tuple, Union + +import torch +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.trainer.orpo_trainer import _FSDPForwardRedirection +from liger_kernel.utils import PEFT_AVAILABLE +from peft.utils import ModulesToSaveWrapper +from torch.distributed.fsdp import FullyShardedDataParallel +from transformers.modeling_outputs import CausalLMOutputWithPast + +from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix + + +def lce_forward( + self, + *args, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + **kwargs, +) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. 
If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + """ + + # pylint: disable=duplicate-code + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + *args, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = ( + slice(-logits_to_keep, None) + if isinstance(logits_to_keep, int) + else logits_to_keep + ) + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + logits = None + loss = None + + # if in training mode, don't materialize logits + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + # By default, if in training mode, don't materialize logits + skip_logits = self.training and (labels is not None or shift_labels is not None) + + if skip_logits: + loss = lce_maybe_trainable_lm_head( + self, + hidden_states=kept_hidden_states, + hidden_size=self.config.hidden_size, + labels=labels, + shift_labels=shift_labels, + **kwargs, + ) + + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +def lce_maybe_trainable_lm_head( + self, hidden_states, hidden_size, labels, shift_labels, **loss_kwargs +): + lm_head = self.lm_head + + # Unwrap the module if lm_head has been added as trainable module in PEFT LoRA configuration, + # i.e. listed in the modules_to_save field of LoraConfig, so the lm_head weights are read + # from the unwrapped module. + # See https://huggingface.co/docs/peft/package_reference/lora for reference. + if PEFT_AVAILABLE and isinstance(lm_head, ModulesToSaveWrapper): + lm_head = lm_head.modules_to_save.default + + # If FSDP is used and lm_head is trainable, e.g., during full fine-tuning or with LoRA, + # reading the lm_head module weights and calling the kernel must be done within FSDP forward pass + # so the module entire parameters are summoned and kept in memory during the kernel execution. 
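Editorial aside on the `skip_logits` path above: the sketch below is a minimal, non-fused reference for the loss that `LigerForCausalLMLoss` computes. The helper name `naive_causal_lm_loss` is hypothetical and only illustrates the shapes; the fused kernel produces the same loss without ever materializing the full `[batch, seq, vocab]` logits tensor.

import torch
import torch.nn.functional as F

def naive_causal_lm_loss(hidden_states, lm_head_weight, labels, ignore_index=-100):
    # hidden_states: [batch, seq, hidden]; lm_head_weight: [vocab, hidden]
    logits = hidden_states @ lm_head_weight.t()  # the full logits tensor is materialized here
    shift_logits = logits[:, :-1, :].reshape(-1, logits.size(-1))
    shift_labels = labels[:, 1:].reshape(-1)
    return F.cross_entropy(shift_logits.float(), shift_labels, ignore_index=ignore_index)

# toy usage with made-up sizes
hidden = torch.randn(2, 5, 16)
weight = torch.randn(32, 16)
labels = torch.randint(0, 32, (2, 5))
loss = naive_causal_lm_loss(hidden, weight, labels)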
+ if isinstance(lm_head, FullyShardedDataParallel): + return _FSDPForwardRedirection()( + lm_head, + _liger_for_causal_lm_loss, + lm_head.module, + hidden_states, + hidden_size, + labels, + shift_labels, + **loss_kwargs, + ) + + # FSDP is not used so we can read the lm_head weights and call the kernel directly + return _liger_for_causal_lm_loss( + lm_head=self.lm_head, + hidden_states=hidden_states, + hidden_size=hidden_size, + labels=labels, + shift_labels=shift_labels, + **loss_kwargs, + ) + + +def _liger_for_causal_lm_loss( + lm_head, hidden_states, hidden_size, labels, shift_labels, **loss_kwargs +): + return LigerForCausalLMLoss( + hidden_states=hidden_states, + lm_head_weight=lm_head.weight, + labels=labels, + hidden_size=hidden_size, + shift_labels=shift_labels, + **loss_kwargs, + ) + + +def patch_lce_forward( + model_type, +): + try: + # Dynamically import the modeling module and ForCausalLM class + module_path = f"transformers.models.{model_type}.modeling_{model_type}" + model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type) + module = __import__(module_path, fromlist=[f"{model_cls_prefix}ForCausalLM"]) + model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM") + + model_cls.forward = lce_forward + # pylint: disable=duplicate-code + except (ImportError, AttributeError) as e: + raise RuntimeError( + f"Could not import ForCausalLM class for model_type: {model_type}. " + f"Error: {str(e)}" + ) from e diff --git a/src/axolotl/integrations/liger/plugin.py b/src/axolotl/integrations/liger/plugin.py new file mode 100644 index 000000000..89f7c37b7 --- /dev/null +++ b/src/axolotl/integrations/liger/plugin.py @@ -0,0 +1,182 @@ +""" +Liger-Kernel Plugin for Axolotl +""" + +import inspect +import sys + +from axolotl.integrations.base import BasePlugin +from axolotl.utils.logging import get_logger + +from .models.base import patch_lce_forward +from .utils import patch_with_compile_disable + +LOG = get_logger(__name__) + + +class LigerPlugin(BasePlugin): + """ + Plugin for LIGER integration with Axolotl. + """ + + def get_input_args(self): + return "axolotl.integrations.liger.LigerArgs" + + def pre_model_load(self, cfg): + if cfg.torch_compile: + # torch compile will unnecessarily attempt to optimize the triton kernel unless explicitly disabled + import liger_kernel.ops.fused_linear_cross_entropy + + patch_with_compile_disable( + liger_kernel.ops.fused_linear_cross_entropy, + "fused_linear_cross_entropy_forward", + ) + patch_with_compile_disable( + liger_kernel.ops.fused_linear_cross_entropy, + "fused_linear_cross_entropy_backward", + ) + from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss + from liger_kernel.transformers.functional import liger_cross_entropy + from liger_kernel.transformers.layer_norm import LigerLayerNorm + from liger_kernel.transformers.monkey_patch import MODEL_TYPE_TO_APPLY_LIGER_FN + from liger_kernel.transformers.rms_norm import LigerRMSNorm + from liger_kernel.transformers.rope import liger_rotary_pos_emb + from liger_kernel.transformers.swiglu import LigerSwiGLUMLP + + if cfg.liger_cross_entropy and cfg.liger_fused_linear_cross_entropy: + raise ValueError( + "Cannot have both `liger_cross_entropy` and `liger_fused_linear_cross_entropy` set."
+ ) + + if cfg.model_config_type in MODEL_TYPE_TO_APPLY_LIGER_FN: + apply_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN[cfg.model_config_type] + liger_fn_sig = inspect.signature(apply_liger_fn) + kwargs = {} + if "rope" in liger_fn_sig.parameters: + kwargs["rope"] = cfg.liger_rope + if "cross_entropy" in liger_fn_sig.parameters: + kwargs["cross_entropy"] = cfg.liger_cross_entropy + if "fused_linear_cross_entropy" in liger_fn_sig.parameters: + kwargs["fused_linear_cross_entropy"] = ( + cfg.liger_fused_linear_cross_entropy + ) + if "rms_norm" in liger_fn_sig.parameters: + kwargs["rms_norm"] = cfg.liger_rms_norm + if "layer_norm" in liger_fn_sig.parameters: + kwargs["layer_norm"] = cfg.liger_layer_norm + if "geglu" in liger_fn_sig.parameters: + kwargs["geglu"] = cfg.liger_glu_activation + elif "swiglu" in liger_fn_sig.parameters: + kwargs["swiglu"] = cfg.liger_glu_activation + LOG.info(f"Applying LIGER to {cfg.model_config_type} with kwargs: {kwargs}") + apply_liger_fn(**kwargs) + elif cfg.model_config_type == "jamba": + from transformers.models.jamba import modeling_jamba + + from .models.jamba import lce_forward as jamba_lce_forward + + if cfg.liger_rope: + modeling_jamba.apply_rotary_pos_emb = liger_rotary_pos_emb + if cfg.liger_rms_norm: + modeling_jamba.JambaRMSNorm = LigerRMSNorm + if cfg.liger_glu_activation: + modeling_jamba.JambaMLP = LigerSwiGLUMLP + if cfg.liger_layer_norm: + modeling_jamba.nn.LayerNorm = LigerLayerNorm + if cfg.liger_cross_entropy: + from transformers.loss.loss_utils import nn + + nn.functional.cross_entropy = liger_cross_entropy + if cfg.liger_fused_linear_cross_entropy: + modeling_jamba.JambaForCausalLM.forward = jamba_lce_forward + elif cfg.model_config_type == "deepseek_v2": + from accelerate import init_empty_weights + from transformers import AutoModelForCausalLM + + with init_empty_weights(): + model = AutoModelForCausalLM.from_pretrained( + cfg.base_model, trust_remote_code=cfg.trust_remote_code or False + ) + modeling_mod = sys.modules[model.__class__.__module__] + + from .models.deepseekv2 import lce_forward as deepseekv2_lce_forward + + if cfg.liger_rope: + # The DeepseekV2 version of RoPE is different than upstream LLaMA. + # See https://github.com/linkedin/Liger-Kernel/issues/129#issuecomment-2313763528 + LOG.warning("Fused liger_rope is not supported for DeepseekV2.") + if cfg.liger_rms_norm: + modeling_mod.DeepseekV2RMSNorm = LigerRMSNorm + if cfg.liger_glu_activation: + modeling_mod.DeepseekV2MLP.forward = LigerSwiGLUMLP.forward + if cfg.liger_layer_norm: + LOG.warning("liger_layer_norm is not supported for DeepseekV2.") + if cfg.liger_cross_entropy: + # We do not patch `nn.functional.cross_entropy` for DeepseekV2 as it still uses + # nn.CrossEntropyLoss in the forward method. 
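A short aside on the dispatch above: `MODEL_TYPE_TO_APPLY_LIGER_FN` maps a model type to an `apply_liger_kernel_to_*` function, and the plugin forwards only the flags that the target function's signature accepts. The sketch below illustrates that pattern with hypothetical names (`fake_apply_fn`, `build_liger_kwargs`); it is not repository code.

import inspect

def fake_apply_fn(rope=True, rms_norm=True, swiglu=True):
    return {"rope": rope, "rms_norm": rms_norm, "swiglu": swiglu}

def build_liger_kwargs(apply_fn, cfg_flags):
    sig = inspect.signature(apply_fn)
    # forward only the flags the target function actually accepts
    return {name: value for name, value in cfg_flags.items() if name in sig.parameters}

kwargs = build_liger_kwargs(fake_apply_fn, {"rope": True, "rms_norm": False, "geglu": True})
assert kwargs == {"rope": True, "rms_norm": False}  # `geglu` is dropped for this swiglu-style function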
+ modeling_mod.CrossEntropyLoss = LigerCrossEntropyLoss + if cfg.liger_fused_linear_cross_entropy: + modeling_mod.DeepseekV2ForCausalLM.forward = deepseekv2_lce_forward + elif cfg.model_config_type == "llama4": + from axolotl.integrations.liger.models.llama4 import ( + apply_liger_kernel_to_llama4, + ) + + apply_liger_kernel_to_llama4( + cross_entropy=cfg.liger_cross_entropy, + fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy, + glu_activation=cfg.liger_glu_activation, + rms_norm=cfg.liger_rms_norm, + layer_norm=cfg.liger_layer_norm, + ) + elif cfg.model_config_type == "qwen3": + from axolotl.integrations.liger.models.qwen3 import ( + apply_liger_kernel_to_qwen3, + ) + + apply_liger_kernel_to_qwen3( + cross_entropy=cfg.liger_cross_entropy, + fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy, + glu_activation=cfg.liger_glu_activation, + rms_norm=cfg.liger_rms_norm, + layer_norm=cfg.liger_layer_norm, + ) + elif cfg.model_config_type == "qwen3_moe": + from axolotl.integrations.liger.models.qwen3_moe import ( + apply_liger_kernel_to_qwen3_moe, + ) + + apply_liger_kernel_to_qwen3_moe( + cross_entropy=cfg.liger_cross_entropy, + fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy, + glu_activation=cfg.liger_glu_activation, + rms_norm=cfg.liger_rms_norm, + layer_norm=cfg.liger_layer_norm, + ) + elif cfg.model_config_type == "granitemoe": + from liger_kernel.transformers import apply_liger_kernel_to_granite + + apply_liger_kernel_to_granite( + rope=cfg.liger_rope, + cross_entropy=cfg.liger_cross_entropy, + fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy, + rms_norm=cfg.liger_rms_norm, + swiglu=cfg.liger_glu_activation, + ) + elif cfg.liger_fused_linear_cross_entropy: + try: + patch_lce_forward(cfg.model_config_type) + LOG.warning_once( + f"Applied ONLY liger_fused_linear_cross_entropy generic patches for model type: {cfg.model_config_type}" + ) + LOG.warning_once( + f"Liger + {cfg.model_config_type} generic FLCE support is experimental and may not work as expected." + ) + except RuntimeError: + LOG.warning( + f"Unsupported model config type: {cfg.model_config_type}. Liger not applied." + ) + else: + LOG.warning( + f"Unsupported model config type: {cfg.model_config_type}. Liger not applied." + ) diff --git a/src/axolotl/integrations/llm_compressor/plugin.py b/src/axolotl/integrations/llm_compressor/plugin.py index d986d51f4..57d506a57 100644 --- a/src/axolotl/integrations/llm_compressor/plugin.py +++ b/src/axolotl/integrations/llm_compressor/plugin.py @@ -3,7 +3,6 @@ Sparse Finetuning plugin for Axolotl — enables handling of sparse neural networks by maintaining masks for zero weights during training.
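An illustrative aside on the sparse-finetuning idea named in the docstring above: weights that start at zero are kept at zero by re-applying a mask after each optimizer step. This is a toy sketch, not the plugin's implementation.

import torch
from torch import nn

layer = nn.Linear(8, 8)
with torch.no_grad():
    layer.weight[::2] = 0.0                  # pretend the model was pruned
mask = (layer.weight != 0).float()           # 1 where weights are allowed to train

opt = torch.optim.SGD(layer.parameters(), lr=0.1)
loss = layer(torch.randn(4, 8)).pow(2).mean()
loss.backward()
opt.step()
with torch.no_grad():
    layer.weight *= mask                     # restore the zeros after the update
assert torch.all(layer.weight[::2] == 0)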
""" -import logging from functools import wraps from typing import Any, Callable, Concatenate, ParamSpec, TypeVar @@ -16,11 +15,12 @@ from transformers.trainer_callback import TrainerCallback, TrainerControl, Train from transformers.training_args import TrainingArguments from axolotl.integrations.base import BasePlugin +from axolotl.utils.logging import get_logger P = ParamSpec("P") # Params for generic function signatures R = TypeVar("R") # Return type for generic function signatures -LOG = logging.getLogger("axolotl.integrations.llm_compressor") +LOG = get_logger(__name__) class LLMCompressorCallbackHandler(TrainerCallback): diff --git a/src/axolotl/integrations/spectrum/__init__.py b/src/axolotl/integrations/spectrum/__init__.py index 6059e7951..9f66aef97 100644 --- a/src/axolotl/integrations/spectrum/__init__.py +++ b/src/axolotl/integrations/spectrum/__init__.py @@ -17,14 +17,16 @@ Spectrum Plugin to automatically generate unfrozen parameters based on SNR data. """ import json -import logging import requests from axolotl.integrations.base import BasePlugin +from axolotl.utils.logging import get_logger from .args import SpectrumArgs # pylint: disable=unused-import. # noqa: F401 +LOG = get_logger(__name__) + def _generate_unfrozen_params_yaml(snr_data, top_fraction=0.5): unfrozen_parameters = {} @@ -83,17 +85,17 @@ class SpectrumPlugin(BasePlugin): except FileNotFoundError: pass except Exception as exc: # pylint: disable=broad-exception-caught - logging.warning(f"Failed to read SNR data from {snr_path}: {exc}") + LOG.warning(f"Failed to read SNR data from {snr_path}: {exc}") if not snr_data: try: snr_data = requests.get(snr_url, timeout=60).json() except requests.exceptions.RequestException as exc: - logging.warning(f"Failed to fetch SNR data from {snr_url}: {exc}") + LOG.warning(f"Failed to fetch SNR data from {snr_url}: {exc}") return # also catch json parsing errors except json.JSONDecodeError as exc: - logging.warning(f"Failed to parse SNR data from {snr_url}: {exc}") + LOG.warning(f"Failed to parse SNR data from {snr_url}: {exc}") return unfrozen_parameters = _generate_unfrozen_params_yaml( diff --git a/src/axolotl/kernels/lora.py b/src/axolotl/kernels/lora.py index 03fca6df4..fb45f2aa7 100644 --- a/src/axolotl/kernels/lora.py +++ b/src/axolotl/kernels/lora.py @@ -14,6 +14,7 @@ from typing import Callable import torch from bitsandbytes.functional import QuantState from torch import nn +from torch.distributed.tensor import DTensor from .geglu import geglu_backward, geglu_forward from .quantize import dequantize @@ -25,6 +26,7 @@ def get_lora_parameters( proj: nn.Module, ) -> tuple[ torch.Tensor, + torch.Tensor | None, QuantState | None, torch.Tensor | None, torch.Tensor | None, @@ -37,39 +39,54 @@ def get_lora_parameters( proj: The projection module to extract parameters from. Returns: - A tuple containing the base weight matrix, quantization state, LoRA A matrix, - LoRA B matrix, and scaling factor. States and matrices may be None if not - available. + A tuple containing the base weights, quantization state, LoRA A and B weights, + scaling factor, and base layer bias. Quant state, weights, and bias may be + `None` if not available. 
""" # For DPO or disabled adapters base_layer = proj.base_layer if hasattr(proj, "base_layer") else proj W = base_layer.weight + b = base_layer.bias if not hasattr(proj, "disable_adapters") or proj.disable_adapters or proj.merged: quant_state = getattr(W, "quant_state", None) - return W, quant_state, None, None, None + return W, b, quant_state, None, None, None + + quant_state = getattr(W, "quant_state", None) active_adapter = ( proj.active_adapters[0] if hasattr(proj, "active_adapters") else proj.active_adapter ) - A = proj.lora_A[active_adapter].weight - B = proj.lora_B[active_adapter].weight + + linear_A = proj.lora_A[active_adapter] + linear_B = proj.lora_B[active_adapter] + + # This manual unsharding is needed for FSDP2 + LoRA kernels compatibility. + # We fuse linear layers + LoRA adapters calculations into a single + # torch.autograd.Function, bypassing the registered unshard / reshard behavior. + # Note that we don't apply resharding later in this module (it gets messy quickly), + # but LoRA parameters are generally small enough that this is not an issue. + if isinstance(linear_A.weight, DTensor): + linear_A.unshard() + linear_B.unshard() + + A = linear_A.weight + B = linear_B.weight s = proj.scaling[active_adapter] - quant_state = getattr(W, "quant_state", None) - - return W, quant_state, A, B, s + return W, b, quant_state, A, B, s def matmul_lora( X: torch.Tensor, W: torch.Tensor, - W_quant: QuantState, - A: torch.Tensor, - B: torch.Tensor, - s: float, + b: torch.Tensor | None, + W_quant: QuantState | None, + A: torch.Tensor | None, + B: torch.Tensor | None, + s: float | None, out: torch.Tensor | None = None, ) -> torch.Tensor: """ @@ -90,20 +107,22 @@ def matmul_lora( dtype = X.dtype W = dequantize(W.t(), W_quant) + reshape = False if X.dim() == 3: batch, seq_len, _ = X.shape X = X.view(-1, X.shape[-1]) reshape = True - else: - reshape = False out = torch.matmul(X, W, out=out) if W_quant is not None: del W if A is not None: - A, B = A.t(), B.t() - out += (X @ A.to(dtype)) @ (s * B.to(dtype)) + A, B = A.t().to(dtype), B.t().to(dtype) # type: ignore[union-attr] + out += s * X @ A @ B + + if b is not None: + out += b return out.view(batch, seq_len, -1) if reshape else out @@ -117,17 +136,20 @@ class LoRA_MLP(torch.autograd.Function): ctx, X: torch.Tensor, gate_weight: torch.Tensor, - gate_quant: object | None, + gate_bias: torch.Tensor | None, + gate_quant: QuantState | None, gate_A: torch.Tensor | None, gate_B: torch.Tensor | None, gate_scale: float, up_weight: torch.Tensor, - up_quant: object | None, + up_bias: torch.Tensor | None, + up_quant: QuantState | None, up_A: torch.Tensor | None, up_B: torch.Tensor | None, up_scale: float, down_weight: torch.Tensor, - down_quant: object | None, + down_bias: torch.Tensor | None, + down_quant: QuantState | None, down_A: torch.Tensor | None, down_B: torch.Tensor | None, down_scale: float, @@ -142,20 +164,22 @@ class LoRA_MLP(torch.autograd.Function): ctx: Autograd context X: Input features gate_weight: Gate projection weight + gate_bias: Gate projection bias gate_quant: Gate quantization state gate_A: Gate LoRA A matrix gate_B: Gate LoRA B matrix gate_scale: Gate LoRA scale - up_weight: Up-projection weight - up_quant: Up-projection quantization state - up_A: Up-projection LoRA A matrix - up_B: Up-projection LoRA B matrix - up_scale: Up-projection LoRA scale - down_weight: Down-projection weight - down_quant: Down-projection quantization state - down_A: Down-projection LoRA A matrix - down_B: Down-projection LoRA B matrix - down_scale: 
Down-projection LoRA scale + up_weight: Up projection weight + up_quant: Up projection quantization state + up_A: Up projection LoRA A matrix + up_B: Up projection LoRA B matrix + up_scale: Up projection LoRA scale + down_weight: Down projection weight + down_bias: Down projection bias + down_quant: Down projection quantization state + down_A: Down projection LoRA A matrix + down_B: Down projection LoRA B matrix + down_scale: Down projection LoRA scale activation_fn: Forward activation function activation_fn_backward: Backward activation function inplace: Whether to perform operations in-place @@ -164,15 +188,17 @@ class LoRA_MLP(torch.autograd.Function): Output transformed by multi-layer perceptron and activation function """ # Compute projections - gate = matmul_lora(X, gate_weight, gate_quant, gate_A, gate_B, gate_scale) - up = matmul_lora(X, up_weight, up_quant, up_A, up_B, up_scale) + gate = matmul_lora( + X, gate_weight, gate_bias, gate_quant, gate_A, gate_B, gate_scale + ) + up = matmul_lora(X, up_weight, up_bias, up_quant, up_A, up_B, up_scale) # Activation hidden = activation_fn(gate, up) # Down projection output = matmul_lora( - hidden, down_weight, down_quant, down_A, down_B, down_scale + hidden, down_weight, down_bias, down_quant, down_A, down_B, down_scale ) # Save for backward @@ -195,22 +221,26 @@ class LoRA_MLP(torch.autograd.Function): torch.Tensor | None, None, None, + None, torch.Tensor | None, torch.Tensor | None, None, None, None, + None, torch.Tensor | None, torch.Tensor | None, None, None, None, + None, torch.Tensor | None, torch.Tensor | None, None, None, None, None, + None, ]: """ Performs backward pass computation for LoRA MLP. @@ -222,7 +252,7 @@ class LoRA_MLP(torch.autograd.Function): Returns: Tuple containing gradients for all inputs from forward pass: - Input gradient tensor (or `None`) - - `None` for weights/quantization states + - `None` for weights/biases/quantization states - LoRA A/B matrix gradients (or `None`) - `None` for scaling factors - `None` for activation functions and flags @@ -265,9 +295,10 @@ class LoRA_MLP(torch.autograd.Function): dtype = X.dtype # Down projection - DW = matmul_lora( + grad_down = matmul_lora( grad_output, down_weight.t(), + None, down_quant, down_B, down_A, @@ -275,24 +306,24 @@ class LoRA_MLP(torch.autograd.Function): ) # Activation backward - h, grad_gate, grad_up = ctx.activation_fn_backward(DW, gate, up) + h, grad_gate, grad_up = ctx.activation_fn_backward(grad_down, gate, up) # Initialize and compute LoRA gradients d_down_A = d_down_B = d_up_A = d_up_B = d_gate_A = d_gate_B = None - if down_A is not None: + if down_A is not None and down_B is not None: d_down_A = h.t() @ (grad_output @ down_B.t()) d_down_B = (down_A.t() @ h.t()) @ grad_output d_down_A *= down_scale d_down_B *= down_scale - if up_A is not None: + if up_A is not None and up_B is not None: d_up_A = X.t() @ (grad_up @ up_B.t()) d_up_B = (up_A.t() @ X.t()) @ grad_up d_up_A *= up_scale d_up_B *= up_scale - if gate_A is not None: + if gate_A is not None and gate_B is not None: d_gate_A = X.t() @ (grad_gate @ gate_B.t()) d_gate_B = (gate_A.t() @ X.t()) @ grad_gate d_gate_A *= gate_scale @@ -311,15 +342,15 @@ class LoRA_MLP(torch.autograd.Function): del up_weight # Note the .to(dtype) only where mixing LoRA with base weights - if up_A is not None: + if up_A is not None and up_B is not None: dX += grad_up @ up_B.to(dtype).t() @ (up_scale * up_A.to(dtype).t()) # Gate projection gradients - gate_weight = dequantize(gate_weight.t(), gate_quant) - dX += grad_gate @ 
gate_weight.t() + gate_weight = dequantize(gate_weight, gate_quant) + dX += grad_gate @ gate_weight del gate_weight - if gate_A is not None: + if gate_A is not None and gate_B is not None: dX += ( grad_gate @ gate_B.to(dtype).t() @@ -334,22 +365,26 @@ class LoRA_MLP(torch.autograd.Function): dX, None, None, + None, d_gate_A.t() if d_gate_A is not None else None, d_gate_B.t() if d_gate_B is not None else None, None, None, None, + None, d_up_A.t() if d_up_A is not None else None, d_up_B.t() if d_up_B is not None else None, None, None, None, + None, d_down_A.t() if d_down_A is not None else None, d_down_B.t() if d_down_B is not None else None, None, None, None, None, + None, ) @@ -364,23 +399,26 @@ def apply_lora_mlp_swiglu(self, X: torch.Tensor, inplace: bool = True) -> torch. Returns: Output tensor after applying LoRA-adapted MLP with SwiGLU activation """ - gateW, gateW_quant, gateA, gateB, gateS = get_lora_parameters(self.gate_proj) - upW, upW_quant, upA, upB, upS = get_lora_parameters(self.up_proj) - downW, downW_quant, downA, downB, downS = get_lora_parameters(self.down_proj) + gateW, gateb, gateW_quant, gateA, gateB, gateS = get_lora_parameters(self.gate_proj) + upW, upb, upW_quant, upA, upB, upS = get_lora_parameters(self.up_proj) + downW, downb, downW_quant, downA, downB, downS = get_lora_parameters(self.down_proj) out = LoRA_MLP.apply( X, gateW, + gateb, gateW_quant, gateA, gateB, gateS, upW, + upb, upW_quant, upA, upB, upS, downW, + downb, downW_quant, downA, downB, @@ -404,22 +442,25 @@ def apply_lora_mlp_geglu(self, X: torch.Tensor, inplace: bool = True) -> torch.T Returns: Output tensor after applying LoRA-adapted MLP with GEGLU activation """ - gateW, gateW_quant, gateA, gateB, gateS = get_lora_parameters(self.gate_proj) - upW, upW_quant, upA, upB, upS = get_lora_parameters(self.up_proj) - downW, downW_quant, downA, downB, downS = get_lora_parameters(self.down_proj) + gateW, gateb, gateW_quant, gateA, gateB, gateS = get_lora_parameters(self.gate_proj) + upW, upb, upW_quant, upA, upB, upS = get_lora_parameters(self.up_proj) + downW, downb, downW_quant, downA, downB, downS = get_lora_parameters(self.down_proj) out = LoRA_MLP.apply( X, gateW, + gateb, gateW_quant, gateA, gateB, gateS, upW, + upb, upW_quant, upA, upB, upS, downW, + downb, downW_quant, downA, downB, @@ -446,16 +487,19 @@ class LoRA_QKV(torch.autograd.Function): ctx: torch.autograd.function.FunctionCtx, X: torch.Tensor, q_weight: torch.Tensor, + q_bias: torch.Tensor | None, q_quant: QuantState | None, q_A: torch.Tensor | None, q_B: torch.Tensor | None, q_scale: float, k_weight: torch.Tensor, + k_bias: torch.Tensor | None, k_quant: QuantState | None, k_A: torch.Tensor | None, k_B: torch.Tensor | None, k_scale: float, v_weight: torch.Tensor, + v_bias: torch.Tensor | None, v_quant: QuantState | None, v_A: torch.Tensor | None, v_B: torch.Tensor | None, @@ -469,16 +513,19 @@ class LoRA_QKV(torch.autograd.Function): ctx: Autograd context X: Input tensor q_weight: Query projection weight + q_bias: Query projection bias q_quant: Query quantization state q_A: Query LoRA A matrix q_B: Query LoRA B matrix q_scale: Query LoRA scale k_weight: Key projection weight + k_bias: Key projection bias k_quant: Key quantization state k_A: Key LoRA A matrix k_B: Key LoRA B matrix k_scale: Key LoRA scale v_weight: Value projection weight + v_bias: Value projection bias v_quant: Value quantization state v_A: Value LoRA A matrix v_B: Value LoRA B matrix @@ -488,20 +535,21 @@ class LoRA_QKV(torch.autograd.Function): Returns: Tuple of (Query, 
Key, Value) projection tensors """ - Q = matmul_lora(X, q_weight, q_quant, q_A, q_B, q_scale) - K = matmul_lora(X, k_weight, k_quant, k_A, k_B, k_scale) - V = matmul_lora(X, v_weight, v_quant, v_A, v_B, v_scale) + Q = matmul_lora(X, q_weight, q_bias, q_quant, q_A, q_B, q_scale) + K = matmul_lora(X, k_weight, k_bias, k_quant, k_A, k_B, k_scale) + V = matmul_lora(X, v_weight, v_bias, v_quant, v_A, v_B, v_scale) ctx.save_for_backward(X, q_A, q_B, k_A, k_B, v_A, v_B) ctx.scales = (q_scale, k_scale, v_scale) ctx.quants = (q_quant, k_quant, v_quant) ctx.weights = (q_weight, k_weight, v_weight) + ctx.biases = (q_bias, k_bias, v_bias) ctx.inplace = inplace return Q, K, V @staticmethod - @torch_amp_custom_fwd + @torch_amp_custom_bwd def backward( ctx: torch.autograd.function.FunctionCtx, q_grad: torch.Tensor, @@ -511,16 +559,19 @@ class LoRA_QKV(torch.autograd.Function): torch.Tensor, None, None, + None, torch.Tensor | None, torch.Tensor | None, None, None, None, + None, torch.Tensor | None, torch.Tensor | None, None, None, None, + None, torch.Tensor | None, torch.Tensor | None, None, @@ -608,31 +659,31 @@ class LoRA_QKV(torch.autograd.Function): # Transpose gradients if needed if d_A_q is not None: d_A_q = d_A_q.t() - if d_B_q is not None: - d_B_q = d_B_q.t() + d_B_q = d_B_q.t() # type: ignore[union-attr] if d_A_k is not None: d_A_k = d_A_k.t() - if d_B_k is not None: - d_B_k = d_B_k.t() + d_B_k = d_B_k.t() # type: ignore[union-attr] if d_A_v is not None: d_A_v = d_A_v.t() - if d_B_v is not None: - d_B_v = d_B_v.t() + d_B_v = d_B_v.t() # type: ignore[union-attr] return ( grad_X.view(batch, seq_len, -1), None, None, + None, d_A_q, d_B_q, None, None, None, + None, d_A_k, d_B_k, None, None, None, + None, d_A_v, d_B_v, None, @@ -653,22 +704,25 @@ def apply_lora_qkv( Returns: Tuple of (Query, Key, Value) projection tensors """ - QW, QW_quant, QA, QB, QS = get_lora_parameters(self.q_proj) - KW, KW_quant, KA, KB, KS = get_lora_parameters(self.k_proj) - VW, VW_quant, VA, VB, VS = get_lora_parameters(self.v_proj) + QW, Qb, QW_quant, QA, QB, QS = get_lora_parameters(self.q_proj) + KW, Kb, KW_quant, KA, KB, KS = get_lora_parameters(self.k_proj) + VW, Vb, VW_quant, VA, VB, VS = get_lora_parameters(self.v_proj) Q, K, V = LoRA_QKV.apply( X, QW, + Qb, QW_quant, QA, QB, QS, KW, + Kb, KW_quant, KA, KB, KS, VW, + Vb, VW_quant, VA, VB, @@ -688,10 +742,11 @@ class LoRA_O(torch.autograd.Function): ctx: torch.autograd.function.FunctionCtx, X: torch.Tensor, W: torch.Tensor, + b: torch.Tensor, W_quant: QuantState | None, - A: torch.Tensor | None, - B: torch.Tensor | None, - S: float, + A: torch.Tensor, + B: torch.Tensor, + s: float, ) -> torch.Tensor: """ Forward pass for output projection with LoRA. 
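As a check on the hand-written backward passes in these autograd Functions: the adapter and input gradients have closed forms that can be verified against autograd. The sketch below does this for a single projection Y = X @ W.t() + s * (X @ A.t()) @ B.t(); it is illustrative only, and the variable names do not correspond to repository symbols.

import torch

n, d_in, d_out, r, s = 4, 8, 6, 2, 0.25
X = torch.randn(n, d_in, requires_grad=True)
W = torch.randn(d_out, d_in)                      # frozen base weight
A = torch.randn(r, d_in, requires_grad=True)      # lora_A.weight
B = torch.randn(d_out, r, requires_grad=True)     # lora_B.weight

Y = X @ W.t() + s * (X @ A.t()) @ B.t()
dY = torch.randn_like(Y)                          # upstream gradient
Y.backward(dY)

d_A = s * (dY @ B).t() @ X                        # analogous to the d_*_A terms above
d_B = s * dY.t() @ (X @ A.t())                    # analogous to the d_*_B terms above
d_X = dY @ W + s * dY @ B @ A                     # analogous to dX above
assert torch.allclose(A.grad, d_A, atol=1e-5)
assert torch.allclose(B.grad, d_B, atol=1e-5)
assert torch.allclose(X.grad, d_X, atol=1e-5)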
@@ -700,19 +755,20 @@ class LoRA_O(torch.autograd.Function): ctx: Autograd context X: Input tensor W: Output projection weight + b: Output projection bias W_quant: Weight quantization state A: LoRA A matrix B: LoRA B matrix - S: LoRA scaling factor + s: LoRA scaling factor Returns: - Output projection tensor + Output projection result """ - XW = matmul_lora(X, W, W_quant, A, B, S) + XW = matmul_lora(X, W, b, W_quant, A, B, s) ctx.custom_saved_tensors = ( W, W_quant, - S, + s, ) ctx.save_for_backward(A, B, X) @@ -727,8 +783,9 @@ class LoRA_O(torch.autograd.Function): torch.Tensor, None, None, - torch.Tensor | None, - torch.Tensor | None, + None, + torch.Tensor, + torch.Tensor, None, ]: """ @@ -741,7 +798,7 @@ class LoRA_O(torch.autograd.Function): Returns: Tuple containing gradients for all forward inputs """ - W, W_quant, S = ctx.custom_saved_tensors + W, W_quant, s = ctx.custom_saved_tensors A, B, X = ctx.saved_tensors batch, seq_len, hd = X.shape @@ -751,17 +808,19 @@ class LoRA_O(torch.autograd.Function): # Weight projection dY_X = X.t() @ dY - d_A = S * dY_X @ B - d_B = S * A @ dY_X + d_A = s * dY_X @ B + d_B = s * A @ dY_X # Get derivative for dX W = dequantize(W.t(), W_quant) dX = dY @ W.t() del W - dX += dY @ B.to(dtype) @ (S * A.to(dtype)) - # W, W_quant, A, B, S - return dX.view(batch, seq_len, hd), None, None, d_A.t(), d_B.t(), None + A, B = A.to(dtype), B.to(dtype) + dX += s * dY @ B @ A + + # W, b, W_quant, A, B, s + return dX.view(batch, seq_len, hd), None, None, None, d_A.t(), d_B.t(), None def apply_lora_o(self, X: torch.Tensor) -> torch.Tensor: @@ -774,7 +833,7 @@ def apply_lora_o(self, X: torch.Tensor) -> torch.Tensor: Returns: Transformed output tensor """ - OW, OW_quant, OA, OB, OS = get_lora_parameters(self.o_proj) - output = LoRA_O.apply(X, OW, OW_quant, OA, OB, OS) + OW, Ob, OW_quant, OA, OB, OS = get_lora_parameters(self.o_proj) + output = LoRA_O.apply(X, OW, Ob, OW_quant, OA, OB, OS) return output diff --git a/src/axolotl/loaders/adapter.py b/src/axolotl/loaders/adapter.py index f7a484e9b..db28206b6 100644 --- a/src/axolotl/loaders/adapter.py +++ b/src/axolotl/loaders/adapter.py @@ -1,6 +1,5 @@ """Adapter loading functionality, including LoRA / QLoRA and associated utils""" -import logging import os import types from typing import Any @@ -21,8 +20,9 @@ from transformers import PreTrainedModel from axolotl.loaders.utils import get_linear_embedding_layers from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) def setup_quantized_meta_for_peft(model: torch.nn.Module): @@ -76,6 +76,7 @@ def load_lora( config_only: bool = False, ) -> tuple[PreTrainedModel | PeftModel | PeftMixedModel | None, PeftConfig | None]: lora_target_modules = cfg.lora_target_modules or [] + lora_target_parameters = cfg.lora_target_parameters or [] if cfg.lora_target_linear: linear_names = find_all_linear_names(model) @@ -106,6 +107,7 @@ def load_lora( r=cfg.lora_r, lora_alpha=cfg.lora_alpha, target_modules=lora_target_modules, + target_parameters=lora_target_parameters, layers_to_transform=cfg.peft_layers_to_transform, layers_pattern=cfg.peft_layers_pattern, lora_dropout=cfg.lora_dropout, @@ -122,9 +124,9 @@ def load_lora( rank = int(os.environ.get("LOCAL_RANK", 0)) if ( - cfg.fsdp + cfg.fsdp_config and cfg.adapter - and cfg.fsdp_config.fsdp_cpu_ram_efficient_loading + and cfg.fsdp_config.cpu_ram_efficient_loading and rank != 0 ): setup_quantized_meta_for_peft(model) @@ -152,9 +154,9 @@ def load_lora( 
"Exception caught during model.print_trainable_parameters(): %s", exc ) elif ( - cfg.fsdp + cfg.fsdp_config and cfg.adapter - and cfg.fsdp_config.fsdp_cpu_ram_efficient_loading + and cfg.fsdp_config.cpu_ram_efficient_loading and rank != 0 ): setup_quantized_peft_meta_for_training(model) diff --git a/src/axolotl/loaders/adapters/__init__.py b/src/axolotl/loaders/adapters/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/axolotl/loaders/constants.py b/src/axolotl/loaders/constants.py index c08518dd6..3fabf9d94 100644 --- a/src/axolotl/loaders/constants.py +++ b/src/axolotl/loaders/constants.py @@ -2,6 +2,7 @@ from transformers import ( Gemma3ForConditionalGeneration, + Gemma3nForConditionalGeneration, Llama4ForConditionalGeneration, LlavaForConditionalGeneration, Mistral3ForConditionalGeneration, @@ -18,4 +19,13 @@ MULTIMODAL_AUTO_MODEL_MAPPING = { "qwen2_5_vl": Qwen2_5_VLForConditionalGeneration, "mistral3": Mistral3ForConditionalGeneration, "gemma3": Gemma3ForConditionalGeneration, + "gemma3n": Gemma3nForConditionalGeneration, } + +try: + from transformers import VoxtralForConditionalGeneration + + # transformers >4.53.2 + MULTIMODAL_AUTO_MODEL_MAPPING["voxtral"] = VoxtralForConditionalGeneration +except ImportError: + pass diff --git a/src/axolotl/loaders/model.py b/src/axolotl/loaders/model.py index d7ac84a6d..6bf1f149b 100644 --- a/src/axolotl/loaders/model.py +++ b/src/axolotl/loaders/model.py @@ -1,9 +1,8 @@ -"""Model loader class implementation for loading, configuring, and patching various -models. +""" +Model loader class implementation for loading, configuring, and patching various models. """ import gc -import logging import math import os from functools import cached_property @@ -15,7 +14,15 @@ import torch import transformers import transformers.modeling_utils from accelerate import init_empty_weights -from peft import PeftConfig, PeftMixedModel, PeftModel, prepare_model_for_kbit_training +from accelerate.parallelism_config import ParallelismConfig +from peft import ( + PeftConfig, + PeftMixedModel, + PeftModel, + PeftModelForCausalLM, + prepare_model_for_kbit_training, +) +from torch.distributed import DeviceMesh from transformers import ( AutoModelForCausalLM, AutoModelForVision2Seq, @@ -44,13 +51,15 @@ from axolotl.models.mamba import fix_mamba_attn_for_loss from axolotl.utils.bench import log_gpu_memory_usage from axolotl.utils.dict import DictDefault from axolotl.utils.distributed import ( + build_parallelism_config, get_device_count, get_device_type, ) +from axolotl.utils.logging import get_logger from axolotl.utils.model_shard_quant import load_sharded_model_quant from axolotl.utils.schemas.enums import RLType -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) PLUGIN_MANAGER = PluginManager.get_instance() @@ -81,6 +90,10 @@ class ModelLoader: `AutoModelForCausalLM`). 
""" + use_parallel_config: bool | None = False + parallelism_config: ParallelismConfig | None = None + device_mesh: DeviceMesh | None = None + def __init__( self, cfg: DictDefault, @@ -134,12 +147,17 @@ class ModelLoader: """Check if flash attention is installed.""" return find_spec("flash_attn") is not None - @cached_property - def qlora_fsdp(self): - """Property that determines if FSDP with QLoRA is enabled.""" - return self.cfg.fsdp and self.cfg.adapter == "qlora" + @property + def is_fsdp_enabled(self): + """Property that determines if FSDP is enabled.""" + return self.cfg.fsdp_config is not None or self.cfg.fsdp is not None - def load(self) -> tuple[PreTrainedModel, PeftConfig | None]: + @property + def is_qlora_and_fsdp_enabled(self): + """Property that determines if FSDP with QLoRA is enabled.""" + return self.is_fsdp_enabled and self.cfg.adapter == "qlora" + + def load(self) -> tuple[PreTrainedModel | PeftModelForCausalLM, PeftConfig | None]: """Load and prepare the model with all configurations and patches. Returns: @@ -151,6 +169,7 @@ class ModelLoader: # Build the model PLUGIN_MANAGER.pre_model_load(self.cfg) + self.patch_manager.apply_post_plugin_pre_model_load_patches() skip_move_to_device = self._build_model() PLUGIN_MANAGER.post_model_build(self.cfg, self.model) @@ -171,10 +190,26 @@ class ModelLoader: def _apply_pre_model_load_setup(self): """Apply patches and setup configurations before model loading.""" + if self.use_parallel_config is not None: + self.use_parallel_config = ( + self.cfg.fsdp_config + or (self.cfg.tensor_parallel_size and self.cfg.tensor_parallel_size > 1) + or ( + self.cfg.context_parallel_size + and self.cfg.context_parallel_size > 1 + ) + ) + if self.cfg.fsdp_config and self.cfg.fsdp_version != 2: + self.use_parallel_config = False + + if self.use_parallel_config: + self._set_parallel_config() self._set_auto_model_loader() self._set_device_map_config() if self.cfg.revision_of_model: self.model_kwargs["revision"] = self.cfg.revision_of_model + if self.cfg.use_kernels: + self.model_kwargs["use_kernels"] = self.cfg.use_kernels self._set_quantization_config() self._set_attention_config() @@ -183,14 +218,25 @@ class ModelLoader: # Handle PeftModel if needed if ( isinstance(self.model, (peft.PeftModel, peft.PeftModelForCausalLM)) - and not self.qlora_fsdp + and not self.is_qlora_and_fsdp_enabled ): self.model = self.model.merge_and_unload() + self._apply_activation_checkpointing() self._resize_token_embeddings() self._adjust_model_config() - self._log_memory_usage() self._configure_embedding_dtypes() + self._configure_qat() + log_gpu_memory_usage(LOG, "Memory usage after model load", 0) + + def _apply_activation_checkpointing(self): + if self.cfg.activation_offloading is True: + from axolotl.core.trainers.mixins.activation_checkpointing import ( + ac_wrap_hf_model, + ) + + # ^^ importing this at the module level breaks plugins + ac_wrap_hf_model(self.model) def _resize_token_embeddings(self): """Resize token embeddings if needed.""" @@ -244,22 +290,13 @@ class ModelLoader: ): self.model.config.eos_token_id = self.tokenizer.eos_token_id - def _log_memory_usage(self): - """Log device memory usage after model load.""" - if hasattr(self.model, "device") and self.model.device.type in ( - "cuda", - "mps", - "npu", - ): - log_gpu_memory_usage(LOG, "after model load", self.model.device) - def _configure_embedding_dtypes(self): """Configure embedding module dtypes.""" # Get embedding modules embedding_modules = get_linear_embedding_layers(self.cfg.model_config_type) # 
Initial dtype conversion - if not self.cfg.fsdp: + if not self.is_fsdp_enabled: # We don't run this during FSDP because this will leave mixed and bfloat16 # dtypes in the model which FSDP doesn't like if self.cfg.load_in_4bit and self.cfg.embeddings_skip_upcast: @@ -271,11 +308,14 @@ class ModelLoader: ) # Handle DeepSpeed Zero3 - if is_deepspeed_zero3_enabled(): + if ( + is_deepspeed_zero3_enabled() + or os.getenv("ACCELERATE_DEEPSPEED_ZERO_STAGE") == "3" + ): self._set_z3_leaf_modules() # Apply gradient checkpointing if needed - needs_fa2_dtype = self.cfg.adapter or self.cfg.fsdp + needs_fa2_dtype = self.cfg.adapter or self.is_fsdp_enabled if self.cfg.adapter in ["lora", "qlora"]: needs_fa2_dtype = True if self.cfg.gradient_checkpointing: @@ -291,10 +331,12 @@ class ModelLoader: # we need to convert them back to fp16/bf16 for flash-attn compatibility. ( (needs_fa2_dtype or self.cfg.flash_attention or self.cfg.flex_attention) - and not self.qlora_fsdp + and not self.is_qlora_and_fsdp_enabled + ) + or ( + # CCE requires embedding layers to be in fp16/bf16 for backward pass + self.cfg.cut_cross_entropy ) - # CCE requires embedding layers to be in fp16/bf16 for backward pass - or self.cfg.cut_cross_entropy ) if should_convert: @@ -305,6 +347,19 @@ class ModelLoader: before_kbit_train_or_finetune=False, ) + def _configure_qat(self): + """Configure QAT.""" + if self.cfg.qat: + from axolotl.utils.quantization import prepare_model_for_qat + + prepare_model_for_qat( + self.model, + self.cfg.qat.weight_dtype, + self.cfg.qat.group_size, + self.cfg.qat.activation_dtype, + self.cfg.qat.quantize_embedding, + ) + def _load_adapters(self) -> PeftConfig | None: """Load LoRA or other adapters.""" # Load LoRA or adapter @@ -337,7 +392,6 @@ class ModelLoader: and not (self.cfg.rl and self.cfg.load_in_4bit) and not skip_move_to_device ): - # TODO: validate this conditional self.model.to(f"{str(get_device_type())}:{self.cfg.local_rank}") if get_device_count() > 1 and int(os.getenv("WORLD_SIZE", "1")) == 1: @@ -362,6 +416,13 @@ class ModelLoader: gc.collect() torch.cuda.empty_cache() + def _set_parallel_config(self): + """Set parallelism configuration (DP, FSDP, TP, CP) in PartialState/Accelerator""" + parallelism_config, device_mesh = build_parallelism_config(self.cfg) + if parallelism_config: + self.parallelism_config = parallelism_config + self.device_mesh = device_mesh + def _set_auto_model_loader(self): """Set `self.auto_model_loader`. Defaults to `transformers.AutoModelForCausalLM` (set at `__init__`). 
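An aside on the selective cast requested in `_configure_embedding_dtypes` above (and performed by `_convert_embedding_modules_dtype` later in this file): only norm layers and the named embedding modules move to the distributed dtype, while other modules keep their dtype. The sketch below uses illustrative names (`TinyModel`, `cast_norms_and_embeddings`) and is not repository code.

import torch
from torch import nn

def cast_norms_and_embeddings(model, dtype, embedding_names):
    for name, module in model.named_modules():
        if "norm" in name:
            module.to(dtype)
        if any(m in name for m in embedding_names) and hasattr(module, "weight"):
            module.to(dtype)

class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed_tokens = nn.Embedding(10, 8)
        self.input_layernorm = nn.LayerNorm(8)
        self.proj = nn.Linear(8, 8)

model = TinyModel()
cast_norms_and_embeddings(model, torch.bfloat16, ["embed_tokens"])
assert model.embed_tokens.weight.dtype == torch.bfloat16
assert model.input_layernorm.weight.dtype == torch.bfloat16
assert model.proj.weight.dtype == torch.float32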
When using a multimodal model, `self.auto_model_loader` @@ -410,7 +471,17 @@ class ModelLoader: self.model_kwargs["torch_dtype"] = self.cfg.torch_dtype - if not is_deepspeed_zero3_enabled(): + is_ds_zero3 = is_deepspeed_zero3_enabled() + + # FSDP requires control over device placement, so don't set device_map when FSDP is enabled + if self.is_fsdp_enabled: + # For QLoRA + FSDP, we still need to set device_map to "auto" for proper initialization + if self.is_qlora_and_fsdp_enabled: + self.model_kwargs["device_map"] = { + "": int(os.environ.get("LOCAL_RANK", 0)) + } + # For other FSDP cases, don't set device_map at all + elif not is_ds_zero3: self.model_kwargs["device_map"] = device_map cur_device = get_device_type() @@ -432,8 +503,17 @@ class ModelLoader: def _set_quantization_config(self): """Set up quantization config (bitsandbytes, awq, gptq, etc.)""" - self.model_kwargs["load_in_8bit"] = self.cfg.load_in_8bit - self.model_kwargs["load_in_4bit"] = self.cfg.load_in_4bit + + if self.cfg.model_quantization_config == "Mxfp4Config": + from transformers import Mxfp4Config + + mxfp4_kwargs = {} + if self.cfg.model_quantization_config_kwargs: + mxfp4_kwargs = self.cfg.model_quantization_config_kwargs + self.model_kwargs["quantization_config"] = Mxfp4Config(**mxfp4_kwargs) + else: + self.model_kwargs["load_in_8bit"] = self.cfg.load_in_8bit + self.model_kwargs["load_in_4bit"] = self.cfg.load_in_4bit if self.cfg.gptq: if not hasattr(self.model_config, "quantization_config"): @@ -468,7 +548,9 @@ class ModelLoader: self.model_kwargs["quantization_config"] = BitsAndBytesConfig( **self.model_config.quantization_config ) - elif self.cfg.adapter == "qlora" and self.model_kwargs["load_in_4bit"]: + elif self.cfg.adapter == "qlora" and self.model_kwargs.get( + "load_in_4bit", False + ): bnb_config = { "load_in_4bit": True, "llm_int8_threshold": 6.0, @@ -479,11 +561,14 @@ class ModelLoader: "bnb_4bit_quant_storage": torch.bfloat16, } if self.cfg.model_config_type in ["jamba", "qwen2_moe"] and not ( - self.cfg.deepspeed or self.cfg.fsdp + self.cfg.deepspeed or self.is_fsdp_enabled ): # for some reason, this causes the loss to be off by an order of magnitude # but deepspeed needs this still in bfloat16 bnb_config["bnb_4bit_quant_storage"] = torch.float32 + if self.cfg.model_config_type == "falcon_h1": + # output projection cannot be quantized for Falcon-H1 models + bnb_config["llm_int8_skip_modules"] = ["out_proj"] if self.cfg.bnb_config_kwargs: bnb_config.update(self.cfg.bnb_config_kwargs) @@ -491,13 +576,18 @@ class ModelLoader: self.model_kwargs["quantization_config"] = BitsAndBytesConfig( **bnb_config, ) - elif self.cfg.adapter == "lora" and self.model_kwargs["load_in_8bit"]: + elif self.cfg.adapter == "lora" and self.model_kwargs.get( + "load_in_8bit", False + ): bnb_config = { "load_in_8bit": True, } # Exclude mamba blocks from int8 quantization for jamba if self.cfg.model_config_type == "jamba": bnb_config["llm_int8_skip_modules"] = ["mamba"] + if self.cfg.model_config_type == "falcon_h1": + # output projection cannot be quantized for Falcon-H1 models + bnb_config["llm_int8_skip_modules"] = ["out_proj"] self.model_kwargs["quantization_config"] = BitsAndBytesConfig( **bnb_config, ) @@ -509,7 +599,9 @@ class ModelLoader: def _set_attention_config(self): """Sample packing uses custom FA2 patch""" - if self.cfg.flex_attention: + if self.cfg.attn_implementation: + self.model_kwargs["attn_implementation"] = self.cfg.attn_implementation + elif self.cfg.flex_attention: self.model_kwargs["attn_implementation"] = 
"flex_attention" self.model_config._attn_implementation = ( # pylint: disable=protected-access "flex_attention" @@ -536,11 +628,18 @@ class ModelLoader: if self.cfg.low_cpu_mem_usage: self.model_kwargs["low_cpu_mem_usage"] = True - def _configure_zero3_memory_efficient_loading(self): - """Set the deepspeed config to load the model into RAM first before moving - to VRAM. + def _configure_zero3_memory_efficient_loading( + self, + ) -> HfTrainerDeepSpeedConfig | None: + """ + Set the deepspeed config to load the model into RAM first before moving to VRAM. - We need to return `hf_ds_cfg` as it needs to exist before model loading. + IMPORTANT + ========== + + We need to return `hf_ds_cfg` as it needs to exist before model loading for zero3. + HfTrainerDeepSpeedConfig is a class that is used to configure the DeepSpeed training. + It is not passed anywhere in the model loading function, just need to exist. """ hf_ds_cfg = None @@ -571,9 +670,41 @@ class ModelLoader: def _build_model(self) -> bool: """Load model, with load strategy depending on config.""" skip_move_to_device = False + + if self.cfg.tensor_parallel_size > 1: + self.model_kwargs["tp_size"] = self.cfg.tensor_parallel_size + self.model_kwargs["tp_plan"] = "auto" + self.model_kwargs["device_mesh"] = self.device_mesh + if "device_map" in self.model_kwargs: + del self.model_kwargs["device_map"] # not compatible with `tp_plan` + + if self.is_fsdp_enabled: + if self.cfg.fsdp_config.cpu_ram_efficient_loading: + skip_move_to_device = True + # Don't delete device_map for QLoRA + FSDP - it was set correctly in _set_device_map + if ( + "device_map" in self.model_kwargs + and not self.is_qlora_and_fsdp_enabled + ): + del self.model_kwargs["device_map"] + elif self.is_qlora_and_fsdp_enabled: + skip_move_to_device = True + + if ( + self.cfg.tensor_parallel_size <= 1 + and self.cfg.fsdp_config.cpu_ram_efficient_loading + and self.cfg.fsdp_version == 2 + ): + # setting device_map for TP is not supported + local_rank = int(os.getenv("LOCAL_RANK", "0")) + if local_rank == 0: + self.model_kwargs["device_map"] = "cpu" + else: + self.model_kwargs["device_map"] = "meta" + if ( - self.qlora_fsdp - and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading + self.is_qlora_and_fsdp_enabled + and self.cfg.fsdp_config.cpu_ram_efficient_loading and ( self.cfg.model_config_type == "dbrx" or self.cfg.qlora_sharded_model_loading @@ -599,13 +730,8 @@ class ModelLoader: and not self.cfg.trust_remote_code and not self.cfg.gptq ): - # TODO: Do we need to open this up for all models? - if self.cfg.fsdp and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading: - skip_move_to_device = True - if "device_map" in self.model_kwargs: - del self.model_kwargs["device_map"] - - self._configure_zero3_memory_efficient_loading() + # Please don't remove underscore binding without reading the fn docstring. 
+ _ = self._configure_zero3_memory_efficient_loading() # Load model with random initialization if specified if self.cfg.random_init_weights: @@ -657,35 +783,36 @@ class ModelLoader: trust_remote_code=self.cfg.trust_remote_code or False, **self.model_kwargs, ) + elif self.cfg.gptq: + self.model = self.auto_model_loader.from_pretrained( + self.base_model, + config=self.model_config, + trust_remote_code=self.cfg.trust_remote_code or False, + **self.model_kwargs, + ) else: - if self.cfg.gptq: - self.model = self.auto_model_loader.from_pretrained( - self.base_model, - config=self.model_config, - trust_remote_code=self.cfg.trust_remote_code or False, - **self.model_kwargs, - ) - else: - if ( - self.cfg.fsdp - and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading - ): - # disabling either of these two still leads to VRAM spike before setting back down - skip_move_to_device = True - if "device_map" in self.model_kwargs: - del self.model_kwargs["device_map"] - - self._configure_zero3_memory_efficient_loading() - - self.model = self.auto_model_loader.from_pretrained( - self.base_model, - config=self.model_config, - trust_remote_code=self.cfg.trust_remote_code or False, - **self.model_kwargs, - ) + # Please don't remove underscore binding without reading the fn docstring. + _ = self._configure_zero3_memory_efficient_loading() + self.model = self.auto_model_loader.from_pretrained( + self.base_model, + config=self.model_config, + trust_remote_code=self.cfg.trust_remote_code or False, + **self.model_kwargs, + ) if is_deepspeed_zero3_enabled(): skip_move_to_device = True + # pylint: disable=protected-access + if self.cfg.tensor_parallel_size > 1: + # workaround for upstream 4.54.0 not setting _tp_size or _device_mesh + # TODO(wing): remove once 4.54.1 is released + if self.model._tp_size != self.cfg.tensor_parallel_size: + self.model._tp_size = self.cfg.tensor_parallel_size + self.model._device_mesh = self.model_kwargs["device_mesh"] + + if self.cfg.experimental_skip_move_to_device is not None: + skip_move_to_device = self.cfg.experimental_skip_move_to_device + return skip_move_to_device def _set_z3_leaf_modules(self): @@ -718,8 +845,8 @@ class ModelLoader: skip_prepare_model_for_kbit_training = True if ( - self.qlora_fsdp - or (self.cfg.fsdp and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading) + self.is_qlora_and_fsdp_enabled + or (self.is_fsdp_enabled and self.cfg.fsdp_config.cpu_ram_efficient_loading) or is_deepspeed_zero3_enabled() ): # Make sure everything is in the same dtype @@ -741,6 +868,9 @@ class ModelLoader: dist_dtype: torch.dtype, before_kbit_train_or_finetune: bool, ): + dest = {"dtype": dist_dtype} + if self.cfg.lora_on_cpu: + dest["device"] = "cpu" for name, module in self.model.named_modules(): if "norm" in name: module.to(dist_dtype) @@ -751,4 +881,4 @@ class ModelLoader: # don't upcast lm_head for btlm continue if any(m in name for m in embedding_modules) and hasattr(module, "weight"): - module.to(dist_dtype) + module.to(**dest) diff --git a/src/axolotl/loaders/patch_manager.py b/src/axolotl/loaders/patch_manager.py index f251f958d..f1ca3c725 100644 --- a/src/axolotl/loaders/patch_manager.py +++ b/src/axolotl/loaders/patch_manager.py @@ -4,7 +4,6 @@ Applies pre- and post-model load patches for various fixes and optimizations. 
""" import importlib.util -import logging from functools import cached_property import addict @@ -17,8 +16,9 @@ from axolotl.monkeypatch.multipack import ( patch_for_multipack, ) from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) PLUGIN_MANAGER = PluginManager.get_instance() @@ -49,19 +49,43 @@ class PatchManager: def apply_pre_model_load_patches(self): """Apply pre-model load patches based on config.""" + self._apply_transformers_patches() + # self._apply_flex_attention_patches() self._apply_flash_attention_patches() + self._apply_chunked_cross_entropy_patch() self._apply_fsdp_patches() self._apply_adapter_patches() - self._apply_flex_attention_patches() self._apply_model_specific_patches() self._apply_fp8_patches() self._apply_flash_attention_peft_patches() self._apply_gradient_checkpointing_patches() self._patch_attention() self._apply_multipack_patches() + self._patch_loss_llama() self._patch_llama_derived_model() self._apply_mistral_cross_entropy_patch() - self._apply_unsloth_self_attention_patch() + self._apply_self_attention_lora_patch() + self._apply_fsdp2_bnb_patches() + + def apply_post_plugin_pre_model_load_patches(self): + """Apply post plugin-pre_model_load load patches based on config.""" + self._apply_tiled_mlp(self.cfg.model_config_type) + self._apply_voxtral_patches() + + def _apply_transformers_patches(self): + from axolotl.monkeypatch.transformers.trainer_loss_calc import ( + patch_evaluation_loop, + patch_maybe_log_save_evaluate, + ) + + patch_fsdp2 = ( + self.cfg.torch_compile + and self.cfg.fsdp_config + and self.cfg.fsdp_version == 2 + ) + + patch_evaluation_loop(patch_fsdp2) + patch_maybe_log_save_evaluate() def apply_post_model_load_patches(self, model: PreTrainedModel): """Apply patches that require the model instance.""" @@ -77,12 +101,41 @@ class PatchManager: patch_xformers_attn_over_fa2() self.cfg.flash_attention = True + def _apply_chunked_cross_entropy_patch(self): + if self.cfg.chunked_cross_entropy: + from axolotl.monkeypatch.loss.chunked import patch_chunked_ce_loss_fn + + if self.cfg.chunked_cross_entropy_num_chunks: + patch_chunked_ce_loss_fn(self.cfg.chunked_cross_entropy_num_chunks) + else: + patch_chunked_ce_loss_fn() + def _apply_fsdp_patches(self): """Apply patches for FSDP configurations.""" - if self.cfg.fsdp_config and str(self.cfg.fsdp_config.fsdp_version) == "2": - from axolotl.monkeypatch.accelerate.fsdp2 import patch_accelerate_fsdp_utils + if self.cfg.context_parallel_size > 1 or ( + self.cfg.fsdp_config and str(self.cfg.fsdp_version) == "2" + ): + from axolotl.monkeypatch.accelerate.parallelism_config import ( + patch_parallelism_config, + ) - patch_accelerate_fsdp_utils() + patch_parallelism_config() + if self.cfg.fsdp_config and str(self.cfg.fsdp_version) == "2": + from axolotl.monkeypatch.accelerate.fsdp2 import patch_accelerate_fsdp2 + + patch_accelerate_fsdp2() + if self.cfg.rl: + from axolotl.monkeypatch.trainer.trl import patch_trl_prepare_fsdp2 + + patch_trl_prepare_fsdp2() + + # if self.cfg.fsdp_config: + # # see transformers#39152 + # from axolotl.monkeypatch.trainer_fsdp_optim import ( + # patch_training_loop_for_fsdp, + # ) + # + # patch_training_loop_for_fsdp() def _apply_adapter_patches(self): """Apply patches for adapter configurations.""" @@ -94,14 +147,20 @@ class PatchManager: def _apply_flex_attention_patches(self): """Apply patches for flexible attention.""" if self.cfg.flex_attention: - from 
axolotl.monkeypatch.attention.flex_attn import ( - patch_flex_make_mask, - patch_flex_wrapper, - ) + # from axolotl.monkeypatch.attention.flex_attn import ( + # patch_flex_make_mask, + # patch_flex_wrapper, + # ) + # + # flex_attn_compile_kwargs = self.cfg.flex_attn_compile_kwargs or {} + # patch_flex_wrapper(**flex_attn_compile_kwargs) + # patch_flex_make_mask() + if self.cfg.sample_packing: + from axolotl.core.attention.flex_block_mask import ( + patch_create_causal_mask, + ) - flex_attn_compile_kwargs = self.cfg.flex_attn_compile_kwargs or {} - patch_flex_wrapper(**flex_attn_compile_kwargs) - patch_flex_make_mask() + patch_create_causal_mask(self.cfg.model_config_type) def _apply_model_specific_patches(self): """Apply patches specific to model architectures.""" @@ -115,13 +174,6 @@ class PatchManager: patch_llama4_linearized_modeling() - if self.cfg.model_config_type == "gemma3": - from axolotl.monkeypatch.gemma3 import ( - patch_gemma3conditionalgeneration_forward, - ) - - patch_gemma3conditionalgeneration_forward() - def _apply_fp8_patches(self): """Apply patches for FP8 support.""" if self.cfg.fp8: @@ -129,7 +181,9 @@ class PatchManager: patch_create_accelerate_code_for_fp8, ) - patch_create_accelerate_code_for_fp8() + patch_create_accelerate_code_for_fp8( + self.cfg.fp8_enable_fsdp_float8_all_gather + ) def _apply_flash_attention_peft_patches(self): """Apply patches for Flash Attention with PEFT.""" @@ -142,13 +196,19 @@ class PatchManager: def _apply_gradient_checkpointing_patches(self): """Apply patches for gradient checkpointing.""" - if self.cfg.gradient_checkpointing in ["unsloth", "offload"]: + if ( + self.cfg.gradient_checkpointing + and self.cfg.activation_offloading == "legacy" + ): from axolotl.monkeypatch.gradient_checkpointing import ( hf_grad_checkpoint_offload_wrapper, ) transformers.modeling_utils.checkpoint = hf_grad_checkpoint_offload_wrapper - if self.cfg.gradient_checkpointing == "offload_disk": + elif ( + self.cfg.gradient_checkpointing + and self.cfg.activation_offloading == "offload_disk" + ): from axolotl.monkeypatch.gradient_checkpointing import ( hf_grad_checkpoint_disk_offload_wrapper, ) @@ -169,9 +229,20 @@ class PatchManager: patch_mistral_cross_entropy() - def _apply_unsloth_self_attention_patch(self): - """Apply Unsloth self-attention patches if configured.""" - if self.cfg.unsloth_lora_qkv or self.cfg.unsloth_lora_o: + def _apply_self_attention_lora_patch(self): + """Apply self-attention LoRA patches if configured.""" + if self.cfg.lora_qkv_kernel or self.cfg.lora_o_kernel: + # Only patch if conditions are met + can_patch = ( + self.cfg.lora_dropout == 0 + if hasattr(self.cfg, "lora_dropout") + else True + ) # default to True if lora_dropout is not set + + if not can_patch: + LOG.warning("Cannot patch self-attention - requires no dropout") + return + from axolotl.monkeypatch.lora_kernels import patch_self_attn_lora patch_self_attn_lora(self.cfg) @@ -206,19 +277,49 @@ class PatchManager: has_remote_code=has_remote_code, ) - if self.cfg.is_llama_derived_model: - self._patch_loss_llama() + def _apply_fsdp2_bnb_patches(self): + """Apply FSDP2 BNB patches.""" + if ( + self.cfg.fsdp_config + and str(self.cfg.fsdp_version) == "2" + and self.cfg.adapter == "qlora" + ): + from axolotl.monkeypatch.fsdp2_qlora import ( + apply_bnb_torch_function_patch, + apply_init_sharded_param_patch, + apply_init_unsharded_param_patch, + ) + + apply_bnb_torch_function_patch() + apply_init_sharded_param_patch() + apply_init_unsharded_param_patch() + + def _apply_tiled_mlp(self, 
model_type: str): + if self.cfg.tiled_mlp: + from axolotl.monkeypatch.tiled_mlp import ( + patch_tiled_mlp, + ) + + patch_tiled_mlp( + model_type, + use_original_mlp=self.cfg.tiled_mlp_use_original_mlp, + cfg_num_shards=self.cfg.tiled_mlp_num_shards, + ) + + def _apply_voxtral_patches(self): + """Apply patches for Voxtral model.""" + if self.cfg.model_config_type == "voxtral": + from axolotl.monkeypatch.models.voxtral.modeling import ( + patch_voxtral_conditional_generation_forward, + ) + + patch_voxtral_conditional_generation_forward() def _patch_attention(self): """Apply attention-specific patches based on model type.""" if not (self.cfg.flash_attention and hasattr(self.model_config, "model_type")): return - if self.model_config.model_type == "mllama" and self.cfg.flash_attention: - from axolotl.monkeypatch.attention.mllama import patch_mllama - - patch_mllama() - if self.model_config.model_type == "btlm": from axolotl.monkeypatch.btlm_attn_hijack_flash import ( replace_btlm_attn_with_flash_attn, @@ -235,6 +336,9 @@ class PatchManager: def _patch_loss_llama(self): """Patch loss functions and other optimizations for LLaMA models.""" + if not self.cfg.is_llama_derived_model: + return + if self.cfg.flash_attn_cross_entropy and self.has_flash_attn: from axolotl.monkeypatch.llama_attn_hijack_flash import ( patch_fa_llama_cross_entropy, @@ -260,31 +364,21 @@ class PatchManager: patch_self_attn_lora() - def _patch_llama_flash_attention(self, packed=False): + def _patch_llama_flash_attention(self): """Apply Flash Attention patches for LLaMA models.""" from axolotl.monkeypatch.llama_attn_hijack_flash import ( replace_llama_attn_with_flash_attn, ) - if packed: - if self.cfg.device not in ["mps", "cpu"] and not self.inference: - LOG.info("patching with flash attention for sample packing") - replace_llama_attn_with_flash_attn( - packed=True, - cross_entropy=self.cfg.flash_attn_cross_entropy, - rms_norm=self.cfg.flash_attn_rms_norm, - ) - elif self.cfg.s2_attention: + if self.cfg.s2_attention: LOG.info("patching w/ flash-enabled, shifted-sparse attention") replace_llama_attn_with_flash_attn( - packed=False, cross_entropy=self.cfg.flash_attn_cross_entropy, rms_norm=self.cfg.flash_attn_rms_norm, use_shifted_sparse_attn=True, ) elif self.cfg.flash_attn_cross_entropy or self.cfg.flash_attn_rms_norm: replace_llama_attn_with_flash_attn( - packed=False, cross_entropy=self.cfg.flash_attn_cross_entropy, rms_norm=self.cfg.flash_attn_rms_norm, ) @@ -314,10 +408,8 @@ class PatchManager: and (self.cfg.flash_attention or self.cfg.flex_attention) and self.cfg.sample_packing ): - self._patch_loss_llama() - if self.cfg.flash_attention: - self._patch_llama_flash_attention(packed=self.cfg.sample_packing) + self._patch_llama_flash_attention() elif self.cfg.xformers_attention: self._patch_llama_xformers_attention() elif self.cfg.sample_packing: @@ -340,17 +432,12 @@ class PatchManager: from axolotl.monkeypatch.llama_attn_hijack_flash import ( is_xformers_swiglu_available, replace_llama_mlp_with_swiglu, - replace_llama_qkv_with_fused, ) if self.cfg.flash_attn_fuse_mlp and is_xformers_swiglu_available(): LOG.info("Patching with SwiGLU...") replace_llama_mlp_with_swiglu(model) - if self.cfg.flash_attn_fuse_qkv: - LOG.info("Patching with fused QKV...") - replace_llama_qkv_with_fused(model) - def _apply_unsloth_patches(self, model): """Apply unsloth optimization patches.""" if self.cfg.unsloth_lora_mlp: diff --git a/src/axolotl/loaders/processor.py b/src/axolotl/loaders/processor.py index 57394bc67..2e3ec8d7f 100644 --- 
a/src/axolotl/loaders/processor.py +++ b/src/axolotl/loaders/processor.py @@ -1,6 +1,5 @@ """Processor loading functionality for multi-modal models""" -import logging from typing import Any import transformers @@ -10,8 +9,9 @@ from transformers import ( ) from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase): diff --git a/src/axolotl/loaders/tokenizer.py b/src/axolotl/loaders/tokenizer.py index ec9d69e8a..0a486d023 100644 --- a/src/axolotl/loaders/tokenizer.py +++ b/src/axolotl/loaders/tokenizer.py @@ -1,26 +1,28 @@ """Tokenizer loading functionality and associated utils""" import json -import logging import os import transformers from transformers import ( AddedToken, AutoTokenizer, + PreTrainedTokenizer, ) from axolotl.integrations.base import PluginManager from axolotl.loaders.utils import get_linear_embedding_layers, load_model_config from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN from axolotl.utils.chat_templates import get_chat_template_from_config +from axolotl.utils.dict import DictDefault from axolotl.utils.distributed import ( barrier, is_local_main_process, is_main_process, ) +from axolotl.utils.logging import get_logger -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) PLUGIN_MANAGER = PluginManager.get_instance() @@ -117,8 +119,26 @@ def modify_tokenizer_files( return tokenizer_dir -def load_tokenizer(cfg): +def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer: """Load and configure the tokenizer based on the provided config.""" + + def _load_mistral_common_tokenizer(cfg: DictDefault): + """Load mistral-common tokenizer""" + from transformers import tokenization_mistral_common + + from axolotl.utils.mistral import HFMistralTokenizer + + # patch + tokenization_mistral_common.MistralCommonTokenizer = HFMistralTokenizer + + # Load the HF-compatible wrapper around MistralTokenizer + tokenizer = HFMistralTokenizer.from_pretrained(cfg.tokenizer_config) + + return tokenizer + + if cfg.tokenizer_use_mistral_common: + return _load_mistral_common_tokenizer(cfg) + model_config = load_model_config(cfg) tokenizer_kwargs = {} use_fast = True # this is the default @@ -173,7 +193,8 @@ def load_tokenizer(cfg): tokenizer.padding_side = "left" # Qwen base only has single token, so we need to set the special tokens - if cfg.is_qwen_derived_model: + # the following check is for Qwen1 base models + if cfg.is_qwen_derived_model and hasattr(tokenizer, "eod_id"): token_ids = ["bos_token_id", "eos_token_id", "pad_token_id", "unk_token_id"] for attr_name in token_ids: if getattr(tokenizer, attr_name) is None: @@ -207,11 +228,12 @@ def load_tokenizer(cfg): ) and k != "pad_token" ): - lora_modules_to_save = ", ".join( + lora_modules_to_save_str = ", ".join( [f"`{x}`" for x in lora_modules_to_save] ) raise ValueError( - f"Please set lora_modules_to_save to [{lora_modules_to_save}] when using an adapter and changing the special tokens." + f"Please set lora_modules_to_save to [{lora_modules_to_save_str}] " + "when using an adapter and changing the special tokens." 
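The error raised above fires when an adapter run changes special tokens without marking the embedding layers trainable. As a hedged illustration of what the message asks for on the PEFT side — module names like `embed_tokens` and `lm_head` are model-dependent examples, and axolotl derives the real ones via `get_linear_embedding_layers()` rather than hard-coding them:

```python
from peft import LoraConfig

# Illustrative only: which modules carry the embeddings differs per architecture.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    # full-precision copies kept trainable so the new special tokens get usable embeddings
    modules_to_save=["embed_tokens", "lm_head"],
)
```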
) tokenizer.add_special_tokens( @@ -257,7 +279,7 @@ def load_tokenizer(cfg): {"additional_special_tokens": additional_special_tokens} ) - if is_main_process(use_environ=True): + if is_main_process(): LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}") LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}") LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}") @@ -278,4 +300,9 @@ def load_tokenizer(cfg): LOG.info( "No Chat template selected. Consider adding a chat template for easier inference." ) + + # make the tokenizer.pad call quieter 🤐 + if hasattr(tokenizer, "deprecation_warnings"): + tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True + return tokenizer diff --git a/src/axolotl/loaders/utils.py b/src/axolotl/loaders/utils.py index 1aae4834d..240e00da7 100644 --- a/src/axolotl/loaders/utils.py +++ b/src/axolotl/loaders/utils.py @@ -1,7 +1,6 @@ """Utilities for axolotl.loaders module""" import contextlib -import logging from typing import Type import addict @@ -9,8 +8,9 @@ import torch from transformers import AutoConfig, PretrainedConfig, PreTrainedModel from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) def get_module_class_from_name( @@ -131,6 +131,17 @@ def check_model_config(cfg: DictDefault, model_config: PretrainedConfig): f"Please include [{lora_modules_to_save_joined}] in `lora_modules_to_save`." ) + if ( + cfg.tensor_parallel_size + and cfg.tensor_parallel_size > 1 + and hasattr(model_config, "tie_word_embeddings") + and model_config.tie_word_embeddings + ): + raise ValueError( + "Tensor parallelism is incompatible with models configured with `tie_word_embeddings` enabled. " + "Please use a model without `tie_word_embeddings`, or disable tensor parallelism." + ) + def load_model_config(cfg: DictDefault) -> PretrainedConfig | addict.Dict: """Loads and configures a model configuration from HuggingFace or local sources. @@ -195,9 +206,11 @@ def ensure_dtype(model: PreTrainedModel, dtype: torch.dtype = torch.bfloat16): bias_mismatch = module.bias.dtype != dtype if weight_mismatch: - print(f"Converting module {name}.weight: {module.weight.dtype} -> {dtype}") + LOG.debug( + f"Converting module {name}.weight: {module.weight.dtype} -> {dtype}" + ) if bias_mismatch: - print(f"Converting module {name}.bias: {module.bias.dtype} -> {dtype}") + LOG.debug(f"Converting module {name}.bias: {module.bias.dtype} -> {dtype}") if weight_mismatch or bias_mismatch: module.to(dtype) diff --git a/src/axolotl/logging_config.py b/src/axolotl/logging_config.py index 2ddf89a8c..10c5ae9dc 100644 --- a/src/axolotl/logging_config.py +++ b/src/axolotl/logging_config.py @@ -2,14 +2,64 @@ Common logging module for axolotl """ +import logging import os import sys -from logging import Formatter +from logging import Formatter, Logger, LogRecord from logging.config import dictConfig from typing import Any, Dict from colorama import Fore, Style, init +DEFAULT_AXOLOTL_LOG_LEVEL = "INFO" +DEFAULT_LOG_LEVEL = "WARNING" + + +class AxolotlOrWarnErrorFilter(logging.Filter): + """ + Allows ANY WARNING or higher (unless overridden by LOG_LEVEL) + Allows axolotl.* at INFO or higher (unless overridden by AXOLOTL_LOG_LEVEL) + Drops all other records (i.e. non-axolotl.INFO, DEBUG, etc. 
by default) + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + axolotl_log_level = os.getenv( + "AXOLOTL_LOG_LEVEL", DEFAULT_AXOLOTL_LOG_LEVEL + ).upper() + other_log_level = os.getenv("LOG_LEVEL", DEFAULT_LOG_LEVEL).upper() + + try: + # py311+ only + level_mapping = logging.getLevelNamesMapping() + self.axolotl_level = level_mapping[axolotl_log_level] + self.other_level = level_mapping[other_log_level] + except AttributeError: + # For py310, use getLevelName directly + self.axolotl_level = logging.getLevelName(axolotl_log_level) + self.other_level = logging.getLevelName(other_log_level) + + def filter(self, record: LogRecord) -> bool: + # General filter + if record.levelno >= self.other_level: + return True + + # Axolotl filter + return ( + record.name.startswith("axolotl") and record.levelno >= self.axolotl_level + ) + + +class AxolotlLogger(Logger): + """A Logger that automatically rejects non-axolotl INFOs.""" + + def __init__(self, name: str, level: int = logging.NOTSET): + super().__init__(name, level) + + # set global filter on the logger itself + self.addFilter(AxolotlOrWarnErrorFilter()) + class ColorfulFormatter(Formatter): """ @@ -55,11 +105,15 @@ DEFAULT_LOGGING_CONFIG: Dict[str, Any] = { "stream": sys.stdout, }, }, - "root": {"handlers": ["console"], "level": os.getenv("LOG_LEVEL", "INFO")}, + # log level will be superseded by the AxolotlLogger + "root": { + "handlers": ["console"], + "level": os.getenv("LOG_LEVEL", DEFAULT_LOG_LEVEL), + }, "loggers": { "axolotl": { "handlers": ["color_console"], - "level": "DEBUG", + "level": os.getenv("AXOLOTL_LOG_LEVEL", DEFAULT_AXOLOTL_LOG_LEVEL).upper(), "propagate": False, }, }, @@ -70,3 +124,8 @@ def configure_logging(): """Configure with default logging""" init() # Initialize colorama dictConfig(DEFAULT_LOGGING_CONFIG) + logging.setLoggerClass(AxolotlLogger) + + # set default `ACCELERATE_LOG_LEVEL` to `LOG_LEVEL` if available and not set + if "ACCELERATE_LOG_LEVEL" not in os.environ: + os.environ["ACCELERATE_LOG_LEVEL"] = os.getenv("LOG_LEVEL", DEFAULT_LOG_LEVEL) diff --git a/src/axolotl/monkeypatch/accelerate/fsdp2.py b/src/axolotl/monkeypatch/accelerate/fsdp2.py index d8ec00c69..efc388294 100644 --- a/src/axolotl/monkeypatch/accelerate/fsdp2.py +++ b/src/axolotl/monkeypatch/accelerate/fsdp2.py @@ -1,63 +1,382 @@ """ -monkeypatch for accelerate fsdp2 fix when modifying ordereddict during interation +monkeypatch for accelerate fsdp2 fix when modifying ordereddict during interation, and saving full state dicts """ -import logging +import copy +import functools import sys import torch +import torch.distributed as dist +from torch import nn -LOG = logging.getLogger(__name__) +from axolotl.utils.bench import log_gpu_memory_usage +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) -def fsdp2_load_full_state_dict(accelerator, model: torch.nn.Module, full_sd: dict): +def fsdp2_load_full_state_dict( + _accelerator, model: torch.nn.Module, full_sd: dict, offload_to_cpu: bool = False +): """ Loads the full state dict (could be only on rank 0) into the sharded model. This is done by broadcasting the parameters from rank 0 to all other ranks. This function modifies the model in-place. 
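As a rough sketch of the broadcast-and-shard step this function performs: only rank 0 holds real data, the other ranks supply a correctly shaped placeholder, and `distribute_tensor` with `src_data_rank=0` (available in recent PyTorch, as used by the loader here) scatters rank 0's data into a sharded DTensor. The mesh below is a toy 1-D mesh, not the model's real device mesh, and a process group is assumed to be initialized already (e.g. under torchrun).

```python
import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Shard, distribute_tensor

# Assumes dist.init_process_group(...) has already run.
mesh = init_device_mesh("cuda", (dist.get_world_size(),))

if dist.get_rank() == 0:
    full = torch.randn(1024, 1024, device="cuda", dtype=torch.bfloat16)
else:
    # Non-main ranks only need a placeholder with matching shape and dtype.
    full = torch.empty(1024, 1024, device="cuda", dtype=torch.bfloat16)

# src_data_rank=0 scatters rank 0's data instead of assuming every rank
# already holds an identical copy of the full tensor.
sharded = distribute_tensor(full, mesh, [Shard(0)], src_data_rank=0)
```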
- Args: accelerator (`Accelerator`): The accelerator instance - model (`torch.nn.Module`): The model to load the state dict into + model (`torch.nn.Module`): + The model to load the state dict into, expected to be on meta device or a VRAM spike can occur full_sd (`dict`): The full state dict to load, can only be on rank 0 """ - import torch.distributed as dist from torch.distributed.tensor import distribute_tensor LOG.info("Broadcasting full state dict to all ranks...") - sharded_sd = model.state_dict() - param_names = sorted(sharded_sd.keys()) - for param_name in param_names: - mesh = sharded_sd[param_name].device_mesh - if accelerator.is_main_process: - # Use the corresponding tensor from full_sd (assuming the key exists in full_sd) - full_param = full_sd[param_name].detach().cuda() - dist.broadcast(full_param, src=0, group=mesh.get_group()) - sharded_tensor = distribute_tensor( - full_param, mesh, sharded_sd[param_name].placements + import time + + start_time = time.time() + + meta_sharded_sd = model.state_dict() + sharded_sd = {} + for param_name, sharded_meta_param in meta_sharded_sd.items(): + full_tensor = None + if _accelerator.is_main_process: + full_tensor = full_sd[param_name] + full_tensor = full_tensor.to(sharded_meta_param.dtype) + + if hasattr(sharded_meta_param, "device_mesh"): + device_mesh = sharded_meta_param.device_mesh + if _accelerator.is_main_process: + full_tensor = full_tensor.to(device_mesh.device_type) + else: + full_tensor = torch.empty( + sharded_meta_param.size(), + device=device_mesh.device_type, + dtype=sharded_meta_param.dtype, + ) + sharded_param = distribute_tensor( + full_tensor, + device_mesh, + sharded_meta_param.placements, + src_data_rank=0, ) - sharded_sd[param_name] = sharded_tensor else: - # Prepare a tensor of matching shape and dtype - full_tensor = torch.empty( - sharded_sd[param_name].size(), - device="cuda", - dtype=sharded_sd[param_name].dtype, - ) - dist.broadcast(full_tensor, src=0, group=mesh.get_group()) - sharded_tensor = distribute_tensor( - full_tensor, mesh, sharded_sd[param_name].placements - ) - sharded_sd[param_name] = sharded_tensor + # Non-sharded parameters + if _accelerator.is_main_process: + sharded_param = full_tensor.to(torch.device("cuda")) + else: + # broadcast manually + sharded_param = torch.empty_like( + sharded_meta_param, + device=torch.device("cuda"), + dtype=sharded_meta_param.dtype, + ) + dist.broadcast(sharded_param, src=0) - model.load_state_dict(sharded_sd, assign=True) + if offload_to_cpu: + sharded_param = sharded_param.cpu() + sharded_sd[param_name] = nn.Parameter(sharded_param) -def patch_accelerate_fsdp_utils(): - from accelerate.utils import fsdp_utils + del full_tensor + full_sd[param_name] = None - fsdp_utils.fsdp2_load_full_state_dict = fsdp2_load_full_state_dict - setattr( - sys.modules["accelerate.utils.fsdp_utils"], - "fsdp2_load_full_state_dict", - fsdp2_load_full_state_dict, + model.load_state_dict(sharded_sd, assign=True, strict=True) + end_time = time.time() + LOG.debug( + f"Time taken to load full state dict: {(end_time - start_time):.2f} seconds" + ) + log_gpu_memory_usage(LOG, "Memory usage after broadcasting full state dict", 0) + return model + + +def get_state_dict(self, model, unwrap=True): + """ + Returns the state dictionary of a model sent through [`Accelerator.prepare`] potentially without full + precision. 
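The FSDP2 branch of this method (further below) gathers each sharded parameter with `DTensor.full_tensor()`, a collective every rank must enter, and keeps the CPU copy only on rank 0. A stripped-down sketch of that pattern, assuming the model's parameters are DTensors produced by `fully_shard`:

```python
import torch
import torch.distributed as dist

def gather_full_state_dict(model: torch.nn.Module) -> dict:
    """Collect a full (unsharded) state dict on rank 0 from an FSDP2-wrapped model."""
    state_dict = {}
    for name, param in model.state_dict().items():
        if param.is_cpu:
            param = param.to(torch.device("cuda"))
        full = param.full_tensor()  # collective: all ranks must call this
        if dist.get_rank() == 0:
            state_dict[name] = full.cpu()
    dist.barrier()
    return state_dict
```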
+ + Args: + model (`torch.nn.Module`): + A PyTorch model sent through [`Accelerator.prepare`] + unwrap (`bool`, *optional*, defaults to `True`): + Whether to return the original underlying state_dict of `model` or to return the wrapped state_dict + + Returns: + `dict`: The state dictionary of the model potentially without full precision. + + Example: + + ```python + >>> import torch + >>> from accelerate import Accelerator + + >>> accelerator = Accelerator() + >>> net = torch.nn.Linear(2, 2) + >>> net = accelerator.prepare(net) + >>> state_dict = accelerator.get_state_dict(net) + ``` + """ + from accelerate import DistributedType + from accelerate.utils import compare_versions + + if self.distributed_type == DistributedType.DEEPSPEED: + zero3_sharding = self.deepspeed_config["zero_optimization"]["stage"] == 3 + tp_sharding = ( + self.deepspeed_config.get("tensor_parallel", {}).get("autotp_size", 0) > 1 + ) + if zero3_sharding or tp_sharding: + if model.zero_gather_16bit_weights_on_model_save(): + if tp_sharding and not compare_versions("deepspeed", ">=", "0.16.4"): + raise ImportError( + "Deepspeed TP requires deepspeed >= 0.16.4, Please update DeepSpeed via `pip install deepspeed -U`." + ) + state_dict = ( + model._consolidated_16bit_state_dict() # pylint: disable=protected-access + if tp_sharding + else model._zero3_consolidated_16bit_state_dict() # pylint: disable=protected-access + ) + else: + raise ValueError( + "Cannot get 16bit model weights because `stage3_gather_16bit_weights_on_model_save` in DeepSpeed config is False. " + "To save the model weights in 16bit, set `stage3_gather_16bit_weights_on_model_save` to True in DeepSpeed config file or " + "set `zero3_save_16bit_model` to True when using `accelerate config`. " + "To save the full checkpoint, run `model.save_checkpoint(save_dir)` and use `zero_to_fp32.py` to recover weights." + ) + else: + from deepspeed.checkpoint.utils import clone_tensors_for_torch_save + + state_dict = clone_tensors_for_torch_save( + self.unwrap_model(model).state_dict() + ) + elif self.is_fsdp2: + # https://github.com/pytorch/torchtune/blob/main/torchtune/training/_distributed.py#L465 + state_dict = {} + sharded_state_dict = model.state_dict() + for param_name, param in sharded_state_dict.items(): + if param.is_cpu: + param = param.to(torch.device("cuda")) + + param = param.full_tensor() + if torch.distributed.get_rank() == 0: + state_dict[param_name] = param.cpu() + torch.distributed.barrier() + elif self.distributed_type == DistributedType.FSDP: + from torch.distributed.fsdp import FullStateDictConfig + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + from torch.distributed.fsdp import StateDictType + + full_state_dict_config = FullStateDictConfig( + offload_to_cpu=True, rank0_only=True + ) + with FSDP.state_dict_type( + model, StateDictType.FULL_STATE_DICT, full_state_dict_config + ): + state_dict = model.state_dict() + else: + if unwrap: + model = self.unwrap_model(model) + state_dict = model.state_dict() + + return state_dict + + +def _process_lora_module_for_fsdp(module, fsdp2_kwargs): + """Helper function to process LoRA modules for FSDP2.""" + from torch.distributed.fsdp import fully_shard + + log_bias_dtype_mismatch = False + + # Linear4Bit will keep it's bias term in fp32. If the weight dtype is in bf16 we are not able to + # wrap this. 
Therefore we must ensure the bias has the same dtype as the weight + if module.base_layer.bias is not None: + if module.base_layer.weight.dtype != module.base_layer.bias.dtype: + log_bias_dtype_mismatch = True + module.base_layer.bias.data = module.base_layer.bias.data.to( + module.base_layer.weight.dtype + ) + + for active_adapter in module.active_adapters: + if module.lora_A: + fully_shard(module.lora_A[active_adapter], **fsdp2_kwargs) + if module.lora_B: + fully_shard(module.lora_B[active_adapter], **fsdp2_kwargs) + if module.lora_embedding_A: + fully_shard(module.lora_embedding_A[active_adapter], **fsdp2_kwargs) + if module.lora_embedding_B: + fully_shard(module.lora_embedding_B[active_adapter], **fsdp2_kwargs) + if module.lora_magnitude_vector: + fully_shard(module.lora_magnitude_vector[active_adapter], **fsdp2_kwargs) + return log_bias_dtype_mismatch + + +def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module: + """Prepares the model for FSDP2 in-place. Also returns the model to avoid misuse of the original model. + + Args: + accelerator (`Accelerator`): The accelerator instance + model (`torch.nn.Module`): The model to prepare + + Returns: + `torch.nn.Module`: Prepared model + """ + from accelerate.utils import get_module_children_bottom_up, is_compiled_module + from accelerate.utils.fsdp_utils import fsdp2_prepare_auto_wrap_policy + from accelerate.utils.modeling import get_non_persistent_buffers + from peft import PeftModel + from peft.tuners.lora import LoraLayer + from torch.distributed.fsdp import ( + CPUOffloadPolicy, + FSDPModule, + MixedPrecisionPolicy, + fully_shard, + ) + + is_type_fsdp = isinstance(model, FSDPModule) or ( + is_compiled_module(model) + and isinstance(model._orig_mod, FSDPModule) # pylint: disable=protected-access + ) + if is_type_fsdp: + return model + + fsdp2_plugin = accelerator.state.fsdp_plugin + + original_sd = model.state_dict() + + from torch.distributed.fsdp.wrap import ( + size_based_auto_wrap_policy, + transformer_auto_wrap_policy, + ) + + # We need the `auto_wrap_policy` original type to create a custom poilicy function for sharding + # This is because `fully_shard` doesn't support old auto wrap policies, rather we have to imitate the behaviour + if fsdp2_plugin.auto_wrap_policy is transformer_auto_wrap_policy: + pass # auto_wrap_policy_type = "transformer" + elif fsdp2_plugin.auto_wrap_policy is size_based_auto_wrap_policy: + pass # auto_wrap_policy_type = "size" + + # We set `auto_wrap_policy` to `functools.partial` to avoid creating it again + # This is because of `apply_activation_checkpointing` which will can reuse this function + fsdp2_plugin.set_auto_wrap_policy(model) + + if fsdp2_plugin.activation_checkpointing: + from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( + CheckpointImpl, + apply_activation_checkpointing, + checkpoint_wrapper, + ) + + # Apply activation checkpointing before applying `fully_shard` + apply_activation_checkpointing( + model, + checkpoint_wrapper_fn=functools.partial( + checkpoint_wrapper, + checkpoint_impl=CheckpointImpl.NO_REENTRANT, + ), + auto_wrap_policy=fsdp2_plugin.auto_wrap_policy, + ) + + mesh = getattr(accelerator.state, "device_mesh", None) + + fsdp2_kwargs = { + "reshard_after_forward": fsdp2_plugin.reshard_after_forward, + "offload_policy": fsdp2_plugin.cpu_offload, + # `fully_shard` doesn't accept `None` in case of `MixedPrecisionPolicy` + "mp_policy": fsdp2_plugin.mixed_precision_policy or MixedPrecisionPolicy(), + "mesh": ( + 
mesh[tuple(accelerator.state.parallelism_config.fsdp_dim_names)] + if mesh is not None + else None + ), + } + model_has_params4bit = False + for _, param in model.named_parameters(): + # this is a temporary fix whereby loading models with bnb params cannot be moved from + # GPU to a meta device due with FSDP2 because torch operations don't return the original class type + # bypassing the move to meta will still cause the VRAM spike, but at least it still will load + if param.__class__.__name__ == "Params4bit": + model_has_params4bit = True + break + + if fsdp2_plugin.cpu_ram_efficient_loading and not model_has_params4bit: + # Context: `fully_shard` moves the model to GPU if it was on CPU, however it can also be on `meta` and then it stays there even after `fully_shard` + # For this reason, we need to move the model to `meta` device, as then sharding happens on `meta` device + # If we kept the model on CPU (`cpu_ram_efficient_loading` has model be on CPU on all ranks, though non-main ranks only have `torch.emtpy`), `fully_shard` would move it to GPU + # Afterwards, when we call `fsdp2_load_full_state_dict`, us creating the state_dict would result into briefly having two copies of model state_dict on the GPU -> VRAM spike + + # We need to keep the original non-persistent buffers, as those MAY not be in the state_dict, resulting in them staying on meta device + # Also, these buffers aren't getting sharded by default + # We get the FQNs of all non-persistent buffers, to re-register them after + non_persistent_buffer_fqns = get_non_persistent_buffers( + model, recurse=True, fqns=True + ) + original_non_persistent_buffers = copy.deepcopy( + {k: v for k, v in model.named_buffers() if k in non_persistent_buffer_fqns} + ) + # We move the model to meta device, as then sharding happens on meta device + model = model.to(torch.device("meta")) + # We need to re-tie the weights, not exactly sure why, but if we don't do this, reference to `lm_head/embed_tokens` stay hanging -> more VRAM usage + # We assume `transformers` models have a `tie_weights` method if they support it + if hasattr(model, "tie_weights"): + model.tie_weights() + + is_peft_model = isinstance(model, PeftModel) + + auto_wrap_policy = fsdp2_prepare_auto_wrap_policy(fsdp2_plugin, model) + log_bias_dtype_mismatch = False + if auto_wrap_policy is not None: + for module in get_module_children_bottom_up(model)[:-1]: + if is_peft_model and isinstance(module, LoraLayer): + module_log_bias_mismatch = _process_lora_module_for_fsdp( + module, fsdp2_kwargs + ) + log_bias_dtype_mismatch |= module_log_bias_mismatch + if auto_wrap_policy(module) and not isinstance(module, FSDPModule): + fully_shard(module, **fsdp2_kwargs) + + fully_shard(model, **fsdp2_kwargs) + + if log_bias_dtype_mismatch: + LOG.warning( + "Bias dtype mismatch detected in LoRA base linear layer. Bias parameters have been cast to weight dtype." + ) + + if fsdp2_plugin.cpu_ram_efficient_loading: + offload_to_cpu = isinstance(fsdp2_plugin.cpu_offload, CPUOffloadPolicy) + fsdp2_load_full_state_dict( + accelerator, model, original_sd, offload_to_cpu=offload_to_cpu + ) + + if fsdp2_plugin.cpu_ram_efficient_loading and not model_has_params4bit: + # We re-register the buffers, as they may not be in the state_dict + for fqn, buffer_tensor in original_non_persistent_buffers.items(): + buffer_tensor = buffer_tensor.to(accelerator.device) + + if "." 
in fqn: + parent_fqn, local_buffer_name = fqn.rsplit(".", 1) + parent_module = model.get_submodule(parent_fqn) + else: + local_buffer_name = fqn + parent_module = model + + parent_module.register_buffer( + local_buffer_name, buffer_tensor, persistent=False + ) + + # We need to tie the weights again, as call to `load_full_state_dict` breaks the tie + # Needs to be called both here and above + # removing this call makes the have slightly different loss + # removing the call above leads to extra memory usage as explained in the comment above + if hasattr(model, "tie_weights"): + model.tie_weights() + return model + + +def patch_accelerate_fsdp2(): + import accelerate + + accelerate.accelerator.fsdp2_prepare_model = fsdp2_prepare_model + accelerate.Accelerator.get_state_dict = get_state_dict + setattr( + sys.modules["accelerate"], + "Accelerator.get_state_dict", + get_state_dict, ) diff --git a/src/axolotl/monkeypatch/accelerate/parallelism_config.py b/src/axolotl/monkeypatch/accelerate/parallelism_config.py new file mode 100644 index 000000000..e3cafc87d --- /dev/null +++ b/src/axolotl/monkeypatch/accelerate/parallelism_config.py @@ -0,0 +1,77 @@ +""" +workaround to allow parallelism config for pure CP +""" + +# pylint: disable=protected-access +import os +import warnings + +from accelerate import DistributedType + + +def _validate_accelerator(self, accelerator): + _warnings = set() + if not accelerator.multi_device and self.total_size == 1: + # No distributed setup, valid parallelism config + return + + # We need this to ensure DDP works + if self.total_size == 1: + self._set_size("dp_replicate", accelerator.num_processes) + + if self.total_size != accelerator.num_processes: + raise ValueError( + f"ParallelismConfig total_size ({self.total_size}) does not match " + f"num_processes ({accelerator.num_processes}). Please adjust dp_replicate_size/ " + f"dp_shard_size/tp_size/cp_size." + ) + + # allow parallelism config when not using fsdp if using pure context parallelism + allow_parallelism_config = False + + if ( + self.cp_size > 1 # pylint: disable=chained-comparison + and self.dp_shard_size <= 1 + and os.environ.get("ACCELERATE_ALLOW_CP_STANDALONE", "false").lower() == "true" + ): + allow_parallelism_config = True + + if ( + self.total_size > 1 + and not allow_parallelism_config + and not (accelerator.is_fsdp2 or accelerator.multi_device) + ): + raise ValueError( + f"ParallelismConfig is only compatible DistributedType.FSDP (version 2) or DistributedType.Multi{{Device}}, but got {accelerator.distributed_type}." + ) + + for parallelism, size in self._sizes.items(): + if size == 1 and getattr(self, f"{parallelism}_handler", None) is not None: + _warnings.add( + f"ParallelismConfig.{parallelism}_handler is set, but {parallelism}_size is set to 1. This handler will be ignored." + ) + + if _warnings and accelerator.is_main_process: + warnings.warn( + "ParallelismConfig has the following warnings:\n" + "\n".join(_warnings), + UserWarning, + ) + + +def patched_is_fsdp2(self) -> bool: + """ + Patched version of is_fsdp2 that guards against a None fsdp_plugin. 
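A hedged usage sketch of the override above: a standalone context-parallel layout (cp_size > 1 with no FSDP sharding) is gated behind the `ACCELERATE_ALLOW_CP_STANDALONE` environment variable, and the patch must be installed via `patch_parallelism_config()` (defined just below) before `Accelerator` validates its `ParallelismConfig`. Only the flag value and call ordering are taken from the code here; everything else is illustrative.

```python
import os

# Must be set before ParallelismConfig validation runs.
os.environ["ACCELERATE_ALLOW_CP_STANDALONE"] = "true"

from axolotl.monkeypatch.accelerate.parallelism_config import patch_parallelism_config

# Installs the relaxed _validate_accelerator and the None-safe is_fsdp2 property.
patch_parallelism_config()
```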
+ """ + # The new logic checks if fsdp_plugin exists before accessing its attributes + return ( + self.distributed_type == DistributedType.FSDP + and self.fsdp_plugin + and self.fsdp_plugin.fsdp_version == 2 + ) + + +def patch_parallelism_config(): + from accelerate.accelerator import AcceleratorState, ParallelismConfig + + ParallelismConfig._validate_accelerator = _validate_accelerator + AcceleratorState.is_fsdp2 = property(patched_is_fsdp2) diff --git a/src/axolotl/monkeypatch/attention/flex_attn.py b/src/axolotl/monkeypatch/attention/flex_attn.py index 3652a30b3..98aead832 100644 --- a/src/axolotl/monkeypatch/attention/flex_attn.py +++ b/src/axolotl/monkeypatch/attention/flex_attn.py @@ -6,6 +6,10 @@ from typing import Optional, Tuple, Union import torch import transformers +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) + def patch_flex_wrapper(**flex_attn_compile_kwargs): # TODO remove this patch when transformers#37285 is merged and in a release @@ -46,10 +50,15 @@ def patch_flex_wrapper(**flex_attn_compile_kwargs): # cause errors. The suggested fix is to compile with "max-autotune-no-cudagraphs" # see https://github.com/pytorch/pytorch/issues/146260 for training self.training = training + LOG.info( + "Compiling flex attention with kwargs: %s. This may take a while...", + flex_attn_compile_kwargs, + ) self._compiled_flex_attention = torch.compile( flex_attention, **flex_attn_compile_kwargs, ) + LOG.info("Flex attention compiled successfully.") self._is_flex_compiled = True def __call__(self): diff --git a/src/axolotl/monkeypatch/attention/mllama.py b/src/axolotl/monkeypatch/attention/mllama.py deleted file mode 100644 index c9e8fb5e1..000000000 --- a/src/axolotl/monkeypatch/attention/mllama.py +++ /dev/null @@ -1,230 +0,0 @@ -""" -Monkeypatch for Vision Llama for FA2 support -""" - -# pylint: disable=duplicate-code - -from typing import Optional, Tuple - -import torch -from flash_attn.flash_attn_interface import flash_attn_func -from transformers.cache_utils import Cache -from transformers.modeling_flash_attention_utils import _flash_attention_forward -from transformers.models.mllama.configuration_mllama import MllamaTextConfig -from transformers.models.mllama.modeling_mllama import ( - MllamaTextCrossAttention, - MllamaTextSelfAttention, - apply_rotary_pos_emb, - repeat_kv, -) -from transformers.utils import is_flash_attn_greater_or_equal_2_10 - - -class MllamaTextCrossFlashAttention2(MllamaTextCrossAttention): - """ - Mllama flash cross-attention module. This module inherits from `MllamaTextCrossAttention` and - implements the forward pass using Flash Attention for improved performance. 
- """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # Check if flash attention version is greater or equal to 2.1 - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - cross_attention_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Cache] = None, - attention_mask: Optional[ # pylint: disable=unused-argument - torch.Tensor - ] = None, - output_attentions: bool = False, - use_cache: bool = False, # pylint: disable=unused-argument - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - query_states = query_states.view( - bsz, q_len, self.num_heads, self.head_dim - ).transpose(1, 2) - query_states = self.q_norm(query_states) - - if cross_attention_states is not None: - key_states = self.k_proj(cross_attention_states) - value_states = self.v_proj(cross_attention_states) - key_states = key_states.view( - bsz, -1, self.num_key_value_heads, self.head_dim - ).transpose(1, 2) - value_states = value_states.view( - bsz, -1, self.num_key_value_heads, self.head_dim - ).transpose(1, 2) - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - key_states = self.k_norm(key_states) - if past_key_value is not None: - key_states, value_states = past_key_value.update( - key_states, - value_states, - self.layer_idx, - {"cache_position": cache_position}, - ) - elif cache_position[0] != 0: - key_states, value_states = ( - past_key_value.key_cache[self.layer_idx], - past_key_value.value_cache[self.layer_idx], - ) - else: - raise ValueError( - "Cross attention layer can't find neither `cross_attn_states` nor cached values for key/values!" - ) - - # Transpose to get the expected layout for flash attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - # Apply Flash Attention - dropout_rate = self.dropout if self.training else 0.0 - output = flash_attn_func( - query_states, - key_states, - value_states, - dropout_p=dropout_rate, - softmax_scale=None, - causal=False, - return_attn_probs=output_attentions, - ) - - attn_output = output.contiguous().view(bsz, q_len, -1) - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class MllamaTextSelfFlashAttention2(MllamaTextSelfAttention): - """ - Mllama flash self-attention module. This module inherits from `MllamaTextSelfAttention` and - implements the forward pass using Flash Attention for improved performance. 
- """ - - def __init__(self, config: MllamaTextConfig, layer_idx: int, *args, **kwargs): - super().__init__(config, layer_idx, *args, **kwargs) - - # Check if flash attention version is greater or equal to 2.1 - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, # pylint: disable=unused-argument - past_key_value=None, - cache_position: Optional[torch.LongTensor] = None, - **kwargs, # pylint: disable=unused-argument - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - output_attentions = False - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # Flash attention requires the input to have the shape - # batch_size x seq_length x num_heads x head_dim - query_states = query_states.view( - bsz, q_len, self.num_heads, self.head_dim - ).transpose(1, 2) - key_states = key_states.view( - bsz, q_len, self.num_key_value_heads, self.head_dim - ).transpose(1, 2) - value_states = value_states.view( - bsz, q_len, self.num_key_value_heads, self.head_dim - ).transpose(1, 2) - - cos, sin = position_embeddings - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin - ) - - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_value.update( - key_states, value_states, self.layer_idx, cache_kwargs - ) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - # Transpose to get the expected layout for flash attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - dropout_rate = self.dropout if self.training else 0.0 - - # Handle potential silent casting to float32 - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = ( - self.config._pre_quantization_dtype # pylint: disable=protected-access - ) - else: - target_dtype = self.q_proj.weight.dtype - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - attn_output = _flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_top_left_mask=self._flash_attn_uses_top_left_mask, - is_causal=True, - ) - - attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -def patch_mllama(): - from transformers.models.mllama.modeling_mllama import ( - MLLAMA_TEXT_ATTENTION_CLASSES, - MLLAMA_TEXT_CROSS_ATTENTION_CLASSES, - MLLAMA_VISION_ATTENTION_CLASSES, - MllamaPreTrainedModel, - ) - - MllamaPreTrainedModel._supports_flash_attn_2 = ( # pylint: disable=protected-access - True - ) - 
MLLAMA_TEXT_ATTENTION_CLASSES["flash_attention_2"] = MllamaTextSelfFlashAttention2 - MLLAMA_TEXT_CROSS_ATTENTION_CLASSES["flash_attention_2"] = ( - MllamaTextCrossFlashAttention2 - ) - # fallback to SDPA - MLLAMA_VISION_ATTENTION_CLASSES["flash_attention_2"] = ( - MLLAMA_VISION_ATTENTION_CLASSES["sdpa"] - ) diff --git a/src/axolotl/monkeypatch/btlm_attn_hijack_flash.py b/src/axolotl/monkeypatch/btlm_attn_hijack_flash.py index 127590680..589980c8b 100644 --- a/src/axolotl/monkeypatch/btlm_attn_hijack_flash.py +++ b/src/axolotl/monkeypatch/btlm_attn_hijack_flash.py @@ -3,7 +3,6 @@ Flash attention monkey patch for cerebras btlm model """ import importlib -import logging from typing import Optional, Tuple import torch @@ -11,7 +10,9 @@ from accelerate import init_empty_weights from flash_attn.flash_attn_interface import flash_attn_func from transformers import AutoConfig, AutoModelForCausalLM -LOG = logging.getLogger("axolotl") +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) def replace_btlm_attn_with_flash_attn(model_name="cerebras/btlm-3b-8k-base"): diff --git a/src/axolotl/monkeypatch/fsdp2_qlora.py b/src/axolotl/monkeypatch/fsdp2_qlora.py new file mode 100644 index 000000000..a2cb7e472 --- /dev/null +++ b/src/axolotl/monkeypatch/fsdp2_qlora.py @@ -0,0 +1,205 @@ +""" +Monkeypatch to add Params4bit support to FSDP2. This enables QLoRA + FSDP2, as well as +our LoRA / QLoRA Triton kernels to work with FSDP2. + +This patch modifies the _init_sharded_param method in FSDPParam to handle bitsandbytes +Params4bit parameters. +""" + +import importlib +import inspect + +import torch +from torch.nn import Parameter + +from axolotl.monkeypatch.utils import detab_code +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) + + +def patched_torch_function(cls, func, types, args=(), kwargs=None): + """ + Patched version of Params4bit.__torch_function__ for preserving Params4bit + class identity and attributes. + """ + if kwargs is None: + kwargs = {} + + if func in [torch.chunk, torch.split]: + tensor = args[0] + result = Parameter.__torch_function__(func, types, args, kwargs) + + if isinstance(result, tuple): + return tuple( + cls( + data=chunk, + requires_grad=tensor.requires_grad, + quant_state=tensor.quant_state, + blocksize=tensor.blocksize, + compress_statistics=tensor.compress_statistics, + quant_type=tensor.quant_type, + quant_storage=tensor.quant_storage, + module=tensor.module, + bnb_quantized=tensor.bnb_quantized, + ) + for chunk in result + ) + + return cls( + data=result, + requires_grad=tensor.requires_grad, + quant_state=tensor.quant_state, + blocksize=tensor.blocksize, + compress_statistics=tensor.compress_statistics, + quant_type=tensor.quant_type, + quant_storage=tensor.quant_storage, + module=tensor.module, + bnb_quantized=tensor.bnb_quantized, + ) + + return Parameter.__torch_function__(func, types, args, kwargs) + + +# pylint: disable=protected-access +def apply_bnb_torch_function_patch(): + """ + Patch Params4bit.__torch_function__ using Axolotl-style approach. + + Returns: + True if patching succeeded, False otherwise. 
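To make the mechanism above concrete: ops like `torch.chunk` on a `Parameter` subclass normally return plain tensors, so quantization metadata is lost unless `__torch_function__` re-wraps the results, which is exactly what `patched_torch_function` does. The toy subclass below is purely illustrative, standing in for `Params4bit` with a single `tag` attribute.

```python
import torch
from torch.nn import Parameter

class TaggedParam(Parameter):
    """Hypothetical stand-in for Params4bit: a Parameter carrying extra metadata."""

    def __new__(cls, data, tag=""):
        self = super().__new__(cls, data, requires_grad=False)
        self.tag = tag
        return self

    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        result = Parameter.__torch_function__(func, types, args, kwargs)
        if func in (torch.chunk, torch.split) and isinstance(result, tuple):
            # Re-wrap each shard so the subclass (and its metadata) survives chunking.
            return tuple(cls(chunk, tag=args[0].tag) for chunk in result)
        return result

p = TaggedParam(torch.randn(8, 4), tag="nf4")
assert all(isinstance(c, TaggedParam) and c.tag == "nf4" for c in torch.chunk(p, 2))
```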
+ """ + from bitsandbytes.nn.modules import Params4bit + + Params4bit.__torch_function__ = classmethod(patched_torch_function) + + LOG.info("Successfully patched Params4bit.__torch_function__") + + +# pylint: disable=protected-access +def apply_init_sharded_param_patch(): + """Apply patch to FSDPParam._init_sharded_param to support Params4bit.""" + from torch.distributed.fsdp._fully_shard._fsdp_param import FSDPParam + + # Get original source + original_source = inspect.getsource(FSDPParam._init_sharded_param) + original_source, _ = detab_code(original_source) + + # Define the replacement + original_param_creation = """ self.sharded_param = nn.Parameter(self.to_sharded_dtensor(sharded_param)) + self.sharded_param.requires_grad_(param.requires_grad)""" + + patched_param_creation = """ import bitsandbytes as bnb + if isinstance(param, bnb.nn.modules.Params4bit): + self.sharded_param = bnb.nn.modules.Params4bit( + data=sharded_param, + requires_grad=param.requires_grad, + quant_state=param.quant_state, + blocksize=param.blocksize, + compress_statistics=param.compress_statistics, + quant_type=param.quant_type, + quant_storage=param.quant_storage, + module=param.module, + bnb_quantized=param.bnb_quantized, + ) + self.sharded_param = self.to_sharded_dtensor(self.sharded_param) + else: + self.sharded_param = nn.Parameter(self.to_sharded_dtensor(sharded_param)) + self.sharded_param.requires_grad_(param.requires_grad)""" + + # Apply the replacement + if original_param_creation in original_source: + patched_source = original_source.replace( + original_param_creation, patched_param_creation + ) + patched_source = patched_source.replace( + "def _init_sharded_param(", + "def patched_init_sharded_param(", + 1, + ) + + # Load necessary imports + module_name = FSDPParam.__module__ + module = importlib.import_module(module_name) + + items_to_import = [] + for item in dir(module): + if item in patched_source: + items_to_import.append(item) + + exec( # pylint: disable=exec-used # nosec B102 + f"from {module_name} import ({', '.join(items_to_import)})", + globals(), + ) + exec(patched_source, globals()) # pylint: disable=exec-used # nosec B102 + + # Replace the method + FSDPParam._init_sharded_param = patched_init_sharded_param # pylint: disable=undefined-variable # noqa: F821 + LOG.info("Successfully applied FSDP _init_sharded_param patch") + else: + LOG.warning("Could not find target code for _init_sharded_param patching") + + +def apply_init_unsharded_param_patch(): + """Apply patch to FSDPParam.init_unsharded_param to support Params4bit.""" + from torch.distributed.fsdp._fully_shard._fsdp_param import FSDPParam + + # Get original source + original_source = inspect.getsource(FSDPParam.init_unsharded_param) + original_source, _ = detab_code(original_source) + + # Define the replacement + original_param_creation = """ self._unsharded_param = nn.Parameter( + unsharded_param, requires_grad=self.sharded_param.requires_grad + )""" + + patched_param_creation = """ import bitsandbytes as bnb + local_tensor = self.sharded_param._local_tensor + if isinstance(local_tensor, bnb.nn.modules.Params4bit): + self._unsharded_param = bnb.nn.modules.Params4bit( + data=unsharded_param, + requires_grad=self.sharded_param.requires_grad, + quant_state=local_tensor.quant_state, + blocksize=local_tensor.blocksize, + compress_statistics=local_tensor.compress_statistics, + quant_type=local_tensor.quant_type, + quant_storage=local_tensor.quant_storage, + module=local_tensor.module, + bnb_quantized=local_tensor.bnb_quantized, + ) + 
else: + self._unsharded_param = nn.Parameter( + unsharded_param, requires_grad=self.sharded_param.requires_grad + )""" + + # Apply the replacement + if original_param_creation in original_source: + patched_source = original_source.replace( + original_param_creation, patched_param_creation + ) + patched_source = patched_source.replace( + "def init_unsharded_param(", + "def patched_init_unsharded_param(", + 1, + ) + + # Load necessary imports + module_name = FSDPParam.__module__ + module = importlib.import_module(module_name) + + items_to_import = [] + for item in dir(module): + if item in patched_source: + items_to_import.append(item) + + exec( # pylint: disable=exec-used # nosec B102 + f"from {module_name} import ({', '.join(items_to_import)})", + globals(), + ) + exec(patched_source, globals()) # pylint: disable=exec-used # nosec B102 + + # Replace the method + FSDPParam.init_unsharded_param = patched_init_unsharded_param # pylint: disable=undefined-variable # noqa: F821 + LOG.info("Successfully applied FSDP init_unsharded_param patch") + else: + LOG.warning("Could not find target code for patching") diff --git a/src/axolotl/monkeypatch/gemma3.py b/src/axolotl/monkeypatch/gemma3.py deleted file mode 100644 index 36f591efd..000000000 --- a/src/axolotl/monkeypatch/gemma3.py +++ /dev/null @@ -1,230 +0,0 @@ -"""Monkeypatch for gemma3 conditional generation forward to fix loss exploding""" - -# pylint: disable=duplicate-code - -from typing import Optional, Tuple, Union - -import torch -from transformers.cache_utils import Cache -from transformers.models.gemma3.modeling_gemma3 import ( - Gemma3CausalLMOutputWithPast, - logger, -) -from transformers.utils import ( - is_torchdynamo_compiling, -) -from transformers.utils.deprecation import deprecate_kwarg - - -@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") -def new_forward( - self, - input_ids: torch.LongTensor = None, - pixel_values: torch.FloatTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None, - token_type_ids: Optional[torch.LongTensor] = None, - cache_position: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - logits_to_keep: Union[int, torch.Tensor] = 0, - **lm_kwargs, -) -> Union[Tuple, Gemma3CausalLMOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`. - - logits_to_keep (`int` or `torch.Tensor`, *optional*): - If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. 
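The two patches above follow the same recipe: read the original method's source with `inspect`, splice in the `Params4bit`-aware branch as a string, `exec` the result, and rebind the method. A self-contained toy version of that recipe, with invented names used only for illustration:

```python
import inspect
import textwrap

class Greeter:
    def greet(self) -> str:
        return "hello"

source = textwrap.dedent(inspect.getsource(Greeter.greet))
patched = source.replace('return "hello"', 'return "hello, patched"')
patched = patched.replace("def greet(", "def patched_greet(", 1)

namespace: dict = {}
exec(patched, namespace)  # same exec-based rebinding the patches above rely on
Greeter.greet = namespace["patched_greet"]

assert Greeter().greet() == "hello, patched"
```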
- This is useful when using packed tensor format (single dimension for batch and sequence length). - - Returns: - - Example: - - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration - - >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/Gemma3-test-224px-hf") - >>> processor = AutoProcessor.from_pretrained("google/Gemma3-test-224px-hf") - - >>> prompt = "answer en Where is the cow standing?" - >>> url = "https://huggingface.co/gv-hf/Gemma3-test-224px-hf/resolve/main/cow_beach_1.png" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor(images=image, text=prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(**inputs, max_length=30) - >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "answer en Where is the cow standing?\nbeach" - ```""" - - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - is_training = token_type_ids is not None and labels is not None - - # Replace image id with PAD if the image token is OOV, to avoid index-errors - if input_ids is not None and self.config.image_token_index >= self.vocab_size: - special_image_mask = input_ids == self.config.image_token_index - llm_input_ids = input_ids.clone() - llm_input_ids[special_image_mask] = 0 - else: - llm_input_ids = input_ids - - if inputs_embeds is None: - inputs_embeds = self.get_input_embeddings()(llm_input_ids) - - if cache_position is None: - past_seen_tokens = ( - past_key_values.get_seq_length() if past_key_values is not None else 0 - ) - cache_position = torch.arange( - past_seen_tokens, - past_seen_tokens + inputs_embeds.shape[1], - device=inputs_embeds.device, - ) - - # Merge text and images - if pixel_values is not None: - image_features = self.get_image_features(pixel_values) - - if input_ids is None: - special_image_mask = inputs_embeds == self.get_input_embeddings()( - torch.tensor( - self.config.image_token_index, - dtype=torch.long, - device=inputs_embeds.device, - ) - ) - else: - special_image_mask = (input_ids == self.config.image_token_index).unsqueeze( - -1 - ) - special_image_mask = special_image_mask.expand_as(inputs_embeds).to( - inputs_embeds.device - ) - - if ( - not is_torchdynamo_compiling() - and inputs_embeds[special_image_mask].numel() != image_features.numel() - ): - image_tokens_in_text = (special_image_mask).sum(dim=1).sum(dim=0)[0] - raise ValueError( - f"Number of images does not match number of special image tokens in the input text. " - f"Got {image_tokens_in_text} image tokens in the text but {image_features.shape[0] * image_features.shape[1]} " - "tokens from image embeddings." 
- ) - image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) - - # mask out pad-token-ids in labels for BC - if labels is not None and self.pad_token_id in labels: - logger.warning_once( - "`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. " - "You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46.", - ) - labels = torch.where( - input_ids == self.pad_token_id, self.config.ignore_index, labels - ) - - causal_mask = self._update_causal_mask( # pylint: disable=protected-access - attention_mask, - token_type_ids, - past_key_values, - cache_position, - inputs_embeds, - is_training, - ) - outputs = self.language_model( - attention_mask=causal_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - logits_to_keep=logits_to_keep, - **lm_kwargs, - ) - - logits = outputs[0] - loss = None - if labels is not None: - if attention_mask is not None: - # Get the shifted attention mask - shift_attention_mask = attention_mask[:, -logits.shape[1] + 1 :].to( - logits.device - ) # +1 for shift - - # Filter logits and labels based on attention mask - valid_indices = shift_attention_mask != 0 - filtered_logits = logits[..., :-1, :][valid_indices] - filtered_labels = labels[..., 1:][valid_indices.to(labels.device)] - - # TODO: do we need to handle num_items_in_batch given we filter the logits and labels? - - loss = self.loss_function( - logits=filtered_logits, - labels=None, # we pass shift_labels - shift_labels=filtered_labels, - vocab_size=self.config.text_config.vocab_size, - **lm_kwargs, - ) - else: - # Standard case without filtering - loss = self.loss_function( - logits=logits, - labels=labels, - vocab_size=self.config.text_config.vocab_size, - **lm_kwargs, - ) - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return Gemma3CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - image_hidden_states=image_features if pixel_values is not None else None, - ) - - -def patch_gemma3conditionalgeneration_forward(): - from transformers.models.gemma3.modeling_gemma3 import ( - Gemma3ForConditionalGeneration, - ) - - Gemma3ForConditionalGeneration.forward = new_forward diff --git a/src/axolotl/monkeypatch/gradient_checkpointing/__init__.py b/src/axolotl/monkeypatch/gradient_checkpointing/__init__.py index 5d631776b..3b090d5e5 100644 --- a/src/axolotl/monkeypatch/gradient_checkpointing/__init__.py +++ b/src/axolotl/monkeypatch/gradient_checkpointing/__init__.py @@ -5,7 +5,7 @@ from functools import partial from packaging import version -from axolotl.monkeypatch.gradient_checkpointing.offload_cpu import ( +from axolotl.monkeypatch.gradient_checkpointing.offload_cpu import ( # noqa: F401 CPU_Offloaded_Gradient_Checkpointer, ) from axolotl.monkeypatch.gradient_checkpointing.offload_disk import ( diff --git a/src/axolotl/monkeypatch/gradient_checkpointing/offload_cpu.py b/src/axolotl/monkeypatch/gradient_checkpointing/offload_cpu.py index bbb5ad40d..bbcfb91e6 100644 --- a/src/axolotl/monkeypatch/gradient_checkpointing/offload_cpu.py +++ 
b/src/axolotl/monkeypatch/gradient_checkpointing/offload_cpu.py @@ -13,8 +13,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +import inspect + import torch from packaging import version +from torch.utils.checkpoint import ( + set_device_states, +) + +# support different pytorch versions +has_device_type = "device_type" in inspect.signature(set_device_states).parameters torch_version = version.parse(torch.__version__) diff --git a/src/axolotl/monkeypatch/gradient_checkpointing/offload_disk.py b/src/axolotl/monkeypatch/gradient_checkpointing/offload_disk.py index 90e70f504..792d3c6ef 100644 --- a/src/axolotl/monkeypatch/gradient_checkpointing/offload_disk.py +++ b/src/axolotl/monkeypatch/gradient_checkpointing/offload_disk.py @@ -18,7 +18,6 @@ DISCO - DIsk-based Storage and Checkpointing with Optimized prefetching import atexit import concurrent.futures -import logging import os import queue import shutil @@ -32,11 +31,13 @@ from typing import Dict import torch +from axolotl.utils.logging import get_logger + torch_cuda_amp_custom_fwd = torch.amp.custom_fwd(device_type="cuda") torch_cuda_amp_custom_bwd = torch.amp.custom_bwd(device_type="cuda") # Setup logger -logger = logging.getLogger(__name__) +logger = get_logger(__name__) class DiskOffloadManager: diff --git a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py index 998a81027..1316b5374 100644 --- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py +++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py @@ -2,46 +2,33 @@ # copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py -import logging import warnings -from typing import List, Optional, Tuple, Union +from typing import Optional, Tuple import torch -import torch.nn.functional as F import transformers from einops import rearrange from flash_attn.bert_padding import pad_input, unpad_input -from transformers.modeling_outputs import BaseModelOutputWithPast -from transformers.models.llama.modeling_llama import ( - LlamaAttention, -) -from transformers.models.llama.modeling_llama import ( - LlamaDecoderLayer as OriginalLlamaDecoderLayer, -) from transformers.models.llama.modeling_llama import ( LlamaMLP, apply_rotary_pos_emb, repeat_kv, ) -from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids, set_module_name +from axolotl.monkeypatch.utils import set_module_name +from axolotl.utils.logging import get_logger try: from flash_attn.flash_attn_interface import ( # pylint: disable=ungrouped-imports - flash_attn_kvpacked_func, - flash_attn_varlen_kvpacked_func, flash_attn_varlen_qkvpacked_func, ) except ImportError: - from flash_attn.flash_attn_interface import ( - flash_attn_unpadded_kvpacked_func as flash_attn_varlen_kvpacked_func, - ) from flash_attn.flash_attn_interface import ( flash_attn_unpadded_qkvpacked_func as flash_attn_varlen_qkvpacked_func, ) -LOG = logging.getLogger("axolotl") +LOG = get_logger(__name__) def is_xformers_available() -> bool: @@ -82,19 +69,6 @@ def replace_llama_mlp_with_swiglu(model): set_module_name(model, name, mlp) -def replace_llama_qkv_with_fused(model): - for name, module in model.named_modules(): - if isinstance(module, LlamaAttention): - qkv = FusedAttention( - module.config, - module.q_proj, - module.k_proj, - module.v_proj, - module.o_proj, - ) - set_module_name(model, name, qkv) - - def 
patch_fa_llama_cross_entropy(): LOG.info( "patching transformers.loss.loss_utils.fixed_cross_entropy with flash_attn.ops.triton.cross_entropy" @@ -142,7 +116,6 @@ def patch_llama_rms_norm(): def replace_llama_attn_with_flash_attn( - packed: Optional[bool] = False, cross_entropy: Optional[bool] = False, rms_norm: Optional[bool] = False, use_shifted_sparse_attn: Optional[bool] = False, @@ -154,16 +127,6 @@ def replace_llama_attn_with_flash_attn( transformers.models.llama.modeling_llama.LlamaAttention.forward = ( flashattn_forward_with_s2attn ) - else: - transformers.models.llama.modeling_llama.LlamaAttention.forward = ( - flashattn_forward - ) - - if packed: - transformers.models.llama.modeling_llama.LlamaDecoderLayer = LlamaDecoderLayer - transformers.models.llama.modeling_llama.LlamaModel.forward = ( - llama_model_forward - ) # skip only if explicitly disabled if cross_entropy: @@ -174,49 +137,6 @@ def replace_llama_attn_with_flash_attn( patch_llama_rms_norm() -class FusedAttention(LlamaAttention): - """ - Fused QKV Attention layer for incrementally improved training efficiency - """ - - def __init__( - self, - config, - q: torch.nn.Linear, # pylint: disable=invalid-name - k: torch.nn.Linear, # pylint: disable=invalid-name - v: torch.nn.Linear, # pylint: disable=invalid-name - o: torch.nn.Linear, # pylint: disable=invalid-name - ): - super().__init__(config) - self.config = config - self.init_device = next(iter(q.state_dict().values())).device - - # define equivalent fused qkv projection - self.out_features: List[int] = [q.out_features, k.out_features, v.out_features] - self.qkv_proj = torch.nn.Linear( - q.in_features, sum(self.out_features), device=self.init_device, bias=False - ) - self.o_proj = o - - # overwrite initialized weights with pretrained weights - self.qkv_proj.weight.data = torch.cat( - (q.weight.data, k.weight.data, v.weight.data), dim=0 - ) - - def _post_training(self, model, name): - q_proj, k_proj, v_proj = torch.split( - self.qkv_proj.weight.data, self.out_features, dim=0 - ) - - new_attn = LlamaAttention(self.config) - new_attn.q_proj.weight.data = q_proj - new_attn.k_proj.weight.data = k_proj - new_attn.v_proj.weight.data = v_proj - new_attn.o_proj.weight.data = self.o_proj.weight.data - - set_module_name(model, name, new_attn) - - # Disable the transformation of the attention mask in LlamaModel as the flash attention # requires the attention mask to be the same as the key_padding_mask def _prepare_decoder_attention_mask( @@ -355,574 +275,3 @@ def flashattn_forward_with_s2attn( .reshape(bsz, q_len, nheads, self.head_dim) ) return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, past_key_value - - -def flashattn_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - padding_mask: Optional[torch.LongTensor] = None, # pylint: disable=unused-argument - cu_seqlens: Optional[torch.Tensor] = None, - max_seqlen: Optional[torch.Tensor] = None, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel - - attention_mask: [bsz, q_len] - """ - # pylint: disable=duplicate-code - bsz, q_len, _ = hidden_states.size() - - if not hasattr(self, "pretraining_tp"): - self.pretraining_tp = 1 - - if self.pretraining_tp > 1: - key_value_slicing = ( - self.num_key_value_heads * self.head_dim - ) // 
self.pretraining_tp - query_slices = self.q_proj.weight.split( - (self.num_heads * self.head_dim) // self.pretraining_tp, dim=0 - ) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [ - F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp) - ] - query_states = torch.cat(query_states, dim=-1) - - key_states = [ - F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp) - ] - key_states = torch.cat(key_states, dim=-1) - - value_states = [ - F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp) - ] - value_states = torch.cat(value_states, dim=-1) - - else: - if isinstance(self, FusedAttention): - query_states, key_states, value_states = self.qkv_proj(hidden_states).split( - self.out_features, dim=-1 - ) - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view( - bsz, q_len, self.num_heads, self.head_dim - ).transpose(1, 2) - key_states = key_states.view( - bsz, q_len, self.num_key_value_heads, self.head_dim - ).transpose(1, 2) - value_states = value_states.view( - bsz, q_len, self.num_key_value_heads, self.head_dim - ).transpose(1, 2) - # [bsz, q_len, nh, hd] - # [bsz, nh, q_len, hd] - - cos, sin = self.rotary_emb(value_states, position_ids=position_ids) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin, position_ids - ) - # [bsz, nh, t, hd] - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if output_attentions: - warnings.warn( - "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead." - ) - - # - # flash-attn v2 start - # - - if self.training: - # during training q,k,v always have same seqlen - assert key_states.shape == query_states.shape - is_causal = True - else: - # turn off FA causal mask after first inference autoregressive iteration - # only on first autoregressive step q,k,v have same seqlen - is_causal = key_states.shape == query_states.shape - - dropout_rate = 0.0 if not self.training else getattr(self, "attention_dropout", 0.0) - - if cu_seqlens is not None and max_seqlen is not None and cu_seqlens.dim() == 1: - # special handling using sample packing - qkv = torch.stack( - [query_states, key_states, value_states], dim=2 - ) # [bsz, nh, 3, q_len, hd] - qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd] - qkv = rearrange(qkv, "b s ... -> (b s) ...") - - output = flash_attn_varlen_qkvpacked_func( - qkv, - cu_seqlens, - max_seqlen, - dropout_p=dropout_rate, - softmax_scale=None, - causal=True, - ) - output = rearrange(output, "(b s) ... 
-> b s ...", b=bsz) - elif query_states.shape == key_states.shape: - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - qkv_unpad, cu_seqlens_q, max_seqlen_q, _, output_pad_fn = generate_qkv( - query_states, - key_states, - value_states, - qkvpacked=True, - # We have disabled _prepare_decoder_attention_mask in LlamaModel - # the attention_mask should be the same as the key_padding_mask - key_padding_mask=attention_mask, - query_padding_mask=( - attention_mask[:, -query_states.size(1) :] - if attention_mask is not None - else None - ), - ) - output_unpad = flash_attn_varlen_qkvpacked_func( - qkv_unpad, - cu_seqlens_q, - max_seqlen_q, - dropout_p=dropout_rate, - softmax_scale=None, - causal=is_causal, - ) - output = output_pad_fn(output_unpad) - else: - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - if attention_mask is None or attention_mask.all().item(): - output = flash_attn_kvpacked_func( - query_states, - torch.stack([key_states, value_states], 2), - dropout_p=dropout_rate, - causal=is_causal, - ) - else: - ( # pylint: disable=unbalanced-tuple-unpacking - q_unpad, - kv_unpad, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - _, - _, - output_pad_fn, - ) = generate_qkv( - query_states, - key_states, - value_states, - kvpacked=True, - key_padding_mask=attention_mask, - query_padding_mask=( - attention_mask[:, -query_states.size(1) :] - if attention_mask is not None - else None - ), - ) - if q_unpad.dtype != kv_unpad.dtype: - kv_unpad = kv_unpad.to(q_unpad.dtype) - output_unpad = flash_attn_varlen_kvpacked_func( - q_unpad, - kv_unpad, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p=dropout_rate, - softmax_scale=None, - causal=is_causal, - ) - output = output_pad_fn(output_unpad) - - attn_output = output - if attn_output.size() != (bsz, q_len, self.num_heads, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, q_len, self.num_heads, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - attn_output = rearrange(attn_output, "b s h d -> b s (h d)") - - # - # flash-attn v2 end - # - - if self.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split( - self.hidden_size // self.pretraining_tp, dim=1 - ) - attn_output = sum( - F.linear(attn_output[i], o_proj_slices[i]) - for i in range(self.pretraining_tp) - ) - else: - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -# based on https://github.com/Dao-AILab/flash-attention/blob/364a5b/tests/test_flash_attn.py#L38 -def generate_qkv( - q, - k, - v, - query_padding_mask=None, - key_padding_mask=None, - kvpacked=False, - qkvpacked=False, -): # pylint: disable=invalid-name,unnecessary-lambda-assignment - """ - Arguments: - q: (batch_size, seqlen_q, nheads, d) - k: (batch_size, seqlen_k, nheads_k, d) - v: (batch_size, seqlen_k, nheads_k, d) - query_padding_mask: (batch_size, seqlen), bool - key_padding_mask: (batch_size, seqlen), bool - """ - assert not (kvpacked and qkvpacked) - batch_size, seqlen_q, nheads, d = q.shape - _, seqlen_k, nheads_k, _ = k.shape - assert k.shape == (batch_size, seqlen_k, nheads_k, d) - assert v.shape == (batch_size, seqlen_k, nheads_k, d) - - if query_padding_mask is not None: - q_unpad, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input( - q, 
query_padding_mask - ) - - output_pad_fn = lambda output_unpad: pad_input( # noqa: E731 - output_unpad, indices_q, batch_size, seqlen_q - ) - - else: - q_unpad = rearrange(q, "b s h d -> (b s) h d") - cu_seqlens_q = torch.arange( - 0, - (batch_size + 1) * seqlen_q, - step=seqlen_q, - dtype=torch.int32, - device=q_unpad.device, - ) - max_seqlen_q = seqlen_q - - output_pad_fn = lambda output_unpad: rearrange( # noqa: E731 - output_unpad, "(b s) h d -> b s h d", b=batch_size - ) - - if key_padding_mask is not None: - k_unpad, _, cu_seqlens_k, max_seqlen_k = unpad_input(k, key_padding_mask) - v_unpad, _, _, _ = unpad_input(v, key_padding_mask) - else: - k_unpad = rearrange(k, "b s h d -> (b s) h d") - v_unpad = rearrange(v, "b s h d -> (b s) h d") - cu_seqlens_k = torch.arange( - 0, - (batch_size + 1) * seqlen_k, - step=seqlen_k, - dtype=torch.int32, - device=k_unpad.device, - ) - max_seqlen_k = seqlen_k - - if qkvpacked: - assert nheads == nheads_k - qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1) - qkv = torch.stack([q, k, v], dim=2) - return (qkv_unpad, cu_seqlens_q, max_seqlen_q, qkv, output_pad_fn) - - if kvpacked: - kv_unpad = torch.stack([k_unpad, v_unpad], dim=1) - kv = torch.stack([k, v], dim=2) - return ( - q_unpad, - kv_unpad, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - q, - kv, - output_pad_fn, - ) - - return ( - q_unpad, - k_unpad, - v_unpad, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - q, - k, - v, - output_pad_fn, - ) - - -def llama_model_forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[ # pylint: disable=unused-argument - torch.LongTensor - ] = None, -) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time" - ) - if input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError( - "You have to specify either decoder_input_ids or decoder_inputs_embeds" - ) - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length - - cu_seqlens = None - max_seqlen = None - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, - seq_length + past_key_values_length, - dtype=torch.long, - device=device, - ) - position_ids = 
position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - cu_seqlens, max_seqlen = get_cu_seqlens_from_pos_ids(position_ids) - cu_seqlens = cu_seqlens.squeeze() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones( - (batch_size, seq_length_with_past), - dtype=torch.bool, - device=inputs_embeds.device, - ) - padding_mask = None - else: - if 0 in attention_mask: - padding_mask = attention_mask - else: - padding_mask = None - - attention_mask = ( - self._prepare_decoder_attention_mask( # pylint: disable=protected-access - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - ) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - transformers.logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states,) - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module( - *inputs, - ) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - past_key_value, - output_attentions, - None, - padding_mask, - cu_seqlens, - max_seqlen, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cu_seqlens=cu_seqlens, - max_seqlen=max_seqlen, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple( - v - for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None - ) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class LlamaDecoderLayer(OriginalLlamaDecoderLayer): - """ - patched version of LlamaDecoderLayer to pass through the precalculated cu_seqlens - """ - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - padding_mask: Optional[torch.LongTensor] = None, - cu_seqlens: Optional[torch.Tensor] = None, - max_seqlen: Optional[torch.Tensor] = None, - ) -> Tuple[ - torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] - ]: - 
""" - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - cu_seqlens (`torch.Tensor`, *optional*) cumulative sequence len when packing - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - padding_mask=padding_mask, - cu_seqlens=cu_seqlens, - max_seqlen=max_seqlen, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs diff --git a/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py b/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py index 0c1a4e822..28223eee3 100644 --- a/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py +++ b/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py @@ -2,7 +2,6 @@ Directly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments """ -import logging import warnings from typing import Optional, Tuple @@ -11,10 +10,14 @@ import torch.nn.functional as F import transformers.models.llama.modeling_llama from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) + try: import xformers.ops except ImportError: - logging.error("xformers not found! Please install it before trying to use it.") + LOG.error("xformers not found! 
Please install it before trying to use it.") def hijack_llama_attention(): diff --git a/src/axolotl/monkeypatch/lora_kernels.py b/src/axolotl/monkeypatch/lora_kernels.py index 6c920dcc8..be1e1f2ff 100644 --- a/src/axolotl/monkeypatch/lora_kernels.py +++ b/src/axolotl/monkeypatch/lora_kernels.py @@ -7,7 +7,6 @@ import types from typing import Generator, Tuple, Type import torch -from accelerate.logging import get_logger from peft import PeftModelForCausalLM from torch import nn from transformers import AutoConfig @@ -19,7 +18,9 @@ from axolotl.kernels.lora import ( apply_lora_qkv, ) from axolotl.monkeypatch.utils import detab_code +from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger LOG = get_logger(__name__) @@ -145,12 +146,25 @@ def get_attention_cls_from_config(cfg: DictDefault) -> Type[nn.Module]: return Qwen2Attention + if model_type == "mllama": + from transformers.models.mllama.modeling_mllama import MllamaTextSelfAttention + + return MllamaTextSelfAttention + + if model_type == "llama4": + from transformers.models.llama4.modeling_llama4 import Llama4TextAttention + + return Llama4TextAttention + + if model_type == "mistral3": + from transformers.models.mistral.modeling_mistral import MistralAttention + + return MistralAttention + try: # Dynamically import the module and attention class module_path = f"transformers.models.{model_type}.modeling_{model_type}" - model_cls_prefix = "".join( - [part.capitalize() for part in model_type.split("_")] - ) + model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type) module = __import__(module_path, fromlist=[f"{model_cls_prefix}Attention"]) attention_cls = getattr(module, f"{model_cls_prefix}Attention") @@ -269,6 +283,29 @@ def find_mlp_in_layer( ) +def get_layers(model: PeftModelForCausalLM) -> list[nn.Module]: + """ + Get the layers of the model. Handles text-only and multimodal models. + + Args: + model: A PEFT model. + + Returns: + A list of layers. + """ + pretrained_model = model.model + + # check for multimodal models first + if hasattr(pretrained_model, "language_model"): + return pretrained_model.language_model.layers + if hasattr(pretrained_model, "model"): + return pretrained_model.model.layers + + raise NotImplementedError( + f"Model type {model.config.model_type} is not supported yet. Please create an Issue." + ) + + def apply_lora_kernel_patches( model: PeftModelForCausalLM, cfg: DictDefault ) -> PeftModelForCausalLM: @@ -340,16 +377,7 @@ def apply_lora_kernel_patches( if activation not in SUPPORTED_ACTIVATIONS: raise NotImplementedError(f"Activation {activation} is not supported") - layers = [] - # check for multimodal models first - if hasattr(model, "language_model"): - layers = model.language_model.model.layers - elif hasattr(model, "model"): - layers = model.model.model.layers - else: - raise NotImplementedError( - f"Model type {model.config.model_type} is not supported yet. Please create an Issue." 
- ) + layers = get_layers(model) # Patch each layer for layer in layers: @@ -367,7 +395,6 @@ def apply_lora_kernel_patches( ] can_patch_qkv = all( hasattr(module, "lora_A") - and getattr(module, "base_layer", module).bias is None and len(getattr(module, "lora_magnitude_vector", []) or []) == 0 for module in layer_modules ) @@ -377,7 +404,8 @@ def apply_lora_kernel_patches( self_attn.apply_qkv = types.MethodType(apply_lora_qkv, self_attn) else: LOG.warning_once( - "Cannot patch some attention QKV projections - requires LoRA adapters with no bias" + "Cannot patch some attention QKV projections - requires LoRA " + "adapters and no lora_magnitude_vector (DoRA)" ) if cfg.lora_o_kernel: # Output patching @@ -386,7 +414,6 @@ def apply_lora_kernel_patches( ] can_patch_o = all( hasattr(module, "lora_A") - and getattr(module, "base_layer", module).bias is None and len(getattr(module, "lora_magnitude_vector", []) or []) == 0 for module in layer_modules ) @@ -395,14 +422,14 @@ def apply_lora_kernel_patches( self_attn.apply_o = types.MethodType(apply_lora_o, self_attn) else: LOG.warning_once( - "Cannot patch some attention output projection - requires LoRA adapters with no bias" + "Cannot patch some attention output projection - requires LoRA " + "adapters and no lora_magnitude_vector (DoRA)" ) for gate_proj, up_proj, down_proj, mlp in find_mlp_in_layer(layer): if cfg.lora_mlp_kernel: # MLP patching can_patch_mlp = all( hasattr(proj, "lora_A") - and getattr(proj, "base_layer", proj).bias is None and len(getattr(proj, "lora_magnitude_vector", []) or []) == 0 for proj in (gate_proj, up_proj, down_proj) ) @@ -412,7 +439,8 @@ def apply_lora_kernel_patches( layer.mlp.forward = types.MethodType(apply_fn, mlp) else: LOG.warning_once( - "Cannot patch some MLP layers - requires LoRA adapters with no bias" + "Cannot patch some MLP layers - requires LoRA adapters and no " + "lora_magnitude_vector (DoRA)" ) LOG.setLevel(original_level) diff --git a/src/axolotl/monkeypatch/loss/__init__.py b/src/axolotl/monkeypatch/loss/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/axolotl/monkeypatch/loss/chunked.py b/src/axolotl/monkeypatch/loss/chunked.py new file mode 100644 index 000000000..0a9d0de82 --- /dev/null +++ b/src/axolotl/monkeypatch/loss/chunked.py @@ -0,0 +1,134 @@ +""" +chunked ce loss +""" + +from typing import List, Optional + +import torch +import torch.nn.functional as F + + +# copied and modified from torchtune.modules.loss.CEWithChunkedOutputLoss +class CEWithChunkedOutputLoss(torch.nn.Module): + """ + Cross-entropy with chunked outputs that saves memory by only upcasting one chunk at a time. + + For more details, please refer to: https://github.com/pytorch/torchtune/pull/1390 + """ + + def __init__(self, num_output_chunks: int = 8, ignore_index: int = -100): + super().__init__() + self.num_output_chunks = num_output_chunks + self.ignore_index = ignore_index + + def compute_cross_entropy( + self, + logits: torch.Tensor, + labels: torch.Tensor, + normalize: bool = True, # pylint: disable=unused-argument + ) -> torch.Tensor: + """ + Upcast logits to fp32 and compute cross entropy loss. 
+ """ + return F.cross_entropy( + logits.float(), labels, ignore_index=self.ignore_index, reduction="sum" + ) + + def forward( + self, logits: List[torch.Tensor], labels: torch.Tensor, reduction="sum" + ) -> torch.Tensor: + """ + Args: + logits (List[torch.Tensor]): List of chunked logits of length + ``self.num_output_chunks``, where each chunk has shape + ``(batch_size, num_tokens / num_output_chunks, vocab_size)``. + labels (torch.Tensor): Ground truth labels of shape ``(batch_size, num_tokens)``. + reduction (str): The reduction to apply to the output. + + Returns: + torch.Tensor: Cross entropy loss of shape (1,). + """ + + total_elements = (labels != self.ignore_index).sum() + + # chunk and reshape labels (bsz, num_tokens, vocab) -> [(bsz*num_tokens/num_chunks, vocab)] + labels = [ + target_chunk.reshape(-1) + for target_chunk in labels.chunk(self.num_output_chunks, dim=1) + ] + # reshape logits [(bsz, num_tokens/num_chunks, vocab)] -> [(bsz*num_tokens/num_chunks, vocab)] + logits = [ + logit_chunk.reshape(-1, logit_chunk.size(-1)) for logit_chunk in logits + ] + + # compute one chunk at a time + total_loss = 0.0 + for logits_chunk, labels_chunk in zip(logits, labels): + total_loss += self.compute_cross_entropy(logits_chunk, labels_chunk) + + if reduction == "sum": + return total_loss + return total_loss / total_elements + + +def _build_chunked_ce_loss_fn(num_output_chunks: int = 8, ignore_index: int = -100): + loss_fn_ce = CEWithChunkedOutputLoss(num_output_chunks, ignore_index) + loss_fn_ce.compute_cross_entropy = torch.compile( + loss_fn_ce.compute_cross_entropy, backend="inductor" + ) + return loss_fn_ce + + +def get_causal_lm_loss(num_output_chunks: int = 8, ignore_index: int = -100): + loss_fn_ce = _build_chunked_ce_loss_fn(num_output_chunks, ignore_index) + + def chunked_fix_cross_entropy( + source, + target, + num_items_in_batch: int = None, + ignore_index: int = -100, + **kwargs, + ): # pylint: disable=unused-argument + reduction = "sum" if num_items_in_batch is not None else "mean" + logit_chunks = [ # pylint: disable=unnecessary-comprehension + chunk for chunk in source.chunk(loss_fn_ce.num_output_chunks, dim=1) + ] + loss = loss_fn_ce(logit_chunks, target, reduction=reduction) + if reduction == "sum": + loss = loss / num_items_in_batch + return loss + + def for_causal_lm_chunked_loss( + logits, + labels, + vocab_size: int = None, # pylint: disable=unused-argument + num_items_in_batch: Optional[int] = None, + ignore_index: int = -100, + shift_labels: Optional[torch.Tensor] = None, + **kwargs, + ) -> torch.Tensor: + # skip the upcast to float since we handle that in the chunking loss + if shift_labels is None: + # Shift so that tokens < n predict n + labels = F.pad(labels, (0, 1), value=ignore_index) + shift_labels = labels[..., 1:].contiguous() + + # Skip Flattening the tokens + # Enable model parallelism + shift_labels = shift_labels.to(logits.device) + loss = chunked_fix_cross_entropy( + logits, shift_labels, num_items_in_batch, ignore_index, **kwargs + ) + return loss + + return for_causal_lm_chunked_loss + + +def patch_chunked_ce_loss_fn(num_output_chunks: int = 8, ignore_index: int = -100): + import transformers.loss.loss_utils + + for_causal_lm_chunked_loss = get_causal_lm_loss(num_output_chunks, ignore_index) + transformers.loss.loss_utils.ForCausalLMLoss = for_causal_lm_chunked_loss + transformers.loss.loss_utils.LOSS_MAPPING["ForCausalLM"] = ( + for_causal_lm_chunked_loss + ) diff --git a/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py 
b/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py index ac9815fce..e1be424a3 100644 --- a/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py +++ b/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py @@ -2,52 +2,13 @@ # pylint: disable=duplicate-code -import logging from functools import partial -from typing import List, Optional, Tuple, Union -import torch import transformers -from einops import rearrange -from flash_attn.bert_padding import pad_input, unpad_input -from flash_attn.flash_attn_interface import ( # pylint: disable=ungrouped-imports - flash_attn_kvpacked_func, - flash_attn_varlen_kvpacked_func, - flash_attn_varlen_qkvpacked_func, -) -from transformers.modeling_outputs import BaseModelOutputWithPast -from transformers.models.mistral.modeling_mistral import ( - MistralAttention as OriginalMistralAttention, -) -from transformers.models.mistral.modeling_mistral import ( - MistralDecoderLayer as OriginalMistralDecoderLayer, -) -from transformers.models.mistral.modeling_mistral import ( - apply_rotary_pos_emb, - repeat_kv, -) -from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids +from axolotl.utils.logging import get_logger -LOG = logging.getLogger("axolotl.monkeypatch.mistral") - - -def replace_mistral_attn_with_flash_attn( - packed: Optional[bool] = False, -): - transformers.models.mistral.modeling_mistral.MistralModel._prepare_decoder_attention_mask = ( # pylint: disable=protected-access - _prepare_decoder_attention_mask - ) - transformers.models.mistral.modeling_mistral.MistralAttention.forward = ( - flashattn_forward - ) - if packed: - transformers.models.mistral.modeling_mistral.MistralDecoderLayer = ( - MistralDecoderLayer - ) - transformers.models.mistral.modeling_mistral.MistralModel.forward = ( - mistral_model_forward - ) +LOG = get_logger(__name__) def patch_mistral_cross_entropy(): @@ -57,602 +18,3 @@ def patch_mistral_cross_entropy(): transformers.models.mistral.modeling_mistral.CrossEntropyLoss = partial( CrossEntropyLoss, inplace_backward=True ) - - -@torch.jit.script -def _make_sliding_window_causal_mask( - bsz: int, - tgt_len: int, - dtype: torch.dtype, - device: torch.device, - past_key_values_length: int = 0, - sliding_window: int = 4096, -): - """ - Make causal mask used for sliding window attention - """ - tensor = torch.full( - (tgt_len, tgt_len), - fill_value=1, - device=device, - ) - mask = torch.tril(tensor, diagonal=0) - # make the mask banded to account for sliding window - # NOTE: HF implementation is wrong as of 14-10-2023 for torch.triu, needs +1 - mask = torch.triu(mask, diagonal=-sliding_window + 1) - mask = torch.log(mask).to(dtype) - - if past_key_values_length > 0: - mask = torch.cat( - [ - torch.zeros( - tgt_len, past_key_values_length, dtype=dtype, device=device - ), - mask, - ], - dim=-1, - ) - return mask[None, None, :, :].expand( - bsz, 1, tgt_len, tgt_len + past_key_values_length - ) - - -# Disable the transformation of the attention mask in LlamaModel as the flash attention -# requires the attention mask to be the same as the key_padding_mask -def _prepare_decoder_attention_mask( - self, - attention_mask, - input_shape, - inputs_embeds, - past_key_values_length, - sliding_window, -): # pylint: disable=unused-argument - # [bsz, seq_len] - if attention_mask is None or sliding_window is None: - return attention_mask - - # NOTE: attention mask and sliding masks are only broadcastable in certain scenarios. - # Without attention_mask.shape[0] == 1, error will trigger after eval loss but only when wandb is enabled. 
- if input_shape[-1] > 1 and attention_mask.shape[0] == 1: - sliding_window_mask = _make_sliding_window_causal_mask( - bsz=input_shape[0], - tgt_len=input_shape[1], - dtype=inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - sliding_window=sliding_window, - ) - attention_mask = attention_mask + sliding_window_mask - else: - LOG.info("skipping sliding window mask, not broadcastable with attention mask") - - return attention_mask - - -def flashattn_forward( - self: OriginalMistralAttention, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - cu_seqlens: Optional[torch.Tensor] = None, - max_seqlen: Optional[torch.Tensor] = None, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view( - bsz, q_len, self.num_heads, self.head_dim - ).transpose(1, 2) - key_states = key_states.view( - bsz, q_len, self.num_key_value_heads, self.head_dim - ).transpose(1, 2) - value_states = value_states.view( - bsz, q_len, self.num_key_value_heads, self.head_dim - ).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, position_ids=position_ids) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin, position_ids - ) - - use_sliding_windows = ( - getattr(self.config, "sliding_window") is not None - and kv_seq_len > self.config.sliding_window - ) - - if use_sliding_windows: - window_size = (self.config.sliding_window, self.config.sliding_window) - else: - window_size = (-1, -1) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - if ( - hasattr(self.config, "sliding_window") - and kv_seq_len > self.config.sliding_window - ): - slicing_tokens = kv_seq_len - self.config.sliding_window - - past_key = past_key_value[0] - past_value = past_key_value[1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key much have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - past_key_value = (past_key, past_value) if use_cache else None - - if past_key_value is not None: - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if self.training: - # during training q,k,v always have same seqlen - assert key_states.shape == query_states.shape - is_causal = True - else: - # turn off FA causal mask after first inference autoregressive iteration - # only on first autoregressive step q,k,v have same seqlen - is_causal = key_states.shape == query_states.shape - - dropout_rate = 0.0 if not 
self.training else getattr(self, "attention_dropout", 0.0) - - if cu_seqlens is not None and max_seqlen is not None and cu_seqlens.dim() == 1: - # special handling using sample packing - qkv = torch.stack( - [query_states, key_states, value_states], dim=2 - ) # [bsz, nh, 3, q_len, hd] - qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd] - qkv = rearrange(qkv, "b s ... -> (b s) ...") - - output = flash_attn_varlen_qkvpacked_func( - qkv, - cu_seqlens, - max_seqlen, - dropout_p=dropout_rate, - softmax_scale=None, - causal=True, - window_size=window_size, - ) - output = rearrange(output, "(b s) ... -> b s ...", b=bsz) - elif query_states.shape == key_states.shape: - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - qkv_unpad, cu_seqlens_q, max_seqlen_q, _, output_pad_fn = generate_qkv( - query_states, - key_states, - value_states, - qkvpacked=True, - # We have disabled _prepare_decoder_attention_mask in LlamaModel - # the attention_mask should be the same as the key_padding_mask - key_padding_mask=attention_mask, - query_padding_mask=( - attention_mask[:, -query_states.size(1) :] - if attention_mask is not None - else None - ), - ) - output_unpad = flash_attn_varlen_qkvpacked_func( - qkv_unpad, - cu_seqlens_q, - max_seqlen_q, - dropout_p=dropout_rate, - softmax_scale=None, - causal=is_causal, - window_size=window_size, - ) - output = output_pad_fn(output_unpad) - else: - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - if attention_mask is None or attention_mask.all().item(): - output = flash_attn_kvpacked_func( - query_states, - torch.stack([key_states, value_states], 2), - dropout_p=dropout_rate, - causal=is_causal, - window_size=window_size, - ) - else: - ( # pylint: disable=unbalanced-tuple-unpacking - q_unpad, - kv_unpad, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - _, - _, - output_pad_fn, - ) = generate_qkv( - query_states, - key_states, - value_states, - kvpacked=True, - key_padding_mask=attention_mask, - query_padding_mask=( - attention_mask[:, -query_states.size(1) :] - if attention_mask is not None - else None - ), - ) - if q_unpad.dtype != kv_unpad.dtype: - kv_unpad = kv_unpad.to(q_unpad.dtype) - output_unpad = flash_attn_varlen_kvpacked_func( - q_unpad, - kv_unpad, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p=dropout_rate, - softmax_scale=None, - causal=is_causal, - window_size=window_size, - ) - output = output_pad_fn(output_unpad) - - attn_output = output - if attn_output.size() != (bsz, q_len, self.num_heads, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, q_len, self.num_heads, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - attn_output = rearrange(attn_output, "b s h d -> b s (h d)") - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -# based on https://github.com/Dao-AILab/flash-attention/blob/364a5b/tests/test_flash_attn.py#L38 -def generate_qkv( - q, - k, - v, - query_padding_mask=None, - key_padding_mask=None, - kvpacked=False, - qkvpacked=False, -): # pylint: disable=invalid-name,unnecessary-lambda-assignment - """ - Arguments: - q: (batch_size, seqlen_q, nheads, d) - k: (batch_size, seqlen_k, nheads_k, d) - v: (batch_size, seqlen_k, nheads_k, d) - query_padding_mask: (batch_size, seqlen), bool - key_padding_mask: 
(batch_size, seqlen), bool - """ - assert not (kvpacked and qkvpacked) - batch_size, seqlen_q, nheads, d = q.shape - _, seqlen_k, nheads_k, _ = k.shape - assert k.shape == (batch_size, seqlen_k, nheads_k, d) - assert v.shape == (batch_size, seqlen_k, nheads_k, d) - - if query_padding_mask is not None: - q_unpad, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input( - q, query_padding_mask - ) - - output_pad_fn = lambda output_unpad: pad_input( # noqa: E731 - output_unpad, indices_q, batch_size, seqlen_q - ) - - else: - q_unpad = rearrange(q, "b s h d -> (b s) h d") - cu_seqlens_q = torch.arange( - 0, - (batch_size + 1) * seqlen_q, - step=seqlen_q, - dtype=torch.int32, - device=q_unpad.device, - ) - max_seqlen_q = seqlen_q - - output_pad_fn = lambda output_unpad: rearrange( # noqa: E731 - output_unpad, "(b s) h d -> b s h d", b=batch_size - ) - - if key_padding_mask is not None: - k_unpad, _, cu_seqlens_k, max_seqlen_k = unpad_input(k, key_padding_mask) - v_unpad, _, _, _ = unpad_input(v, key_padding_mask) - else: - k_unpad = rearrange(k, "b s h d -> (b s) h d") - v_unpad = rearrange(v, "b s h d -> (b s) h d") - cu_seqlens_k = torch.arange( - 0, - (batch_size + 1) * seqlen_k, - step=seqlen_k, - dtype=torch.int32, - device=k_unpad.device, - ) - max_seqlen_k = seqlen_k - - if qkvpacked: - assert nheads == nheads_k - qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1) - qkv = torch.stack([q, k, v], dim=2) - return (qkv_unpad, cu_seqlens_q, max_seqlen_q, qkv, output_pad_fn) - - if kvpacked: - kv_unpad = torch.stack([k_unpad, v_unpad], dim=1) - kv = torch.stack([k, v], dim=2) - return ( - q_unpad, - kv_unpad, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - q, - kv, - output_pad_fn, - ) - - return ( - q_unpad, - k_unpad, - v_unpad, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - q, - k, - v, - output_pad_fn, - ) - - -def mistral_model_forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[ # pylint: disable=unused-argument - torch.LongTensor - ] = None, -) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time" - ) - if input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError( - "You have to specify either decoder_input_ids or decoder_inputs_embeds" - ) - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - 
seq_length_with_past = seq_length_with_past + past_key_values_length - - cu_seqlens = None - max_seqlen = None - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, - seq_length + past_key_values_length, - dtype=torch.long, - device=device, - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - cu_seqlens, max_seqlen = get_cu_seqlens_from_pos_ids(position_ids) - cu_seqlens = cu_seqlens.squeeze() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones( - (batch_size, seq_length_with_past), - dtype=torch.bool, - device=inputs_embeds.device, - ) - attention_mask = ( - self._prepare_decoder_attention_mask( # pylint: disable=protected-access - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - ) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - transformers.logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states,) - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - layer_outputs = ( - self._gradient_checkpointing_func( # pylint: disable=protected-access - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_value, - output_attentions, - None, - cu_seqlens, - max_seqlen, - ) - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cu_seqlens=cu_seqlens, - max_seqlen=max_seqlen, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple( - v - for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None - ) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class MistralDecoderLayer(OriginalMistralDecoderLayer): - """ - patched version of MistralDecoderLayer to pass through the precalculated cu_seqlens - """ - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - cu_seqlens: Optional[torch.Tensor] = None, - max_seqlen: Optional[torch.Tensor] 
= None, - ) -> Tuple[ - torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] - ]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - cu_seqlens (`torch.Tensor`, *optional*) cumulative sequence len when packing - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cu_seqlens=cu_seqlens, - max_seqlen=max_seqlen, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs diff --git a/src/axolotl/monkeypatch/models/__init__.py b/src/axolotl/monkeypatch/models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/axolotl/monkeypatch/models/voxtral/__init__.py b/src/axolotl/monkeypatch/models/voxtral/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/axolotl/monkeypatch/models/voxtral/modeling.py b/src/axolotl/monkeypatch/models/voxtral/modeling.py new file mode 100644 index 000000000..3dd652dd8 --- /dev/null +++ b/src/axolotl/monkeypatch/models/voxtral/modeling.py @@ -0,0 +1,67 @@ +"""Monkeypatch for voxtral to fix leaf node and dtype mismatch""" + +from typing import Optional, Union + +import torch +from transformers.cache_utils import Cache +from transformers.modeling_outputs import CausalLMOutputWithPast + + +def patch_voxtral_conditional_generation_forward(): + from transformers.models.voxtral.modeling_voxtral import ( + VoxtralForConditionalGeneration, + ) + + # Store the original forward method + old_forward = VoxtralForConditionalGeneration.forward + + def _forward( + self, + input_ids: Optional[torch.LongTensor] = None, + input_features: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs, + ) -> CausalLMOutputWithPast: + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if input_features is not None: + audio_embeds = self.get_audio_embeds(input_features) + + # Cast audio_embeds to match 
inputs_embeds dtype + audio_embeds = audio_embeds.to(inputs_embeds.dtype) + + # replace text-audio token placeholders with audio embeddings + audio_token_mask = input_ids == self.config.audio_token_id + + inputs_embeds = inputs_embeds.clone() + inputs_embeds[audio_token_mask] = audio_embeds + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + return outputs + + # Apply the patch + VoxtralForConditionalGeneration.forward = _forward + + def unpatch(): + """Restore the original forward method""" + VoxtralForConditionalGeneration.forward = old_forward + + return unpatch diff --git a/src/axolotl/monkeypatch/multipack.py b/src/axolotl/monkeypatch/multipack.py index 1467f9e29..7df9877d7 100644 --- a/src/axolotl/monkeypatch/multipack.py +++ b/src/axolotl/monkeypatch/multipack.py @@ -35,6 +35,9 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [ "deepseek_v3", "glm", "glm4", + "smollm3", + "gpt_oss", + "arcee", ] @@ -42,6 +45,10 @@ def patch_for_multipack(model_type, model_name=None, has_remote_code=False): if has_remote_code: patch_remote(model_name) elif hasattr(transformers, "modeling_flash_attention_utils"): + # sanity check in case upstream api changes on this + assert hasattr( + transformers.modeling_flash_attention_utils, "_get_unpad_data" + ), "transformers api changed for _get_unpad_data for flash attention" transformers.modeling_flash_attention_utils._get_unpad_data = ( # pylint: disable=protected-access get_unpad_data ) diff --git a/src/axolotl/monkeypatch/peft/utils.py b/src/axolotl/monkeypatch/peft/utils.py index fdc49c5f6..0c571fbd2 100644 --- a/src/axolotl/monkeypatch/peft/utils.py +++ b/src/axolotl/monkeypatch/peft/utils.py @@ -3,14 +3,14 @@ Patch prepare_model_for_kbit_training to not upcast everything """ import inspect -import logging import peft import axolotl from axolotl.monkeypatch.utils import detab_code +from axolotl.utils.logging import get_logger -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) ORIGINAL_PREPARE_CODE = """ for param in model.parameters(): diff --git a/src/axolotl/monkeypatch/relora.py b/src/axolotl/monkeypatch/relora.py index 4a27dde81..0028a0cf6 100644 --- a/src/axolotl/monkeypatch/relora.py +++ b/src/axolotl/monkeypatch/relora.py @@ -2,12 +2,11 @@ import glob import json -import logging import os.path import shutil from functools import partial from pathlib import Path -from typing import Dict, List, Sequence, Union +from typing import Dict, List, Union import bitsandbytes as bnb import peft @@ -15,8 +14,6 @@ import safetensors.torch as st import torch from huggingface_hub import snapshot_download from torch.distributed.optim import ZeroRedundancyOptimizer -from torch.optim.lr_scheduler import LRScheduler -from torch.optim.optimizer import Optimizer from transformers import ( TrainerCallback, TrainerControl, @@ -27,8 +24,9 @@ from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR from axolotl.utils.dict import DictDefault from axolotl.utils.distributed import barrier, is_main_process +from axolotl.utils.logging import get_logger -LOG = logging.getLogger("axolotl.relora") +LOG = get_logger(__name__) @torch.no_grad() @@ -84,7 +82,7 @@ class ReLoRACallback(TrainerCallback): """Callback to merge LoRA weights into the base model and save full-weight checkpoints""" def __init__(self, cfg: DictDefault): - self.relora_steps = 
cfg.relora_steps + self.relora_steps = cfg.jagged_restart_steps self.cpu_offload = cfg.relora_cpu_offload self.quantized = cfg.load_in_4bit or cfg.load_in_8bit self.last_full_model = cfg.base_model @@ -255,51 +253,6 @@ class ReLoRACallback(TrainerCallback): return control -class ReLoRAScheduler(LRScheduler): - """Wraps another scheduler to apply per-lora-restart learning rate warmups.""" - - def __init__( - self, - optimizer: Optimizer, - inner_schedule: LRScheduler, - relora_steps: int, - warmup_steps: int, - anneal_steps: int = 1, - min_lr_scale: float = 0.001, - ) -> None: - self.inner_schedule = inner_schedule - self.relora_steps = relora_steps - self.warmup_steps = warmup_steps - self.anneal_steps = anneal_steps - self.min_lr_scale = min_lr_scale - super().__init__(optimizer, inner_schedule.last_epoch) - - def get_lr(self) -> float: - self.inner_schedule.last_epoch = self.last_epoch - - original = self.inner_schedule.get_lr() - step = self.last_epoch - - if step < self.relora_steps - self.warmup_steps: - scale = 1 - else: - per_relora_progress = step % self.relora_steps - if per_relora_progress < self.warmup_steps: - cycle_t = min(1.0, (per_relora_progress) / self.warmup_steps) - elif per_relora_progress > (self.relora_steps - self.anneal_steps): - cycle_t = min( - 1.0, - (self.relora_steps - per_relora_progress) / self.anneal_steps, - ) - else: - cycle_t = 1 - scale = cycle_t * (1 - self.min_lr_scale) + self.min_lr_scale - - if isinstance(original, Sequence): - return [lr * scale for lr in original] - return original * scale - - def sharded_paths(path: str, module_names: List[str]) -> Dict[str, str]: model_name = "model.safetensors" if not os.path.exists(str(Path(path) / model_name)) and not os.path.exists( diff --git a/src/axolotl/monkeypatch/ring_attn/__init__.py b/src/axolotl/monkeypatch/ring_attn/__init__.py index 5833b9ce4..736378b16 100644 --- a/src/axolotl/monkeypatch/ring_attn/__init__.py +++ b/src/axolotl/monkeypatch/ring_attn/__init__.py @@ -5,18 +5,14 @@ from .patch import ( get_ring_attn_group, - patch_prepare_data_loader, - patch_prepare_device_mesh, - register_ring_attn, + register_ring_attn_from_device_mesh, set_ring_attn_group, update_ring_attn_params, ) __all__ = ( "get_ring_attn_group", - "patch_prepare_data_loader", - "patch_prepare_device_mesh", - "register_ring_attn", + "register_ring_attn_from_device_mesh", "set_ring_attn_group", "update_ring_attn_params", ) diff --git a/src/axolotl/monkeypatch/ring_attn/adapters/batch.py b/src/axolotl/monkeypatch/ring_attn/adapters/batch.py index e556ba5e3..ebed9ebdc 100644 --- a/src/axolotl/monkeypatch/ring_attn/adapters/batch.py +++ b/src/axolotl/monkeypatch/ring_attn/adapters/batch.py @@ -18,10 +18,15 @@ import transformers import transformers.modeling_flash_attention_utils from ring_flash_attn import ring_flash_attn_func from ring_flash_attn.adapters.hf_adapter import check_params -from transformers.modeling_flash_attention_utils import ( - _flash_supports_window_size, - is_flash_attn_greater_or_equal, -) +from transformers.modeling_flash_attention_utils import is_flash_attn_greater_or_equal + +try: + from transformers.modeling_flash_attention_utils import _flash_supports_window +except ImportError: + from transformers.modeling_flash_attention_utils import ( + _flash_supports_window_size as _flash_supports_window, + ) + from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS from axolotl.utils.schemas.enums import RingAttnFunc @@ -33,7 +38,7 @@ RING_ATTN_FUNC_MAPPING = { } -def create_flash_attn_forward( +def 
create_flash_attn_forward_varlen_llama3( process_group: dist.ProcessGroup, ring_attn_func: RingAttnFunc ) -> Callable: """ @@ -71,6 +76,7 @@ def create_flash_attn_forward( max_length_q: int | None = None, max_length_k: int | None = None, target_dtype: torch.dtype | None = None, + attn_implementation: str | None = None, **kwargs, ): """ @@ -97,6 +103,7 @@ def create_flash_attn_forward( max_length_q: Not used in this implementation. max_length_k: Not used in this implementation. target_dtype: Not used in this implementation. + attn_implementation: Not used in this implementation. **kwargs: Additional keyword arguments. Not used in this implementation. Returns: @@ -110,7 +117,7 @@ def create_flash_attn_forward( # Handle sliding window use_sliding_windows = ( - _flash_supports_window_size + _flash_supports_window and sliding_window is not None and key_states.shape[1] > sliding_window ) @@ -161,7 +168,7 @@ def substitute_hf_flash_attn( old_flash_attention_forward = ( transformers.modeling_flash_attention_utils._flash_attention_forward ) - new_flash_attention_forward = create_flash_attn_forward( + new_flash_attention_forward = create_flash_attn_forward_varlen_llama3( process_group=process_group, ring_attn_func=ring_attn_func ) diff --git a/src/axolotl/monkeypatch/ring_attn/patch.py b/src/axolotl/monkeypatch/ring_attn/patch.py index 7d733cfc1..934687a16 100644 --- a/src/axolotl/monkeypatch/ring_attn/patch.py +++ b/src/axolotl/monkeypatch/ring_attn/patch.py @@ -8,51 +8,33 @@ We also provide some patches for accelerate functions to prepare the dataloader sequence parallelism training. """ -import inspect +import os +from typing import Optional -import accelerate import torch import torch.distributed as dist -from accelerate.logging import get_logger +from torch.distributed import DeviceMesh + +try: + from transformers.modeling_flash_attention_utils import _flash_supports_window +except ImportError: + from transformers.modeling_flash_attention_utils import ( + _flash_supports_window_size as _flash_supports_window, + ) from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids +from axolotl.utils.logging import get_logger from axolotl.utils.schemas.enums import RingAttnFunc LOG = get_logger(__name__) - RING_ATTN_GROUP = None -ORIGINAL_PREPARE_DATALOADER_CODE = """ submesh_fsdp_size = 1 - submesh_dp_size = 1 - submesh_tp_size = 1 - if "tp" in torch_device_mesh.mesh_dim_names: - submesh_tp_size = torch_device_mesh["tp"].size() - if "dp" in torch_device_mesh.mesh_dim_names: - submesh_dp_size = torch_device_mesh["dp"].size() - if "fsdp" in torch_device_mesh.mesh_dim_names: - submesh_fsdp_size = torch_device_mesh["fsdp"].size() - process_index = process_index // submesh_tp_size""" - -NEW_PREPARE_DATALOADER_CODE = """ submesh_fsdp_size = 1 - submesh_dp_size = 1 - submesh_tp_size = 1 - submesh_cp_size = 1 - if "cp" in torch_device_mesh.mesh_dim_names: - submesh_cp_size = torch_device_mesh["cp"].size() - if "tp" in torch_device_mesh.mesh_dim_names: - submesh_tp_size = torch_device_mesh["tp"].size() - if "dp" in torch_device_mesh.mesh_dim_names: - submesh_dp_size = torch_device_mesh["dp"].size() - if "fsdp" in torch_device_mesh.mesh_dim_names: - submesh_fsdp_size = torch_device_mesh["fsdp"].size() - process_index = process_index // (submesh_tp_size * submesh_cp_size)""" - def get_ring_attn_group() -> dist.ProcessGroup: """Getter for ring attention group on this rank.""" if RING_ATTN_GROUP is None: - raise RuntimeError("register_ring_attn() not yet called") + raise 
RuntimeError("register_ring_attn_from_device_mesh() not yet called") return RING_ATTN_GROUP @@ -62,15 +44,107 @@ def set_ring_attn_group(ring_attn_group: dist.ProcessGroup | None): RING_ATTN_GROUP = ring_attn_group -def register_ring_attn( - sequence_parallel_degree: int, +def create_ring_flash_attention_forward( + process_group: dist.ProcessGroup, heads_k_stride: int +): + from ring_flash_attn import llama3_flash_attn_varlen_func + from ring_flash_attn.adapters.hf_adapter import DATA_PARAMS + + def _flash_attention_forward_v3( + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + attention_mask: torch.Tensor, # pylint: disable=unused-argument + query_length: int, + is_causal: bool, + dropout: float = 0.0, + position_ids: Optional[torch.Tensor] = None, # pylint: disable=unused-argument + softmax_scale: Optional[float] = None, + sliding_window: Optional[int] = None, + use_top_left_mask: bool = False, + softcap: Optional[float] = None, + deterministic: bool = None, + cu_seq_lens_q: Optional[ + torch.LongTensor + ] = None, # pylint: disable=unused-argument + cu_seq_lens_k: Optional[ + torch.LongTensor + ] = None, # pylint: disable=unused-argument + max_length_q: Optional[int] = None, # pylint: disable=unused-argument + max_length_k: Optional[int] = None, # pylint: disable=unused-argument + target_dtype: Optional[torch.dtype] = None, # pylint: disable=unused-argument + attn_implementation: Optional[str] = None, # pylint: disable=unused-argument + **kwargs, # pylint: disable=unused-argument + ): + # pylint: disable=duplicate-code + if not use_top_left_mask: + causal = is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__. + causal = is_causal and query_length != 1 + + # Assuming 4D tensors, key_states.shape[1] is the key/value sequence length (source length). + use_sliding_windows = ( + _flash_supports_window + and sliding_window is not None + and key_states.shape[1] > sliding_window + ) + flash_kwargs = ( + {"window_size": (sliding_window, sliding_window)} + if use_sliding_windows + else {} + ) + + if deterministic is None: + deterministic = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1" + flash_kwargs["deterministic"] = deterministic + assert ( + softcap is None + ), "llama3_flash_attn_varlen_func does not support softcap yet." + # flash_kwargs["softcap"] = softcap + flash_kwargs["group"] = process_group + + # not sure why attention_mask can be not None... + assert causal, "only causal attention is supported yet." + batch_size = query_states.size(0) + assert batch_size == 1, "varlen data should be processed in advance." 
+ + attn_output = llama3_flash_attn_varlen_func( + query_states.squeeze(dim=0), + key_states.squeeze(dim=0), + value_states.squeeze(dim=0), + cu_seqlens_q=DATA_PARAMS["cu_seqlens_q"], + cu_seqlens_k=DATA_PARAMS["cu_seqlens_k"], + max_seqlen_q=DATA_PARAMS["max_seqlen_q"], + max_seqlen_k=DATA_PARAMS["max_seqlen_k"], + heads_k_stride=heads_k_stride, + local_k_slice=DATA_PARAMS["local_k_slice"], + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + **flash_kwargs, + ) + + attn_output = attn_output.unsqueeze(dim=0) + + return attn_output + + return [ + _flash_attention_forward_v3, + ] + + +def register_ring_attn_from_device_mesh( + device_mesh: "DeviceMesh", + context_parallel_dim: tuple[str, ...], heads_k_stride: int | None, ring_attn_func: RingAttnFunc | None, ): - """Create ring attention group and substitute flash attn with ring flash attn. + """Create ring attention group using DeviceMesh and substitute flash attn with ring flash attn. Args: - sequence_parallel_degree: Sequence parallelism factor. + device_mesh: DeviceMesh object containing the parallelism topology. + context_parallel_dim: Name of the sequence parallel dimension in the device mesh. heads_k_stride: Sequence parallelism K head stride size. Passed through to `varlen_llama3` `ring_flash_attn` implementation. ring_attn_func: `ring_flash_attn` ring attention implemention. If sample @@ -78,49 +152,55 @@ def register_ring_attn( `batch` function. """ rank = dist.get_rank() - world_size = dist.get_world_size() + + LOG.info( + f"Enabling ring attention sequence parallelism using DeviceMesh " + f"dimension '{context_parallel_dim}'", + main_process_only=True, + ) + + # Extract the sequence parallel submesh + try: + sequence_mesh = device_mesh[context_parallel_dim] + except (KeyError, IndexError) as e: + raise ValueError( + f"Dimension '{context_parallel_dim}' not found in device_mesh. 
" + f"Available dimensions: {device_mesh.mesh_dim_names}" + ) from e + + # Get the process group for context parallelism + sequence_pg = sequence_mesh.get_group() + context_parallel_size = sequence_mesh.size() if rank == 0: LOG.info( - "Enabling ring attention sequence parallelism: " - f"each sequence will be processed across {sequence_parallel_degree} GPUs" + f"Sequence parallel degree: {context_parallel_size}, " + f"mesh shape: {sequence_mesh.mesh.shape}" ) - assert sequence_parallel_degree <= world_size, ( - f"sequence_parallel_degree ({sequence_parallel_degree}) " - f"must be less than or equal to world_size ({world_size})" - ) - assert world_size % sequence_parallel_degree == 0, ( - f"sequence_parallel_degree ({sequence_parallel_degree}) " - f"must evenly divide world_size ({world_size})" - ) + # Log which ranks are in the current process group + if sequence_pg != dist.GroupMember.WORLD: + ranks_in_group = dist.get_process_group_ranks(sequence_pg) + LOG.info(f"Current sequence parallel group ranks: {ranks_in_group}") - # Assign ranks to sequence parallel groups - group_assignments = {} - for i in range(world_size // sequence_parallel_degree): - ring_attn_ranks = list( - range( - i * sequence_parallel_degree, - (i + 1) * sequence_parallel_degree, - ) - ) - group = dist.new_group(ranks=ring_attn_ranks, backend="nccl") - - # Track which GPUs are in which groups - for r in ring_attn_ranks: - group_assignments[r] = i - - if rank in ring_attn_ranks: - set_ring_attn_group(group) - - # Log the GPU group assignments - if rank == 0: - LOG.info(f"Sequence parallel group assignments: {group_assignments}") + # Set the ring attention group + set_ring_attn_group(sequence_pg) if ring_attn_func is RingAttnFunc.VARLEN_LLAMA3: - from ring_flash_attn import substitute_hf_flash_attn + # fmt: off + import ring_flash_attn.adapters.hf_adapter - substitute_hf_flash_attn( + from ring_flash_attn.adapters.hf_adapter import ( # isort: skip # pylint: disable=unused-import + create_ring_flash_attention_forward as create_ring_flash_attention_forward_orig, + ) + + create_ring_flash_attention_forward_orig = ( # noqa: F811,F841 + create_ring_flash_attention_forward + ) + ring_flash_attn.adapters.hf_adapter.create_ring_flash_attention_forward = create_ring_flash_attention_forward + # fmt: on + + ring_flash_attn.adapters.hf_adapter.substitute_hf_flash_attn( process_group=get_ring_attn_group(), heads_k_stride=heads_k_stride or 1 ) elif ring_attn_func is RingAttnFunc.BATCH_RING: @@ -147,79 +227,3 @@ def update_ring_attn_params(position_ids: torch.Tensor | None): cu_seqlens, _ = get_cu_seqlens_from_pos_ids(position_ids) cu_seqlens = cu_seqlens.squeeze().to(device=torch.cuda.current_device()) update_ring_flash_attn_params(cu_seqlens, get_ring_attn_group()) - - -def patch_prepare_data_loader(): - """Patch `accelerate.data_loader.prepare_data_loader` to respect the SP degree. - - Raies: - RuntimeError: If source code to patch does not exist. - """ - original_fn = accelerate.data_loader.prepare_data_loader - original_source = inspect.getsource(original_fn) - - if ORIGINAL_PREPARE_DATALOADER_CODE not in original_source: - raise RuntimeError( - "SP patch failed - target snippet not found. " - "Check accelerate's version or update the patch." 
- ) - - patched_source = original_source.replace( - ORIGINAL_PREPARE_DATALOADER_CODE, NEW_PREPARE_DATALOADER_CODE - ) - - # Create a new function from the patched source - namespace = {} - exec( # pylint: disable=exec-used # nosec B102 - patched_source, accelerate.data_loader.__dict__, namespace - ) - patched_function = namespace["prepare_data_loader"] - - accelerate.data_loader.prepare_data_loader = patched_function - LOG.info("Patched accelerate.data_loader.prepare_data_loader for SP support") - - -def patch_prepare_device_mesh(sequence_parallel_degree: int): - """Patches the `Accelerator._prepare_device_mesh` method to create a device mesh - that includes sequence parallelism with the specified degree. - - Args: - sequence_parallel_degree (int): The degree of sequence parallelism to use. - """ - - def _prepare_device_mesh(self): - """Prepare the device mesh for distributed training. The dataloader will - determine how to load data based on the device mesh. - """ - if self.state.torch_tp_plugin: - return self.state.torch_tp_plugin.torch_device_mesh - if ( - self.distributed_type == accelerate.accelerator.DistributedType.DEEPSPEED - and hasattr(self.state, "ds_device_mesh") - ): - return self.state.ds_device_mesh - - # Create device mesh with sequence parallelism - world_size = dist.get_world_size() - mesh_shape = ( - world_size // sequence_parallel_degree, - sequence_parallel_degree, - ) - device_ids = list(range(world_size)) - - # Note that we use "cp" instead of "sp" to match the PyTorch native "context - # parallelism" implementation naming - return dist.DeviceMesh( - "cuda", - torch.tensor(device_ids).reshape(mesh_shape), - mesh_dim_names=("dp", "cp"), - ) - - # Replace the original method with our new method - # pylint: disable=protected-access - accelerate.accelerator.Accelerator._prepare_device_mesh = _prepare_device_mesh - - LOG.info( - "Successfully patched Accelerator._prepare_device_mesh " - f"with sequence_parallel_degree={sequence_parallel_degree}" - ) diff --git a/src/axolotl/monkeypatch/stablelm_attn_hijack_flash.py b/src/axolotl/monkeypatch/stablelm_attn_hijack_flash.py index c60302111..85454fe2e 100644 --- a/src/axolotl/monkeypatch/stablelm_attn_hijack_flash.py +++ b/src/axolotl/monkeypatch/stablelm_attn_hijack_flash.py @@ -32,11 +32,11 @@ from flash_attn.flash_attn_interface import ( # pylint: disable=ungrouped-impor from torch import nn from transformers import AutoConfig, AutoModelForCausalLM from transformers.modeling_outputs import BaseModelOutputWithPast -from transformers.utils import logging from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids +from axolotl.utils.logging import get_logger -logger = logging.get_logger(__name__) +logger = get_logger(__name__) def replace_stablelm_attn_with_flash_attn(model_name="stabilityai/stablelm-3b-4e1t"): diff --git a/src/axolotl/monkeypatch/tiled_mlp/__init__.py b/src/axolotl/monkeypatch/tiled_mlp/__init__.py new file mode 100644 index 000000000..4ea154991 --- /dev/null +++ b/src/axolotl/monkeypatch/tiled_mlp/__init__.py @@ -0,0 +1,11 @@ +""" +TiledMLP monkey patches +""" + +from .patch import ( + patch_tiled_mlp, +) + +__all__ = [ + "patch_tiled_mlp", +] diff --git a/src/axolotl/monkeypatch/tiled_mlp/base.py b/src/axolotl/monkeypatch/tiled_mlp/base.py new file mode 100644 index 000000000..3b7326bdb --- /dev/null +++ b/src/axolotl/monkeypatch/tiled_mlp/base.py @@ -0,0 +1,153 @@ +""" +TiledMLP support for DDP, FSDP, and single GPU +""" + +import threading +from typing import List + +import torch + + +class 
TiledMLP(torch.autograd.Function): + """ + TiledMLP implementation using gradient hooks + """ + + @staticmethod + def forward( + ctx, + fn, + self, + x, + shards, + compute_params, + ) -> torch.Tensor: + ctx.fn = fn + ctx.self = self + ctx.shards = shards + ctx.compute_params = [p for p in compute_params if p.requires_grad] + ctx.save_for_backward(x) + + x_shards = list(torch.chunk(x, chunks=shards, dim=1)) + with torch.no_grad(): + output_shards = [fn(self, x_shard) for x_shard in x_shards] + output_unsharded = torch.cat(output_shards, dim=1) + + return output_unsharded + + @staticmethod + def backward(ctx, *grads) -> torch.Tensor: + fn = ctx.fn + (x,) = ctx.saved_tensors + self = ctx.self + shards = ctx.shards + compute_params = ctx.compute_params + + x_requires_grad = x.requires_grad + x = x.detach() + x.requires_grad_(x_requires_grad) + + incoming_grad = grads[0] + x_grad = torch.zeros_like(x) + x_shards = list(torch.chunk(x, chunks=shards, dim=1)) + + # Create a gradient accumulator for parameters + grad_accumulator = GradientAccumulator(compute_params, shards, dtype=x.dtype) + + shard_step = x_shards[0].numel() + for i, x_shard in enumerate(x_shards): + x_shard.requires_grad_(x_requires_grad) + + shard_offset = i * shard_step + x_shard.grad = ( + x_grad.view(-1) + .narrow(0, shard_offset, x_shard.numel()) + .view_as(x_shard) + ) + incoming_grad_shard = ( + incoming_grad.view(-1) + .narrow(0, shard_offset, x_shard.numel()) + .view_as(x_shard) + ) + + # Install hooks for this shard + is_last_shard = i + 1 == shards + grad_accumulator.install_hooks(is_last_shard) + + with torch.enable_grad(): + output = fn(self, x_shard) + torch.autograd.backward(output, incoming_grad_shard) + + # Clean up hooks + grad_accumulator.cleanup() + del grad_accumulator + + return (None, None, x_grad, None, None) + + +class GradientAccumulator: + """ + Manual gradient accumulator for TiledMLP with configurable precision + Accumulates in specified dtype and rescales the gradient at the end + """ + + def __init__( + self, + params: List[torch.nn.Parameter], + total_shards: int, + dtype: torch.dtype | None = None, + ): + self.params = params + self.total_shards = total_shards + self.grad_accumulation_dtype = dtype or torch.float32 + self.accumulated_grads = {} + self.hooks = [] + self.lock = threading.Lock() + self.gradient_scale = 1.0 / total_shards + + # Initialize accumulated gradients in the specified dtype + for param in self.params: + if param.grad is not None: + self.accumulated_grads[param] = param.grad.to( + self.grad_accumulation_dtype + ) + param.grad = None + else: + self.accumulated_grads[param] = torch.zeros_like( + param, dtype=self.grad_accumulation_dtype + ) + + def install_hooks(self, is_last_shard: bool): + """Install gradient hooks that accumulate gradients in higher precision""" + + def create_hook(param): + def hook(grad): + with self.lock: + grad_to_accum_dtype = grad.to(self.grad_accumulation_dtype) + scaled_grad = grad_to_accum_dtype * self.gradient_scale + + if param in self.accumulated_grads: + self.accumulated_grads[param] += scaled_grad + else: + self.accumulated_grads[param] = scaled_grad.clone() + + # Only assign the averaged gradient on the last shard + if is_last_shard: + param.grad = self.accumulated_grads[param].to(param.dtype) + return param.grad + return None + + return hook + + # Install hooks on all parameters + for param in self.params: + if param.requires_grad: + hook = param.register_hook(create_hook(param)) + self.hooks.append(hook) + + def cleanup(self): + """Remove all 
installed hooks""" + for hook in self.hooks: + hook.remove() + self.hooks.clear() + del self.accumulated_grads diff --git a/src/axolotl/monkeypatch/tiled_mlp/patch.py b/src/axolotl/monkeypatch/tiled_mlp/patch.py new file mode 100644 index 000000000..419c73104 --- /dev/null +++ b/src/axolotl/monkeypatch/tiled_mlp/patch.py @@ -0,0 +1,92 @@ +"""Monkeypatch for Tiled MLP implementation""" + +import math +import os + +import torch +import torch.distributed as dist + +from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) + + +def patch_tiled_mlp(model_type, use_original_mlp=True, cfg_num_shards=None): + from deepspeed.runtime.sequence_parallel.ulysses_sp import ( + TiledMLP as DeepSpeedTiledMLP, + ) + + from axolotl.monkeypatch.tiled_mlp.base import TiledMLP + + try: + # Dynamically import the module and MLP class + module_path = f"transformers.models.{model_type}.modeling_{model_type}" + model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type) + module = __import__(module_path, fromlist=[f"{model_cls_prefix}MLP"]) + mlp_cls = getattr(module, f"{model_cls_prefix}MLP") + + if use_original_mlp: + mlp_forward = mlp_cls.forward + else: + + def generic_mlp_forward(self_, hs): + return self_.down_proj( + self_.act_fn(self_.gate_proj(hs)) * self_.up_proj(hs) + ) + + mlp_forward = torch.compile(generic_mlp_forward) + + is_distributed = int(os.environ.get("WORLD_SIZE", 1)) > 1 + + def tiled_mlp_forward(self, x): + # pylint: disable=protected-access + input_shape = x.shape + seqlen = input_shape[-2] + hidden = input_shape[-1] + if cfg_num_shards is None: + num_shards = math.ceil(seqlen / hidden) + if is_distributed: + num_shards_tensor = torch.tensor(num_shards, device=x.device) + dist.all_reduce(num_shards_tensor, op=dist.ReduceOp.MAX) + num_shards = num_shards_tensor.item() + else: + num_shards = cfg_num_shards + + if not self._compute_params: + self._compute_params = [p for p in self.parameters() if p.requires_grad] + + compute_params = self._compute_params + if not self._tiled_mlp_dist_impl: + if ( + self._compute_params + and any( + hasattr(p, "ds_id") or hasattr(p, "param_idx_in_group") + for p in self._compute_params + ) + ) or os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true": + self._tiled_mlp_dist_impl = DeepSpeedTiledMLP + else: + self._tiled_mlp_dist_impl = TiledMLP + + down_res = self._tiled_mlp_dist_impl.apply( + mlp_forward, + self, + x, + num_shards, + compute_params, + ) + return down_res + + mlp_cls.forward = tiled_mlp_forward + mlp_cls._compute_params = [] # pylint: disable=protected-access + mlp_cls._tiled_mlp_dist_impl = None # pylint: disable=protected-access + LOG.info( + f"Successfully monkey-patched TiledMLP for model_type: {model_type}", + main_process_only=True, + ) + except (ImportError, AttributeError) as e: + raise RuntimeError( + f"Could not import MLP class for model_type: {model_type}. 
" + f"Error: {str(e)}" + ) from e diff --git a/src/axolotl/monkeypatch/trainer/lr.py b/src/axolotl/monkeypatch/trainer/lr.py index 0176093d6..9afc23c46 100644 --- a/src/axolotl/monkeypatch/trainer/lr.py +++ b/src/axolotl/monkeypatch/trainer/lr.py @@ -2,11 +2,11 @@ monkeypatch for Trainer _get_learning_rate method """ -import logging - import torch -LOG = logging.getLogger(__name__) +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) # TODO remove this patch once https://github.com/huggingface/transformers/pull/37881 is included in a release diff --git a/src/axolotl/monkeypatch/trainer/trl.py b/src/axolotl/monkeypatch/trainer/trl.py new file mode 100644 index 000000000..bca9f92de --- /dev/null +++ b/src/axolotl/monkeypatch/trainer/trl.py @@ -0,0 +1,13 @@ +"""Monkeypatch for TRL trainer FSDP preparation.""" + + +def prepare_fsdp(model, accelerator): + from axolotl.monkeypatch.accelerate.fsdp2 import fsdp2_prepare_model + + return fsdp2_prepare_model(accelerator, model) + + +def patch_trl_prepare_fsdp2(): + import trl.models.utils + + trl.models.utils.prepare_fsdp = prepare_fsdp diff --git a/src/axolotl/monkeypatch/trainer_accelerator_args.py b/src/axolotl/monkeypatch/trainer_accelerator_args.py index d87812c9f..819a66255 100644 --- a/src/axolotl/monkeypatch/trainer_accelerator_args.py +++ b/src/axolotl/monkeypatch/trainer_accelerator_args.py @@ -3,13 +3,13 @@ allow adding additional kwargs to Accelerator init """ import inspect -import logging from transformers import Trainer from axolotl.monkeypatch.utils import detab_code +from axolotl.utils.logging import get_logger -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) ORIGINAL_TRAINER_CODE = """ # create accelerator object @@ -18,7 +18,7 @@ ORIGINAL_TRAINER_CODE = """ PATCHED_TRAINER_CODE = """ if hasattr(self, "additional_accelerator_args"): - additional_args = self.additional_accelerator_args(fp8=True, **args) + additional_args = self.additional_accelerator_args(fp8=True, enable_fsdp_float8_all_gather={enable_fsdp_float8_all_gather}, **args) if additional_args: args.update(additional_args) @@ -38,9 +38,9 @@ def check_create_accelerate_code_is_patchable() -> bool: return ORIGINAL_TRAINER_CODE in create_code -def patch_create_accelerate_code_for_fp8(): +def patch_create_accelerate_code_for_fp8(enable_fsdp_float8_all_gather: bool): """ - monkeypatch create_accelerator_and_postprocess so it checks for additional kwargs + Monkeypatch create_accelerator_and_postprocess so it checks for additional kwargs. 
""" try: @@ -54,7 +54,10 @@ def patch_create_accelerate_code_for_fp8(): if ORIGINAL_TRAINER_CODE not in create_code: return - create_code = create_code.replace(ORIGINAL_TRAINER_CODE, PATCHED_TRAINER_CODE) + patched_trainer_code = PATCHED_TRAINER_CODE.format( + enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather + ) + create_code = create_code.replace(ORIGINAL_TRAINER_CODE, patched_trainer_code) create_code = create_code.replace( "def create_accelerator_and_postprocess(", "def fixed_create_accelerator_and_postprocess(", diff --git a/src/axolotl/monkeypatch/trainer_eval_guard.py b/src/axolotl/monkeypatch/trainer_eval_guard.py deleted file mode 100644 index e929ac766..000000000 --- a/src/axolotl/monkeypatch/trainer_eval_guard.py +++ /dev/null @@ -1,78 +0,0 @@ -""" -fix for FSDP2 evals when using torch.compile -""" - -import inspect -import logging - -from transformers import Trainer - -from axolotl.monkeypatch.utils import detab_code - -LOG = logging.getLogger(__name__) - -ORIGINAL_TRAINER_CODE = """ - model.eval() -""" - -PATCHED_TRAINER_CODE = """ - if hasattr(model, "eval") and callable(model.eval): - self.model.eval() -""" - - -def get_evaluation_loop_code() -> str: - training_loop = inspect.getsource(Trainer.evaluation_loop) - return training_loop - - -def check_evaluation_loop_is_patchable() -> bool: - eval_loop = get_evaluation_loop_code() - eval_loop, _ = detab_code(eval_loop) - return ORIGINAL_TRAINER_CODE in eval_loop - - -def patch_evaluation_loop_for_fsdp2(): - """ - monkeypatch for fixing the eval loop for fsdp2 with torch.compile - """ - - try: - evaluation_loop = get_evaluation_loop_code() - except OSError: - return - Trainer._original_evaluation_loop = ( # pylint: disable=protected-access - evaluation_loop - ) - evaluation_loop, _ = detab_code(evaluation_loop) - if ORIGINAL_TRAINER_CODE not in evaluation_loop: - return - - evaluation_loop = evaluation_loop.replace( - ORIGINAL_TRAINER_CODE, PATCHED_TRAINER_CODE - ) - evaluation_loop = evaluation_loop.replace( - "def evaluation_loop(", - "def _fixed_evaluation_loop(", - 1, - ) - - # load imports necessary - import transformers.trainer - - items_to_import = [] - for item in dir(transformers.trainer): - if item in evaluation_loop: - items_to_import.append(item) - - exec( # pylint: disable=exec-used # nosec B102 - "from transformers.trainer import (" - + ", ".join(x for x in items_to_import) - + ")", - globals(), - ) - exec(evaluation_loop, globals()) # pylint: disable=exec-used # nosec B102 - LOG.info("patching _inner_training_loop for fsdp optimizer save") - Trainer.evaluation_loop = ( # pylint: disable=protected-access - _fixed_evaluation_loop # pylint: disable=undefined-variable # noqa: F821 - ) diff --git a/src/axolotl/monkeypatch/trainer_fsdp_optim.py b/src/axolotl/monkeypatch/trainer_fsdp_optim.py index 1cbfefa5b..1c2511524 100644 --- a/src/axolotl/monkeypatch/trainer_fsdp_optim.py +++ b/src/axolotl/monkeypatch/trainer_fsdp_optim.py @@ -3,24 +3,22 @@ fix for FSDP optimizer save in trainer w 4.47.0 """ import inspect -import logging from transformers import Trainer from axolotl.monkeypatch.utils import detab_code +from axolotl.utils.logging import get_logger -LOG = logging.getLogger("axolotl.monkeypatch.trainer_fsdp_save") +LOG = get_logger(__name__) ORIGINAL_TRAINER_CODE = """ - - delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled - + if delay_optimizer_creation: + self.optimizer = self.accelerator.prepare(self.optimizer) """ PATCHED_TRAINER_CODE = """ - - delay_optimizer_creation = 
is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled - + if delay_optimizer_creation: + model = self.accelerator.prepare(self.model) """ diff --git a/src/axolotl/monkeypatch/transformers/__init__.py b/src/axolotl/monkeypatch/transformers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py b/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py new file mode 100644 index 000000000..75f4158b3 --- /dev/null +++ b/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py @@ -0,0 +1,165 @@ +""" +Module for patching transformers Trainer loss calculation to use nanmean. + +This is needed for context parallelism since chunks of the input sequences may be fully +masked and return NaNs in the loss calculation. + +Also includes a patch for FSDP2 + torch.compile. We need to bundle this together with +the other evaluation_loop patch because we can't patch the same code twice without +raising an OSError. +""" + +import importlib +import inspect + +from transformers import Trainer + +from axolotl.monkeypatch.utils import detab_code +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) + +ORIGINAL_EVAL_CODE = { + "list": 'metrics[f"{metric_key_prefix}_loss"] = np.concatenate(all_losses).mean().item()', + "array": 'metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item()', +} +PATCHED_EVAL_CODE = { + "list": 'metrics[f"{metric_key_prefix}_loss"] = np.nanmean(np.concatenate(all_losses)).item()', + "array": 'metrics[f"{metric_key_prefix}_loss"] = np.nanmean(all_losses).item()', +} + +ORIGINAL_FSDP2_CODE = """ + model.eval() +""" + +PATCHED_FSDP2_CODE = """ + if hasattr(model, "eval") and callable(model.eval): + self.model.eval() +""" + +ORIGINAL_MAYBE_CODE = "tr_loss_scalar = self._nested_gather(tr_loss).mean().item()" +PATCHED_MAYBE_CODE = "tr_loss_scalar = self._nested_gather(tr_loss).nanmean().item()" + + +def check_evaluation_loop_is_patchable() -> bool: + evaluation_loop_source = inspect.getsource(Trainer.evaluation_loop) + return all(value in evaluation_loop_source for value in ORIGINAL_EVAL_CODE.values()) + + +def check_evaluation_loop_is_fsdp2_patchable() -> bool: + evaluation_loop_source = inspect.getsource(Trainer.evaluation_loop) + evaluation_loop_source, _ = detab_code(evaluation_loop_source) + return ORIGINAL_FSDP2_CODE in evaluation_loop_source + + +# pylint: disable=protected-access +def patch_evaluation_loop(patch_fsdp2: bool): + """Patch the evaluation_loop method.""" + # Check if already patched + if hasattr(Trainer, "_original_evaluation_loop"): + LOG.info("Trainer.evaluation_loop already patched") + return + + # Check if the patterns exist + try: + evaluation_loop_source = inspect.getsource(Trainer.evaluation_loop) + except OSError: + return + Trainer.evaluation = evaluation_loop_source + evaluation_loop_source, _ = detab_code(evaluation_loop_source) + + # Apply the nanmean patches + evaluation_loop_source = evaluation_loop_source.replace( + ORIGINAL_EVAL_CODE["list"], PATCHED_EVAL_CODE["list"] + ) + evaluation_loop_source = evaluation_loop_source.replace( + ORIGINAL_EVAL_CODE["array"], PATCHED_EVAL_CODE["array"] + ) + + # Apply FSDP2 eval guard patch if needed + if patch_fsdp2 and ORIGINAL_FSDP2_CODE in evaluation_loop_source: + evaluation_loop_source = evaluation_loop_source.replace( + ORIGINAL_FSDP2_CODE, PATCHED_FSDP2_CODE + ) + LOG.info("Applied FSDP2 eval guard patch to evaluation_loop") + + # Rename the function to avoid conflicts + 
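    # Illustrative aside, not executed as part of the patch: why nanmean matters here.
    # With context parallelism a rank's shard of the sequence can be fully label-masked,
    # so its loss comes back as NaN; a plain mean would poison the aggregate, while
    # nanmean simply skips those entries.
    #
    #   import numpy as np
    #   per_shard_losses = np.array([0.71, np.nan, 0.64, 0.69])
    #   np.mean(per_shard_losses)     # nan
    #   np.nanmean(per_shard_losses)  # ~0.68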
evaluation_loop_source = evaluation_loop_source.replace( + "def evaluation_loop(", + "def axolotl_evaluation_loop(", + 1, + ) + + # Get the module for necessary imports + module_name = Trainer.__module__ + module = importlib.import_module(module_name) + + # Import necessary items from the module + items_to_import = [] + for item in dir(module): + if item in evaluation_loop_source: + items_to_import.append(item) + + # Execute the imports and patched method + exec( # pylint: disable=exec-used # nosec B102 + f"from {module_name} import ({', '.join(items_to_import)})", + globals(), + ) + exec(evaluation_loop_source, globals()) # pylint: disable=exec-used # nosec B102 + + LOG.info("Patched Trainer.evaluation_loop with nanmean loss calculation") + Trainer.evaluation_loop = ( + axolotl_evaluation_loop # pylint: disable=undefined-variable # noqa: F821 + ) + + +def check_maybe_log_save_evaluate_is_patchable() -> bool: + maybe_log_source = inspect.getsource(Trainer._maybe_log_save_evaluate) + return ORIGINAL_MAYBE_CODE in maybe_log_source + + +# pylint: disable=protected-access +def patch_maybe_log_save_evaluate(): + """Patch the _maybe_log_save_evaluate method.""" + # Check if already patched + if hasattr(Trainer, "_original_maybe_log_save_evaluate"): + LOG.info("Trainer._maybe_log_save_evaluate already patched") + return + + # Check if the patterns exist + try: + maybe_log_source = inspect.getsource(Trainer._maybe_log_save_evaluate) + except OSError: + return + Trainer._original_maybe_log_save_evaluate = maybe_log_source + maybe_log_source, _ = detab_code(maybe_log_source) + + # Apply the patch + maybe_log_source = maybe_log_source.replace(ORIGINAL_MAYBE_CODE, PATCHED_MAYBE_CODE) + + # Rename the function to avoid conflicts + maybe_log_source = maybe_log_source.replace( + "def _maybe_log_save_evaluate(", + "def axolotl_maybe_log_save_evaluate(", + 1, + ) + + # Get the module for necessary imports + module_name = Trainer.__module__ + module = importlib.import_module(module_name) + + # Import necessary items from the module + items_to_import = [] + for item in dir(module): + if item in maybe_log_source: + items_to_import.append(item) + + # Execute the imports and patched method + exec( # pylint: disable=exec-used # nosec B102 + f"from {module_name} import ({', '.join(items_to_import)})", + globals(), + ) + exec(maybe_log_source, globals()) # pylint: disable=exec-used # nosec B102 + + LOG.info("Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation") + Trainer._maybe_log_save_evaluate = axolotl_maybe_log_save_evaluate # pylint: disable=undefined-variable # noqa: F821 diff --git a/src/axolotl/monkeypatch/transformers_fa_utils.py b/src/axolotl/monkeypatch/transformers_fa_utils.py index f34ecb8c0..e372dc3f8 100644 --- a/src/axolotl/monkeypatch/transformers_fa_utils.py +++ b/src/axolotl/monkeypatch/transformers_fa_utils.py @@ -2,13 +2,14 @@ see https://github.com/huggingface/transformers/pull/35834 """ -import logging from functools import partial from typing import Optional import torch -logger = logging.getLogger(__name__) +from axolotl.utils.logging import get_logger + +logger = get_logger(__name__) def fixed_fa_peft_integration_check( diff --git a/src/axolotl/monkeypatch/unsloth_.py b/src/axolotl/monkeypatch/unsloth_.py index c81bacbfc..146047e95 100644 --- a/src/axolotl/monkeypatch/unsloth_.py +++ b/src/axolotl/monkeypatch/unsloth_.py @@ -4,14 +4,14 @@ import inspect import types import torch -from accelerate.logging import get_logger from peft import PeftModelForCausalLM from torch 
import nn from transformers.models.llama.modeling_llama import LlamaFlashAttention2 from axolotl.monkeypatch.utils import detab_code +from axolotl.utils.logging import get_logger -LOG = get_logger("axolotl.monkeypatch.unsloth") +LOG = get_logger(__name__) ORIGINAL_QKV_CODE = """ query_states = self.q_proj(hidden_states) @@ -133,7 +133,7 @@ def patch_self_attn_lora(): ) exec(self_attn_forward, globals()) # pylint: disable=exec-used # nosec B102 self_attn_lora_patched = True - LOG.info("patching unsloth attn lora", main_process_only=True) + LOG.info("patching unsloth attn lora") LlamaFlashAttention2.forward = ( unsloth_attn_forward # pylint: disable=undefined-variable # noqa: F821 ) @@ -153,7 +153,7 @@ def integrate_rope_embeddings(): ): return fast_rope_embedding(q, k, cos, sin) - LOG.info("patching unsloth RoPE embeddings", main_process_only=True) + LOG.info("patching unsloth RoPE embeddings") transformers.models.llama.modeling_llama.apply_rotary_pos_emb = apply_rotary_pos_emb @@ -189,7 +189,7 @@ def integrate_lora_mlp_patch(peft_model: PeftModelForCausalLM): if is_mlp_lora and mlp_no_bias and mlp_not_dora: layer.mlp.forward = types.MethodType(apply_lora_mlp, layer.mlp) else: - LOG.warning("unable to apply unsloth lora mlp patch to layer %d", idx) + LOG.warning(f"unable to apply unsloth lora mlp patch to layer {idx}") def integrate_lora_patch(peft_model: PeftModelForCausalLM, cfg): @@ -215,7 +215,7 @@ def integrate_lora_patch(peft_model: PeftModelForCausalLM, cfg): layer.self_attn.apply_qkv = apply_lora_qkv else: layer.self_attn.apply_qkv = original_apply_qkv - LOG.warning("unable to apply unsloth lora qkv patch to layer %d", idx) + LOG.warning(f"unable to apply unsloth lora qkv patch to layer {idx}") if cfg.unsloth_lora_o: layer_modules = [ getattr(layer.self_attn, linear_proj) for linear_proj in ["o_proj"] @@ -234,9 +234,7 @@ def integrate_lora_patch(peft_model: PeftModelForCausalLM, cfg): layer.self_attn.apply_o = apply_lora_o else: layer.self_attn.apply_o = original_apply_o - LOG.warning( - "unable to apply unsloth lora o_proj patch to layer %d", idx - ) + LOG.warning(f"unable to apply unsloth lora o_proj patch to layer {idx}") def patch_unsloth_layernorm(): diff --git a/src/axolotl/processing_strategies.py b/src/axolotl/processing_strategies.py index 4dee4f8a2..4cc5e85a1 100644 --- a/src/axolotl/processing_strategies.py +++ b/src/axolotl/processing_strategies.py @@ -5,10 +5,15 @@ from typing import Optional from PIL import Image, ImageOps from PIL.Image import Resampling -from torch import Tensor -from transformers import ProcessorMixin +from torch import Tensor, zeros_like +from transformers import ProcessorMixin, VoxtralProcessor from transformers.image_utils import load_image +from axolotl.utils.dict import remove_none_values +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) + class ProcessingStrategy: """Base Processing Strategy class""" @@ -112,7 +117,9 @@ class ProcessingStrategy: ) processed_example = None - if "messages" in example: # OpenAI format + if ( + "messages" in example and example["messages"] is not None + ): # OpenAI format processed_example = example else: # Legacy format processed_example = convert_legacy_format(example) @@ -132,10 +139,17 @@ class ProcessingStrategy: break # if the image key exists, add the image to the first message - if image_key is not None: + if image_key is not None and processed_example[image_key] is not None: # TODO: check if it's normal to be single image only for common datasets # From observation, it's usually 
a list of single image but some datasets may have several columns for images # Temporary solution: take the first image and suggest people convert their datasets to use multi-content Messages + if len(processed_example[image_key]) > 1: + LOG.warning( + f"Found {len(processed_example[image_key])} images in a sample. Using the first one." + "If you are using a dataset with multiple images per sample, please convert it to use multi-content Messages." + "See https://docs.axolotl.ai/docs/multimodal.html#dataset-format" + ) + image_value = processed_example[image_key][0] # Handle image loading (Image, url, path, base64) @@ -191,13 +205,22 @@ class ProcessingStrategy: } ) - processed_examples.append(processed_example) + processed_examples.append(remove_none_values(processed_example)) return processed_examples + def _mask_non_assistant(self, labels: Tensor) -> Tensor: + """ + Mask non assistant regions to -100. + To be implemented per subclass. + """ + return labels + def process_labels(self, input_ids: Tensor) -> Tensor: labels = input_ids.clone() + labels = self._mask_non_assistant(labels) + # The labels are the input_ids, and we mask the padding tokens in the loss computation labels[labels == self.processor.tokenizer.pad_token_id] = -100 @@ -251,6 +274,127 @@ class Gemma3ProcessingStrategy(ProcessingStrategy): return labels +class Gemma3nProcessingStrategy(ProcessingStrategy): + """Processing Strategy class for Gemma3n""" + + def _mask_non_assistant(self, labels: Tensor) -> Tensor: + def _find_token_sequence(label, start_pos, token_sequence): + """Check if token_sequence appears at start_pos in label""" + if start_pos + len(token_sequence) > len(label): + return False + if label[start_pos] != token_sequence[0]: + return False + return ( + label[start_pos : start_pos + len(token_sequence)].tolist() + == token_sequence + ) + + def _find_assistant_end(label, start_pos, assistant_end_tok, mask, i): + """ + Find the end of assistant response and update mask accordingly + + Returns new position to continue from and whether the end seq is found + """ + k = start_pos + while k < len(label): + if not _find_token_sequence(label, k, assistant_end_tok): + mask[i][k] = 1 + k += 1 + continue + + return k + len(assistant_end_tok), True + + return k, False + + mask = zeros_like(labels) + + assistant_start_str = "model" + assistant_end_str = "" + include_assistant_start_tok = False + include_assistant_end_tok = True + + # str to tokens + assistant_start_tok = self.processor.tokenizer.encode( + assistant_start_str, add_special_tokens=False + ) + assistant_end_tok = self.processor.tokenizer.encode( + assistant_end_str, add_special_tokens=False + ) + + for i, label in enumerate(labels): + j = 0 + # while loop through each tok index in labels[i] + while j < len(label): + # Check until match start seq + if not _find_token_sequence(label, j, assistant_start_tok): + j += 1 + continue + + if include_assistant_start_tok: + mask[i][j : j + len(assistant_start_tok)] = 1 + + # Find where the assistant response ends + start_of_content = j + len(assistant_start_tok) + end_pos, found_end_seq = _find_assistant_end( + label, start_of_content, assistant_end_tok, mask, i + ) + + # Include end token if requested + if include_assistant_end_tok and found_end_seq: + mask[i][end_pos - len(assistant_end_tok) : end_pos] = 1 + + j = end_pos + + labels[i][mask[i] == 0] = -100 + + return labels + + def process_labels(self, input_ids): + labels = input_ids.clone() + labels = self._mask_non_assistant(labels) + + # Follows 
https://colab.research.google.com/github/huggingface/huggingface-gemma-recipes/blob/main/notebooks/fine_tune_gemma3n_on_t4.ipynb + labels[labels == self.processor.tokenizer.pad_token_id] = -100 + if hasattr(self.processor.tokenizer, "image_token_id"): + labels[labels == self.processor.tokenizer.image_token_id] = -100 + if hasattr(self.processor.tokenizer, "audio_token_id"): + labels[labels == self.processor.tokenizer.audio_token_id] = -100 + if hasattr(self.processor.tokenizer, "boi_token_id"): + labels[labels == self.processor.tokenizer.boi_token_id] = -100 + if hasattr(self.processor.tokenizer, "eoi_token_id"): + labels[labels == self.processor.tokenizer.eoi_token_id] = -100 + + return labels + + +class VoxtralProcessingStrategy(ProcessingStrategy): + """Processing Strategy class for Voxtral""" + + def __init__( + self, + processor: VoxtralProcessor, + chat_template: Optional[str] = None, + image_size: int | tuple[int, int] | None = None, + image_resize_algorithm: Resampling | None = None, + ): + super().__init__(processor, chat_template, image_size, image_resize_algorithm) + special_ids = ( + processor.tokenizer.tokenizer.instruct_tokenizer.audio_encoder.special_ids + ) + + self.audio_token = special_ids.audio + self.begin_audio_token = special_ids.begin_audio + + def process_labels(self, input_ids): + labels = input_ids.clone() + + labels[labels == self.processor.tokenizer.pad_token_id] = -100 + labels[labels == self.audio_token] = -100 + labels[labels == self.begin_audio_token] = -100 + + return labels + + def get_processing_strategy( processor: ProcessorMixin, chat_template, @@ -266,6 +410,10 @@ def get_processing_strategy( return Gemma3ProcessingStrategy( processor, chat_template, image_size, image_resize_algorithm ) + if chat_template_type == "gemma3n": + return Gemma3nProcessingStrategy( + processor, chat_template, image_size, image_resize_algorithm + ) if chat_template_type in [ "llama3_2_vision", "llama4", @@ -276,4 +424,10 @@ def get_processing_strategy( return ProcessingStrategy( processor, chat_template, image_size, image_resize_algorithm ) + + if isinstance(processor, VoxtralProcessor): + return VoxtralProcessingStrategy( + processor, chat_template, image_size, image_resize_algorithm + ) + raise ValueError(f"Unsupported chat template type: {chat_template_type}") diff --git a/src/axolotl/prompt_strategies/__init__.py b/src/axolotl/prompt_strategies/__init__.py index ba0dad053..cf936481e 100644 --- a/src/axolotl/prompt_strategies/__init__.py +++ b/src/axolotl/prompt_strategies/__init__.py @@ -2,11 +2,11 @@ import importlib import inspect -import logging from axolotl.prompt_strategies.user_defined import UserDefinedDatasetConfig +from axolotl.utils.logging import get_logger -LOG = logging.getLogger("axolotl.prompt_strategies") +LOG = get_logger(__name__) def load(strategy, tokenizer, cfg, ds_cfg, processor=None): @@ -17,7 +17,10 @@ def load(strategy, tokenizer, cfg, ds_cfg, processor=None): return messages_load(tokenizer, cfg, ds_cfg, processor=processor) load_fn = "load" package = "axolotl.prompt_strategies" - if strategy.split(".")[-1].startswith("load_"): + if ( + strategy.split(".")[-1].startswith("load_") + or strategy.split(".")[-1] == "load" + ): load_fn = strategy.split(".")[-1] strategy = ".".join(strategy.split(".")[:-1]) elif len(strategy.split(".")) > 1: diff --git a/src/axolotl/prompt_strategies/base.py b/src/axolotl/prompt_strategies/base.py index c146133fb..370a51a95 100644 --- a/src/axolotl/prompt_strategies/base.py +++ b/src/axolotl/prompt_strategies/base.py @@ 
-3,9 +3,10 @@ module for base dataset transform strategies """ import importlib -import logging -LOG = logging.getLogger("axolotl") +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) def load(strategy, cfg, module_base=None, **kwargs): diff --git a/src/axolotl/prompt_strategies/bradley_terry/__init__.py b/src/axolotl/prompt_strategies/bradley_terry/__init__.py index 4457c50be..7530aee19 100644 --- a/src/axolotl/prompt_strategies/bradley_terry/__init__.py +++ b/src/axolotl/prompt_strategies/bradley_terry/__init__.py @@ -2,11 +2,11 @@ import importlib import inspect -import logging from axolotl.prompt_strategies.user_defined import UserDefinedDatasetConfig +from axolotl.utils.logging import get_logger -LOG = logging.getLogger("axolotl.prompt_strategies.bradley_terry") +LOG = get_logger(__name__) def load(strategy, tokenizer, cfg, ds_cfg): diff --git a/src/axolotl/prompt_strategies/bradley_terry/chat_template.py b/src/axolotl/prompt_strategies/bradley_terry/chat_template.py index 67319f5b4..e655f85a1 100644 --- a/src/axolotl/prompt_strategies/bradley_terry/chat_template.py +++ b/src/axolotl/prompt_strategies/bradley_terry/chat_template.py @@ -2,7 +2,6 @@ Bradley-Terry model with chat template prompt strategy. """ -import logging from typing import Any, Dict, Optional from axolotl.prompt_strategies.chat_template import ( @@ -10,10 +9,11 @@ from axolotl.prompt_strategies.chat_template import ( ChatTemplateStrategy, ) from axolotl.utils.chat_templates import get_chat_template_from_config +from axolotl.utils.logging import get_logger # Configure the logger -LOG = logging.getLogger("axolotl.prompt_strategies.bradley_terry.chat_template") -LOG.setLevel(logging.INFO) +LOG = get_logger(__name__) +LOG.setLevel("INFO") class BTChatTemplateStrategy(ChatTemplateStrategy): @@ -44,7 +44,7 @@ class BTChatTemplateStrategy(ChatTemplateStrategy): if len(chosen_tokenized["input_ids"]) > max_length: LOG.warning( - f"To-be-trimmed chosen sequence exceeds max sequence length: {len(chosen_tokenized['input_ids'])}", + f"To-be-trimmed chosen sequence exceeds max sequence length: {len(chosen_tokenized['input_ids'])}" ) chosen_tokenized["input_ids"] = chosen_tokenized["input_ids"][:max_length] @@ -62,7 +62,7 @@ class BTChatTemplateStrategy(ChatTemplateStrategy): if len(rejected_tokenized["input_ids"]) > max_length: LOG.warning( - f"To-be-trimmed rejected sequence exceeds max sequence length: {len(rejected_tokenized['input_ids'])}", + f"To-be-trimmed rejected sequence exceeds max sequence length: {len(rejected_tokenized['input_ids'])}" ) rejected_tokenized["input_ids"] = rejected_tokenized["input_ids"][ diff --git a/src/axolotl/prompt_strategies/chat_template.py b/src/axolotl/prompt_strategies/chat_template.py index 047a66e94..8241dd385 100644 --- a/src/axolotl/prompt_strategies/chat_template.py +++ b/src/axolotl/prompt_strategies/chat_template.py @@ -2,9 +2,10 @@ HF Chat Templates prompt strategy """ -import logging +# pylint: disable=too-many-lines + from collections import defaultdict -from typing import Any, Dict, List, Set, Union +from typing import TYPE_CHECKING, Any, Dict, List, Set, Union from pydantic import BaseModel from transformers import ProcessorMixin @@ -13,11 +14,16 @@ from axolotl.prompt_strategies.jinja_template_analyzer import JinjaTemplateAnaly from axolotl.prompt_tokenizers import PromptTokenizingStrategy from axolotl.prompters import IGNORE_TOKEN_ID, Prompter from axolotl.utils.chat_templates import get_chat_template_from_config +from axolotl.utils.dict import 
remove_none_values +from axolotl.utils.logging import get_logger from axolotl.utils.schemas.datasets import DatasetConfig +if TYPE_CHECKING: + from axolotl.utils.mistral import HFMistralTokenizer + # Configure the logger -LOG = logging.getLogger("axolotl") -LOG.setLevel(logging.INFO) +LOG = get_logger(__name__) +LOG.setLevel("INFO") class ChatTemplatePrompter(Prompter): @@ -29,12 +35,16 @@ class ChatTemplatePrompter(Prompter): chat_template: str, processor=None, max_length=2048, - message_property_mappings: Dict[str, str] | None = None, + message_property_mappings: dict[str, str] | None = None, message_field_training: str | None = None, message_field_training_detail: str | None = None, field_messages: str = "messages", field_system: str = "system", - roles: Dict[str, List[str]] | None = None, + field_tools: str = "tools", + field_thinking: str = "reasoning_content", + roles: dict[str, list[str]] | None = None, + template_thinking_key: str | None = "reasoning_content", + chat_template_kwargs: dict[str, Any] | None = None, drop_system_message: bool = False, ): # check if message_property_mappings is None or empty dict @@ -42,8 +52,9 @@ class ChatTemplatePrompter(Prompter): message_property_mappings = { "role": "role", "content": "content", - "reasoning_content": "reasoning_content", } + if template_thinking_key and field_thinking: + message_property_mappings[template_thinking_key] = field_thinking if roles: self.roles = {s: t for t, sources in roles.items() for s in sources} @@ -65,9 +76,13 @@ class ChatTemplatePrompter(Prompter): self.message_field_training_detail = message_field_training_detail self.field_messages = field_messages self.field_system = field_system + self.field_tools = field_tools + self.field_thinking = field_thinking self.tokenizer = tokenizer self.processor: ProcessorMixin | None = processor self.chat_template = chat_template + self.chat_template_kwargs = chat_template_kwargs or {} + self.template_thinking_key: str = template_thinking_key or "reasoning_content" self.max_length = max_length self.drop_system_message = drop_system_message @@ -75,16 +90,39 @@ class ChatTemplatePrompter(Prompter): def chat_template_msg_variables(self) -> Set[str]: return self._chat_template_msg_variables - def build_prompt(self, conversation, add_generation_prompt=False, images=None): + def build_prompt( + self, + conversation: list[dict], + add_generation_prompt=False, + images=None, + tools=None, + ): + """ + Build a prompt from a conversation. + + Args: + conversation: A list of messages. + add_generation_prompt: Whether to add a generation prompt. + images: A list of images. (optional) + tools: A list of tools. 
(optional) + """ + chat_template_kwargs = { + "chat_template": self.chat_template, + "add_generation_prompt": add_generation_prompt, + **self.chat_template_kwargs, + } + + if tools: + chat_template_kwargs["tools"] = tools + if self.processor: if not callable(self.processor): raise TypeError("Processor must be callable") text = self.processor.apply_chat_template( conversation, - chat_template=self.chat_template, tokenize=False, - add_generation_prompt=add_generation_prompt, + **chat_template_kwargs, ) batch = self.processor( text=text, @@ -101,8 +139,7 @@ class ChatTemplatePrompter(Prompter): return self.tokenizer.apply_chat_template( conversation, - add_generation_prompt=add_generation_prompt, - chat_template=self.chat_template, + **chat_template_kwargs, ) def get_offsets_for_train_detail( @@ -246,9 +283,15 @@ class ChatTemplateStrategy(PromptTokenizingStrategy): self.train_on_eot = train_on_eot if train_on_eot is not None else train_on_eos # Default to eos_token if eot_tokens not provided - self.eot_tokens = ( - eot_tokens if eot_tokens is not None else [self.tokenizer.eos_token] - ) + self.eot_tokens = [] + if eot_tokens is not None: + self.eot_tokens = eot_tokens + elif ( + hasattr(self.tokenizer, "eos_token") + and self.tokenizer.eos_token is not None + ): + self.eot_tokens = [self.tokenizer.eos_token] + self.split_thinking = split_thinking self.images = "images" @@ -342,6 +385,8 @@ class ChatTemplateStrategy(PromptTokenizingStrategy): Public method that can handle either a single prompt or a batch of prompts. """ + prompt = remove_none_values(prompt) + if not self.is_prompt_batched(prompt) or not self.supports_batched: return self._tokenize_single_prompt(prompt) @@ -372,13 +417,15 @@ class ChatTemplateStrategy(PromptTokenizingStrategy): and not self.prompter.message_field_training_detail # type: ignore ): turns = self.get_conversation_thread(prompt) - images = self.get_images(prompt) + images = self._get_images(prompt) prompt_ids = self.prompter.build_prompt( # type: ignore turns[:-1], add_generation_prompt=True, images=images, ) - tokenized_res = self.prompter.build_prompt(turns, images=images) # type: ignore + tokenized_res = self.prompter.build_prompt( + turns, images=images + ) # type: ignore tokenized_prompt = {} if isinstance(tokenized_res, list): input_ids = prompt_ids + tokenized_res[len(prompt_ids) :] @@ -399,7 +446,8 @@ class ChatTemplateStrategy(PromptTokenizingStrategy): return tokenized_prompt turns = self.get_conversation_thread(prompt) - input_ids = self.prompter.build_prompt(turns) # type: ignore + tools = self._get_tools(prompt) + input_ids = self.prompter.build_prompt(turns, tools=tools) # type: ignore labels = [IGNORE_TOKEN_ID] * len(input_ids) last_eos_idx = -1 @@ -438,12 +486,20 @@ class ChatTemplateStrategy(PromptTokenizingStrategy): continue - turn_start_idx, turn_end_idx = self.find_turn(turns=turns, turn_idx=index) + turn_start_idx, turn_end_idx = self.find_turn( + turns=turns, turn_idx=index, tools=tools + ) LOG.debug(f"Turn indices: start={turn_start_idx}, end={turn_end_idx}") if should_train and turn_start_idx != -1 and turn_end_idx != -1: if train_detail: + # Block multi-content for now + if not isinstance(content, str): + raise ValueError( + "`train_detail` is not supported when `content` is not a string." 
+ ) + token_offsets = self.prompter.get_offsets_for_train_detail( # type: ignore content, train_detail ) @@ -540,7 +596,9 @@ class ChatTemplateStrategy(PromptTokenizingStrategy): return i return -1 - def find_turn(self, turns: list[dict], turn_idx: int): + def find_turn( + self, turns: list[dict], turn_idx: int, tools: list[dict] | None = None + ): """ Locate the starting and ending indices of the specified turn in a conversation. """ @@ -553,11 +611,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy): if ( turn_idx == 0 and turns[0].get("role") == "system" - and ( - "mistral" in self.tokenizer.name_or_path.lower() - # gemma3 uses gemma tokenizer - or "gemma" in self.tokenizer.name_or_path.lower() - ) + and ("mistral" in self.tokenizer.name_or_path.lower()) ): return -1, -1 @@ -571,10 +625,10 @@ class ChatTemplateStrategy(PromptTokenizingStrategy): turns_with_content = turns[: turn_idx + 1] # Generate the conversation up to the turn, with final turn replaced with dummy content - dummy_ids = self.prompter.build_prompt(turns_with_empty) # type: ignore + dummy_ids = self.prompter.build_prompt(turns_with_empty, tools=tools) # type: ignore # Generate the conversation up to the turn, with final turn included - full_ids = self.prompter.build_prompt(turns_with_content) # type: ignore + full_ids = self.prompter.build_prompt(turns_with_content, tools=tools) # type: ignore if not full_ids or not dummy_ids: LOG.warning(f"Empty template generated for turn {turn_idx}") @@ -627,9 +681,10 @@ class ChatTemplateStrategy(PromptTokenizingStrategy): def get_conversation_thread(self, prompt): turns = [] - possible_sys_turn = self.transform_message( - prompt[self.prompter.field_messages][0] - ) + messages = self._get_messages(prompt) + + possible_sys_turn = self.transform_message(messages[0]) + if ( possible_sys_turn["role"] != "system" and self.prompter.field_system in prompt @@ -637,16 +692,17 @@ class ChatTemplateStrategy(PromptTokenizingStrategy): turn = {"role": "system", "content": prompt[self.prompter.field_system]} turns.append(turn) - for message in prompt[self.prompter.field_messages]: + for message in messages: transformed_message = self.transform_message(message) - turn = { - **transformed_message, - "training": message.get(self.prompter.message_field_training), - "training_detail": message.get( - self.prompter.message_field_training_detail - ), - } + turn = transformed_message + + training = message.get(self.prompter.message_field_training) + training_detail = message.get(self.prompter.message_field_training_detail) + if training is not None: + turn["training"] = training + if training_detail is not None: + turn["training_detail"] = training_detail turns.append(turn) @@ -655,7 +711,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy): return turns - def transform_message(self, message): + def transform_message(self, message: dict) -> dict: # Build the initial transformed message from the mappings transformed_message = {} for key, value in self.prompter.message_property_mappings.items(): @@ -691,7 +747,9 @@ class ChatTemplateStrategy(PromptTokenizingStrategy): # get the thinking content thinking_content = content[t_start_idx + len(tpair[0]) : t_end_idx] - transformed_message["reasoning_content"] = thinking_content.strip() + transformed_message[self.prompter.template_thinking_key] = ( + thinking_content.strip() + ) # take remainder of the content # strip whitespace from beginning of the remainder (thinking tokens) @@ -732,18 +790,126 @@ class ChatTemplateStrategy(PromptTokenizingStrategy): 
         return transformed_message

-    def get_images(self, prompt):
+    def _get_images(self, prompt):
         return prompt.get(self.images, None)

+    def _get_tools(self, prompt) -> list[dict] | None:
+        """Get tools from prompt if available."""
+        tools = prompt.get(self.prompter.field_tools, None)
+        if tools is None:
+            return None
+
+        if isinstance(tools, list):
+            return tools
+
+        raise ValueError(
+            "Unknown tools format. Please convert it into a list[dict].\n"
+            f"Current format: {type(tools)}"
+        )
+
+    def _get_messages(self, prompt):
+        messages = prompt.get(self.prompter.field_messages, None)
+        if messages is None:
+            raise ValueError("Messages is null. Please check `field_messages`.")
+
+        if isinstance(messages, list):
+            return messages
+
+        raise ValueError(
+            "Unknown messages format. Please convert it into a list[dict].\n"
+            f"Current format: {type(messages)}"
+        )
+
+
+class MistralStrategy(ChatTemplateStrategy):
+    """
+    Mistral strategy for chat template.
+    """
+
+    def __init__(
+        self,
+        prompter: "ChatTemplatePrompter",
+        tokenizer: "HFMistralTokenizer",
+        train_on_inputs: bool,
+        sequence_len: int,
+        roles_to_train: list[str] | None = None,
+        train_on_eos: str | None = None,
+        train_on_eot: str | None = None,
+        eot_tokens: list[str] | None = None,
+        split_thinking: bool | None = False,
+    ):
+        # Call the parent's parent __init__ (PromptTokenizingStrategy) to skip ChatTemplateStrategy's validation
+        # pylint: disable=non-parent-init-called,super-init-not-called
+        PromptTokenizingStrategy.__init__(
+            self, prompter, tokenizer, train_on_inputs, sequence_len
+        )
+        self.prompter: ChatTemplatePrompter = prompter
+
+        self.roles_to_train = []
+        if roles_to_train:
+            # map roles if they exist in prompter.roles, else use the role as is
+            self.roles_to_train = [
+                prompter.roles.get(role, role) for role in roles_to_train
+            ]
+
+        self.train_on_eos = train_on_eos
+        # Backward compatibility, load from train_on_eos
+        self.train_on_eot = train_on_eot if train_on_eot is not None else train_on_eos
+
+        # Default to eos_token if eot_tokens not provided
+        self.eot_tokens = []
+        if eot_tokens is not None:
+            self.eot_tokens = eot_tokens
+        else:
+            # set eot_tokens to the eos_token
+            self.eot_tokens = [self.tokenizer.eos_token]
+
+        self.split_thinking = split_thinking
+
+        self.images = "images"
+
+        LOG.debug(
+            f"The chat template uses the following properties on the message: {self.prompter.chat_template_msg_variables}"
+        )
+
+        # Skip the validation that ChatTemplateStrategy calls
+        # TODO: address this in the future with mistral-specific checks
+        # self._validate_eot_and_eos_tokens()
+
+    def find_first_eot_token(self, input_ids, start_idx):
+        """Find the first EOT token in the input_ids starting from start_idx."""
+        # mistral-common tokenizer does not support eot_tokens
+        return self.find_first_eos_token(input_ids, start_idx)
+
+
+class MistralPrompter(ChatTemplatePrompter):
+    """
+    Mistral prompter for chat template.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._chat_template_msg_variables = set(["tool_call_id", "name", "tool_calls"])
+

 class StrategyLoader:
     """
     Load chat template strategy based on configuration.
""" - def _get_strategy_cls(self): + def _get_strategy_cls(self, cfg): + if cfg.tokenizer_use_mistral_common: + return MistralStrategy + return ChatTemplateStrategy + def _get_prompter_cls(self, cfg): + if cfg.tokenizer_use_mistral_common: + return MistralPrompter + + return ChatTemplatePrompter + def _get_strategy_params(self, cfg, ds_cfg: Dict[str, Any]): return { "train_on_inputs": cfg.train_on_inputs, @@ -769,14 +935,20 @@ class StrategyLoader: else: dataset_config = ds_cfg - chat_template_string = get_chat_template_from_config( - cfg=cfg, ds_cfg=dataset_config, tokenizer=tokenizer - ) + if cfg.tokenizer_use_mistral_common: + # mistral-common does not use this, so we pass an empty string + chat_template_string = "" + else: + chat_template_string = get_chat_template_from_config( + cfg=cfg, ds_cfg=dataset_config, tokenizer=tokenizer + ) + LOG.info(f"Using chat template:\n---\n{chat_template_string!s}\n---") prompter_params = { "tokenizer": tokenizer, "chat_template": chat_template_string, + "chat_template_kwargs": cfg.get("chat_template_kwargs", {}), "message_property_mappings": dataset_config.get( "message_property_mappings", {} ), @@ -788,6 +960,10 @@ class StrategyLoader: None, ), "field_messages": dataset_config.get("field_messages", "messages"), + "field_thinking": dataset_config.get("field_thinking", "reasoning_content"), + "template_thinking_key": dataset_config.get( + "template_thinking_key", "reasoning_content" + ), "roles": dataset_config.get("roles"), "drop_system_message": dataset_config.get("drop_system_message", False), # we need to add one for detecting sequences with exceeding the `sequence_len` limit. @@ -796,10 +972,11 @@ class StrategyLoader: } strategy_params = self._get_strategy_params(cfg, dataset_config) - strategy_cls = self._get_strategy_cls() + strategy_cls = self._get_strategy_cls(cfg) + prompter_cls = self._get_prompter_cls(cfg) strategy = strategy_cls( - ChatTemplatePrompter(**prompter_params), + prompter_cls(**prompter_params), tokenizer=tokenizer, **strategy_params, ) diff --git a/src/axolotl/prompt_strategies/dpo/chat_template.py b/src/axolotl/prompt_strategies/dpo/chat_template.py index f04bd7f0d..786770885 100644 --- a/src/axolotl/prompt_strategies/dpo/chat_template.py +++ b/src/axolotl/prompt_strategies/dpo/chat_template.py @@ -46,6 +46,14 @@ def default( ) messages = sample[field_messages] + if isinstance(messages, str): + messages = [ + { + message_property_mappings["role"]: "user", + message_property_mappings["content"]: messages, + } + ] + messages = [ { "role": role_map[m[message_property_mappings["role"]]], @@ -53,13 +61,35 @@ def default( } for m in messages ] + + chosen_raw = sample[field_chosen] + if isinstance(chosen_raw, str): + chosen_msg = { + message_property_mappings["role"]: "assistant", + message_property_mappings["content"]: chosen_raw, + } + elif isinstance(chosen_raw, dict): + chosen_msg = chosen_raw + else: + chosen_msg = chosen_raw[-1] chosen = { - "role": role_map[sample[field_chosen][message_property_mappings["role"]]], - "content": sample[field_chosen][message_property_mappings["content"]], + "role": role_map[chosen_msg[message_property_mappings["role"]]], + "content": chosen_msg[message_property_mappings["content"]], } + + rejected_raw = sample[field_rejected] + if isinstance(rejected_raw, str): + rejected_msg = { + message_property_mappings["role"]: "assistant", + message_property_mappings["content"]: rejected_raw, + } + elif isinstance(rejected_raw, dict): + rejected_msg = rejected_raw + else: + rejected_msg = 
rejected_raw[-1] rejected = { - "role": role_map[sample[field_rejected][message_property_mappings["role"]]], - "content": sample[field_rejected][message_property_mappings["content"]], + "role": role_map[rejected_msg[message_property_mappings["role"]]], + "content": rejected_msg[message_property_mappings["content"]], } dummy_user_message = {"role": "user", "content": "[[dummy_message]]"} @@ -91,4 +121,4 @@ def default( return result - return transform_fn + return transform_fn, {"remove_columns": [field_messages]} diff --git a/src/axolotl/prompt_strategies/dpo/user_defined.py b/src/axolotl/prompt_strategies/dpo/user_defined.py index 1d5f891af..cdd9b8c9c 100644 --- a/src/axolotl/prompt_strategies/dpo/user_defined.py +++ b/src/axolotl/prompt_strategies/dpo/user_defined.py @@ -33,7 +33,7 @@ def default(cfg, dataset_idx=0, **kwargs): # pylint: disable=unused-argument system=sample[field_system], prompt=sample[field_prompt] ) else: - sample["prompt"] = prompt_format.format(prompt=sample["prompt"]) + sample["prompt"] = prompt_format.format(prompt=sample[field_prompt]) sample["chosen"] = chosen_format.format(chosen=sample[field_chosen]) sample["rejected"] = rejected_format.format(rejected=sample[field_rejected]) return sample diff --git a/src/axolotl/prompt_strategies/jinja_template_analyzer.py b/src/axolotl/prompt_strategies/jinja_template_analyzer.py index a5f89cfe5..e16a1e22b 100644 --- a/src/axolotl/prompt_strategies/jinja_template_analyzer.py +++ b/src/axolotl/prompt_strategies/jinja_template_analyzer.py @@ -3,6 +3,7 @@ from typing import Dict, Optional, Set, TypedDict, Union from jinja2 import Environment, meta, nodes +from jinja2.ext import Extension class JinjaTemplateAnalysis(TypedDict): @@ -27,6 +28,18 @@ class JinjaTemplateAnalysis(TypedDict): iteration_target: Optional[Union[str, list[str]]] +class GenerationTagIgnore(Extension): + """ + Ignores the generation and endgeneration tags in Jinja templates. + """ + + tags = {"generation", "endgeneration"} + + def parse(self, parser): + parser.stream.skip(1) + return nodes.Const("") + + class JinjaTemplateAnalyzer: """ Analyzes Jinja templates to extract information about variable usage, @@ -57,7 +70,9 @@ class JinjaTemplateAnalyzer: """ def __init__(self, template: str): - self.env: Environment = Environment(autoescape=True) + self.env: Environment = Environment( + autoescape=True, extensions=[GenerationTagIgnore] + ) self.property_access: Dict[str, Set[str]] = {} self.iteration_targets: Dict[str, Union[str, list[str]]] = {} self.index_access: Dict[str, Set[Union[int, float]]] = {} diff --git a/src/axolotl/prompt_strategies/llama2_chat.py b/src/axolotl/prompt_strategies/llama2_chat.py index 29e091bfd..eef2e1d4d 100644 --- a/src/axolotl/prompt_strategies/llama2_chat.py +++ b/src/axolotl/prompt_strategies/llama2_chat.py @@ -24,12 +24,14 @@ For a custom system message, the first "from" can be "system" (followed by alter Important: Don't use "special_tokens:" in your config.yml if you are not sure what you are doing! 
""" -import logging from dataclasses import dataclass, field from typing import Generator, List, Sequence from axolotl.prompt_tokenizers import PromptTokenizingStrategy from axolotl.prompters import ALTERNATING_ASSERTION_FAILED_ROLE, IGNORE_TOKEN_ID +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) @dataclass @@ -129,7 +131,7 @@ class LLama2ChatTokenizingStrategy(PromptTokenizingStrategy): if cur_len < self.sequence_len: if cur_len != total_len: target[:] = IGNORE_TOKEN_ID - logging.warning( + LOG.warning( f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." f" (ignored)" ) diff --git a/src/axolotl/prompt_strategies/messages/__init__.py b/src/axolotl/prompt_strategies/messages/__init__.py index d014d93a6..6eae9dfd8 100644 --- a/src/axolotl/prompt_strategies/messages/__init__.py +++ b/src/axolotl/prompt_strategies/messages/__init__.py @@ -2,9 +2,10 @@ import importlib import inspect -import logging -LOG = logging.getLogger("axolotl.prompt_strategies.messages") +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) def load(tokenizer, cfg, ds_cfg, processor=None): @@ -31,4 +32,3 @@ def load(tokenizer, cfg, ds_cfg, processor=None): except Exception as exc: # pylint: disable=broad-exception-caught LOG.error(f"Failed to load prompt strategy `{strategy}`: {str(exc)}") raise exc - return None diff --git a/src/axolotl/prompt_strategies/metharme.py b/src/axolotl/prompt_strategies/metharme.py index 52d77c00c..66da72389 100644 --- a/src/axolotl/prompt_strategies/metharme.py +++ b/src/axolotl/prompt_strategies/metharme.py @@ -1,12 +1,12 @@ """Module containing the MetharmenPromptTokenizingStrategy and MetharmePrompter class""" -import logging from typing import Tuple from axolotl.prompt_tokenizers import InstructionPromptTokenizingStrategy from axolotl.prompters import AlpacaPrompter +from axolotl.utils.logging import get_logger -LOG = logging.getLogger("axolotl") +LOG = get_logger(__name__) IGNORE_TOKEN_ID = -100 diff --git a/src/axolotl/prompt_strategies/pygmalion.py b/src/axolotl/prompt_strategies/pygmalion.py index 88208f6ec..51f92f397 100644 --- a/src/axolotl/prompt_strategies/pygmalion.py +++ b/src/axolotl/prompt_strategies/pygmalion.py @@ -1,7 +1,6 @@ """Module containing the PygmalionPromptTokenizingStrategy and PygmalionPrompter class""" import copy -import logging from collections import defaultdict from typing import Generator, List, Tuple @@ -10,8 +9,9 @@ from axolotl.prompt_tokenizers import ( parse_tokenized_to_result, tokenize_prompt_default, ) +from axolotl.utils.logging import get_logger -LOG = logging.getLogger("axolotl") +LOG = get_logger(__name__) IGNORE_TOKEN_ID = -100 diff --git a/src/axolotl/prompt_tokenizers.py b/src/axolotl/prompt_tokenizers.py index c29fd05a4..9ca645de3 100644 --- a/src/axolotl/prompt_tokenizers.py +++ b/src/axolotl/prompt_tokenizers.py @@ -1,14 +1,15 @@ """Module containing PromptTokenizingStrategy and Prompter classes""" import abc -import logging from typing import Callable, Dict, List, Optional, Tuple, Union +from datasets import Dataset from transformers import BatchEncoding, PreTrainedTokenizer from axolotl.prompters import Prompter +from axolotl.utils.logging import get_logger -LOG = logging.getLogger("axolotl") +LOG = get_logger(__name__) IGNORE_INDEX = -100 LLAMA_DEFAULT_PAD_TOKEN = "" # nosec @@ -28,6 +29,16 @@ class DatasetWrappingStrategy(abc.ABC): Abstract class for wrapping datasets for Chat Messages """ + @abc.abstractmethod + def wrap_dataset( + self, + dataset, + process_count: int | 
None = None, + keep_in_memory: bool | None = False, + **kwargs, + ) -> Dataset: + pass + class PromptTokenizingStrategy(abc.ABC): """ diff --git a/src/axolotl/prompters.py b/src/axolotl/prompters.py index ec680702d..d29da075e 100644 --- a/src/axolotl/prompters.py +++ b/src/axolotl/prompters.py @@ -1,12 +1,13 @@ """Module containing prompters""" -import logging from enum import Enum from typing import Generator, Optional, Union from colorama import Fore -LOG = logging.getLogger("axolotl") +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) IGNORE_TOKEN_ID = -100 REPR_TEMPLATE = "\n\n" + Fore.CYAN + "{full_prompt}" + Fore.RESET + "\n\n" diff --git a/src/axolotl/train.py b/src/axolotl/train.py index 52ec8f22b..e8a2cbabe 100644 --- a/src/axolotl/train.py +++ b/src/axolotl/train.py @@ -1,11 +1,13 @@ """Prepare and train a model on a dataset. Can also infer from a model or merge lora""" +from __future__ import annotations + import importlib import inspect -import logging import os import signal import sys +import typing import weakref from contextlib import ExitStack from pathlib import Path @@ -13,7 +15,6 @@ from typing import Any, Dict import torch import transformers.modelcard -from accelerate.utils import save_fsdp_model from datasets import Dataset from huggingface_hub.errors import OfflineModeIsEnabled from peft import PeftConfig, PeftModel @@ -21,12 +22,10 @@ from transformers import PreTrainedModel, PreTrainedTokenizer, ProcessorMixin from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled from transformers.trainer import Trainer -from axolotl.cli.art import print_axolotl_text_art from axolotl.common.datasets import TrainDatasetMeta from axolotl.contribs.lgpl import ( # pylint: disable = no-name-in-module fix_untrained_tokens, ) -from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder from axolotl.integrations.base import PluginManager from axolotl.loaders import ( ModelLoader, @@ -37,6 +36,7 @@ from axolotl.utils.ctx_managers.sequence_parallel import SequenceParallelContext from axolotl.utils.dict import DictDefault from axolotl.utils.distributed import cleanup_distributed from axolotl.utils.freeze import freeze_layers_except +from axolotl.utils.logging import get_logger from axolotl.utils.schemas.enums import RLType from axolotl.utils.trainer import setup_trainer @@ -45,7 +45,10 @@ try: except ImportError: BetterTransformer = None -LOG = logging.getLogger(__name__) +if typing.TYPE_CHECKING: + from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder + +LOG = get_logger(__name__) def setup_model_and_tokenizer( @@ -53,8 +56,8 @@ def setup_model_and_tokenizer( ) -> tuple[ PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, ProcessorMixin | None ]: - """ - Load the tokenizer, processor (for multimodal models), and model based on configuration. + """Load the tokenizer, processor (for multimodal models), and model based on + configuration. Args: cfg: Dictionary mapping `axolotl` config keys to values. @@ -64,9 +67,7 @@ def setup_model_and_tokenizer( `None`), and processor (if multimodal, else `None`). """ # Load tokenizer - LOG.debug( - f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}", - ) + LOG.debug(f"Loading tokenizer... 
{cfg.tokenizer_config or cfg.base_model_config}") tokenizer = load_tokenizer(cfg) # Load processor for multimodal models if needed @@ -74,11 +75,8 @@ def setup_model_and_tokenizer( if cfg.is_multimodal: processor = load_processor(cfg, tokenizer) - # Load the model and peft_config - msg = "loading model" - if cfg.adapter: - msg += " and peft_config..." - LOG.debug(msg) + # Load the model + LOG.debug("Loading model") model_loader = ModelLoader(cfg, tokenizer, processor=processor) model, peft_config = model_loader.load() @@ -117,8 +115,11 @@ def setup_reference_model( LOG.debug("Passing model_ref: None to RL trainer") model_ref = None # explicit setting to None else: + reference_model: bool = True + if cfg.rl == RLType.GRPO and cfg.trl.beta == 0: + reference_model = False # load the model again for model_ref/baseline - model_loader = ModelLoader(cfg, tokenizer, reference_model=True) + model_loader = ModelLoader(cfg, tokenizer, reference_model=reference_model) model_ref, _ = model_loader.load() return model_ref @@ -204,24 +205,32 @@ def execute_training( ) ) - if cfg.sequence_parallel_degree > 1: + if cfg.context_parallel_size > 1: models = [trainer.model] - if hasattr(trainer, "ref_model"): + if hasattr(trainer, "ref_model") and trainer.ref_model: models.append(trainer.ref_model) stack.enter_context( SequenceParallelContextManager( models=models, - sequence_parallel_degree=cfg.sequence_parallel_degree, + context_parallel_size=cfg.context_parallel_size, gradient_accumulation_steps=cfg.gradient_accumulation_steps, ring_attn_func=cfg.ring_attn_func, heads_k_stride=cfg.heads_k_stride, + gather_outputs=cfg.rl is RLType.GRPO, + device_mesh=trainer.accelerator.torch_device_mesh, ) ) LOG.info("Starting trainer...") + # TODO: disabling for now as not compatible with FSDP2 + torchao low bit optimizers + # if cfg.bf16: + # torch.set_default_dtype(torch.bfloat16) trainer.train(resume_from_checkpoint=resume_from_checkpoint) + plugin_manager = PluginManager.get_instance() + plugin_manager.post_train(cfg, trainer.model) + def save_trained_model( cfg: DictDefault, @@ -238,45 +247,47 @@ def save_trained_model( model: The trained model to save. safe_serialization: Whether to use safe serialization. """ - LOG.info(f"Training completed! Saving pre-trained model to {cfg.output_dir}.") + LOG.info(f"Training completed! Saving trained model to {cfg.output_dir}.") # Post training module hooks for name, module in model.named_modules(): if hasattr(module, "_post_training"): module._post_training(model, name) # pylint: disable=protected-access - # Handle FSDP state dict type - state_dict_type = "FULL_STATE_DICT" - if trainer.is_fsdp_enabled and str(cfg.fsdp_config.fsdp_version) != "2": - if cfg.fsdp_final_state_dict_type: - state_dict_type = cfg.fsdp_final_state_dict_type - trainer.accelerator.state.fsdp_plugin.set_state_dict_type(state_dict_type) - LOG.info(f"Set FSDP state dict type to {state_dict_type} for saving.") + # handle QAT + if cfg.qat: + from axolotl.utils.quantization import convert_qat_model_for_ptq + LOG.info("Processing QAT model for saving...") + convert_qat_model_for_ptq( + model, + quantize_embedding=cfg.qat.quantize_embedding, + ) + LOG.info( + "QAT modules have been converted for PTQ. Please ensure you quantize " + "your model weights with `axolotl quantize`." 
+ ) # Handle ReLoRA early return case - if cfg.relora_steps: + if cfg.relora: if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit): model = model.merge_and_unload() else: # final model weights have already been saved by `ReLoRACallback.on_train_end` return - if cfg.fsdp: - # TODO: do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading - # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple - # processes attempt to write the same file - if ( - state_dict_type == "SHARDED_STATE_DICT" - and cfg.fsdp_config.fsdp_state_dict_type == "SHARDED_STATE_DICT" - ): - save_fsdp_model( - trainer.accelerator.state.fsdp_plugin, - trainer.accelerator, - trainer.model, - cfg.output_dir, + if trainer.is_fsdp_enabled or cfg.fsdp_config: + if cfg.fsdp_config or cfg.fsdp: + if cfg.fsdp_config.final_state_dict_type: + state_dict_type = cfg.fsdp_config.final_state_dict_type + else: + state_dict_type = cfg.fsdp_config.state_dict_type + trainer.accelerator.state.fsdp_plugin.set_state_dict_type(state_dict_type) + trainer.save_model(cfg.output_dir) + if state_dict_type == "SHARDED_STATE_DICT": + LOG.info( + "The final model was saved with a sharded state dict. Please ensure you merge " + "the sharded weights with `merge-sharded-fsdp-weights`." ) - elif state_dict_type == "FULL_STATE_DICT": - trainer.save_model(cfg.output_dir) elif cfg.deepspeed and is_deepspeed_zero3_enabled(): # Copied over from: https://github.com/huggingface/accelerate/blob/5ae611118057232f441055f7ef9ba0b0f2b8d533/docs/source/usage_guides/deepspeed.md#saving-and-loading trainer.accelerator.wait_for_everyone() @@ -321,6 +332,8 @@ def save_trained_model( save_compressed=cfg.llmcompressor.save_compressed, ) + LOG.info(f"Model successfully saved to {cfg.output_dir}") + def create_model_card(cfg: DictDefault, trainer: Trainer): """ @@ -458,7 +471,7 @@ def handle_untrained_tokens_fix( def setup_model_and_trainer(cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> tuple[ - HFRLTrainerBuilder | HFCausalTrainerBuilder, + "HFRLTrainerBuilder" | "HFCausalTrainerBuilder", PeftModel | PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, @@ -504,6 +517,9 @@ def setup_model_and_trainer(cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> peft_config=peft_config, ) + plugin_manager = PluginManager.get_instance() + plugin_manager.post_trainer_create(cfg, trainer) + return ( trainer, model, @@ -526,8 +542,6 @@ def train( Returns: Tuple of (model, tokenizer) after training """ - print_axolotl_text_art() - # Setup model, tokenizer, (causal or RLHF) trainer, etc. 
( trainer, @@ -537,9 +551,6 @@ def train( processor, ) = setup_model_and_trainer(cfg, dataset_meta) - plugin_manager = PluginManager.get_instance() - plugin_manager.post_trainer_create(cfg, trainer) - # Handle untrained tokens if configured safe_serialization = cfg.save_safetensors is True train_dataset = dataset_meta.train_dataset @@ -556,12 +567,14 @@ def train( resume_from_checkpoint = determine_resume_checkpoint(cfg) execute_training(cfg, trainer, resume_from_checkpoint) + # clear cache + if torch.cuda.is_available(): + torch.cuda.empty_cache() + # Save the trained model and cleanup save_trained_model(cfg, trainer, model, safe_serialization) create_model_card(cfg, trainer) if not cfg.use_ray: cleanup_distributed() - plugin_manager.post_train(cfg, model) - return model, tokenizer, trainer diff --git a/src/axolotl/utils/__init__.py b/src/axolotl/utils/__init__.py index 3d0ba7c9c..e669413f8 100644 --- a/src/axolotl/utils/__init__.py +++ b/src/axolotl/utils/__init__.py @@ -52,3 +52,10 @@ def patch_optimized_env(): if os.getenv("HF_HUB_ENABLE_HF_TRANSFER") is None: os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" set_pytorch_cuda_alloc_conf() + + +def get_not_null(value, default=None): + """ + return the value if it's not None, otherwise return the default value + """ + return value if value is not None else default diff --git a/src/axolotl/utils/bench.py b/src/axolotl/utils/bench.py index d1e972c81..dd3a85b8c 100644 --- a/src/axolotl/utils/bench.py +++ b/src/axolotl/utils/bench.py @@ -1,6 +1,7 @@ """Benchmarking and measurement utilities""" import functools +import logging import torch from transformers.utils.import_utils import is_torch_npu_available @@ -56,10 +57,10 @@ def gpu_memory_usage(device=0): @check_cuda_device((0.0, 0.0, 0.0)) def gpu_memory_usage_all(device=0): - usage = torch.cuda.memory_allocated(device) / 1024.0**3 - reserved = torch.cuda.memory_reserved(device) / 1024.0**3 - smi = gpu_memory_usage_smi(device) - return usage, reserved - usage, max(0, smi - reserved) + active = torch.cuda.memory_stats().get("active_bytes.all.peak", 0) / 1024.0**3 + allocated = torch.cuda.max_memory_allocated(device) / 1024.0**3 + reserved = torch.cuda.max_memory_reserved(device) / 1024.0**3 + return active, allocated, reserved def mps_memory_usage_all(): @@ -91,21 +92,38 @@ def gpu_memory_usage_smi(device=0): return 0.0 -def log_gpu_memory_usage(log, msg, device): - cur_device = get_device_type() +def get_gpu_memory_usage(device: int | torch.device = 0): + cur_device_type = str(get_device_type()) if torch.backends.mps.is_available(): usage, cache, misc = mps_memory_usage_all() - elif "npu" in str(cur_device) and is_torch_npu_available(): + elif "npu" in cur_device_type and is_torch_npu_available(): usage, cache, misc = npu_memory_usage_all(device) - else: + elif "cuda" in cur_device_type and torch.cuda.is_available(): usage, cache, misc = gpu_memory_usage_all(device) + else: + return 0.0, 0.0, 0.0 + + return usage, cache, misc + + +def log_gpu_memory_usage( + log: logging.Logger | logging.LoggerAdapter, + msg: str = "", + device: int | torch.device = 0, +): + try: + active, allocated, reserved = get_gpu_memory_usage(device) + except ValueError: + # likely CPU, ignore + return + cur_device_type = str(get_device_type()) extras = [] - if cache > 0: - extras.append(f"+{cache:.03f}GB cache") - if misc > 0: - extras.append(f"+{misc:.03f}GB misc") - log.info( - f"{str(cur_device)} memory usage {msg}: {usage:.03f}GB ({', '.join(extras)})", + if allocated > 0: + extras.append(f"+{allocated:.03f}GB 
allocated") + if reserved > 0: + extras.append(f"+{reserved:.03f}GB reserved") + msg = f"{cur_device_type} memory active:" if not msg else msg + log.debug( + f"{msg} {active:.03f}GB ({', '.join(extras)})", stacklevel=2, ) - return usage, cache, misc diff --git a/src/axolotl/utils/callbacks/__init__.py b/src/axolotl/utils/callbacks/__init__.py index 0e7b06093..d3f3126b5 100644 --- a/src/axolotl/utils/callbacks/__init__.py +++ b/src/axolotl/utils/callbacks/__init__.py @@ -4,7 +4,6 @@ from __future__ import annotations import gc import json -import logging import os import traceback from shutil import copyfile @@ -28,11 +27,14 @@ from transformers import ( TrainerState, TrainingArguments, ) -from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, IntervalStrategy +from transformers.trainer_utils import ( + PREFIX_CHECKPOINT_DIR, + IntervalStrategy, + SaveStrategy, +) from trl.models import unwrap_model_for_generation from axolotl.utils import is_comet_available, is_mlflow_available -from axolotl.utils.bench import log_gpu_memory_usage from axolotl.utils.callbacks.perplexity import Perplexity from axolotl.utils.distributed import ( barrier, @@ -43,33 +45,15 @@ from axolotl.utils.distributed import ( is_main_process, zero_first, ) +from axolotl.utils.logging import get_logger from axolotl.utils.schemas.config import AxolotlInputConfig if TYPE_CHECKING: - from axolotl.core.trainer_builder import AxolotlTrainingArguments + from axolotl.core.training_args import AxolotlTrainingArguments IGNORE_INDEX = -100 -LOG = logging.getLogger("axolotl.callbacks") - - -class EvalFirstStepCallback( - TrainerCallback -): # pylint: disable=too-few-public-methods disable=unused-argument - """ - Callback to trigger evals on the first step - """ - - def on_step_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if args.eval_strategy == IntervalStrategy.STEPS and state.global_step == 1: - control.should_evaluate = True - return control +LOG = get_logger(__name__) class SaveBetterTransformerModelCallback( @@ -83,7 +67,7 @@ class SaveBetterTransformerModelCallback( state: TrainerState, control: TrainerControl, **kwargs, - ): + ) -> TrainerControl: # Save if ( args.save_strategy == IntervalStrategy.STEPS @@ -108,45 +92,22 @@ class SaveBetterTransformerModelCallback( return control -class GPUStatsCallback( - TrainerCallback -): # pylint: disable=too-few-public-methods disable=unused-argument - """Callback to track GPU utilization""" - - def __init__(self, cfg): - self.cfg = cfg - self.logged = False - - def on_step_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - if not self.logged and state.global_step > 1: - log_gpu_memory_usage(LOG, "while training", self.cfg.device) - self.logged = True - return control - - class LossWatchDogCallback(TrainerCallback): """Callback to track loss and stop training if loss is too high""" def __init__(self, cfg): self.cfg = cfg - self.logged = False self.violations = 0 self.threshold = cfg.loss_watchdog_threshold self.patience = cfg.loss_watchdog_patience or 3 def on_step_end( self, - _args: TrainingArguments, + args: TrainingArguments, # pylint: disable=unused-argument state: TrainerState, control: TrainerControl, **_kwargs, - ): + ) -> TrainerControl: if len(state.log_history) > 0 and "loss" in state.log_history[-1]: if state.log_history[-1]["loss"] > self.threshold: self.violations += 1 @@ -160,6 +121,21 @@ class LossWatchDogCallback(TrainerCallback): return 
control +class SaveModelOnFirstStepCallback(TrainerCallback): + """Callback to save the model on the first step of training if enabled""" + + def on_step_end( + self, + args: TrainingArguments, # pylint: disable=unused-argument + state: TrainerState, + control: TrainerControl, + **_kwargs, + ) -> TrainerControl: + if state.global_step == 1: + control.should_save = True + return control + + def bench_eval_callback_factory(trainer, tokenizer): accuracy = evaluate.load("accuracy") abcd_idx = [ @@ -753,7 +729,14 @@ def log_prediction_callback_factory(trainer: Trainer, tokenizer, logger: str): ].append(pred_step_text) row_index += 1 if logger == "wandb": - wandb.run.log({f"{name} - Predictions vs Ground Truth": pd.DataFrame(table_data)}) # type: ignore[attr-defined] + # type: ignore[attr-defined] + wandb.run.log( + { + f"{name} - Predictions vs Ground Truth": pd.DataFrame( + table_data + ) + } + ) elif logger == "mlflow" and is_mlflow_available(): import mlflow @@ -796,7 +779,7 @@ class SaveAxolotlConfigtoWandBCallback(TrainerCallback): control: TrainerControl, **kwargs, # pylint: disable=unused-argument ): - if is_main_process(): + if state.is_world_process_zero: try: # sync config to top level in run, cannot delete file right away because wandb schedules it to be synced even w/policy = 'now', so let OS delete it later. with NamedTemporaryFile( @@ -853,21 +836,52 @@ class SaveAxolotlConfigtoWandBCallback(TrainerCallback): class GCCallback(TrainerCallback): """Callback to garbage collect torch cache""" - def __init__(self, gc_steps=None): - self.gc_steps = gc_steps + def __init__(self, gc_steps: int | None = -1): + self.gc_steps: int = gc_steps or -1 + self.next_gc_on_begin_step: int = -1 + + def _gc(self): + torch.cuda.empty_cache() + gc.collect() + + def on_train_begin( + self, args, state, control, **kwargs # pylint: disable=unused-argument + ): + self._gc() + + def on_step_begin( + self, args, state, control, **kwargs # pylint: disable=unused-argument + ): + # pylint: disable=consider-using-in + if self.next_gc_on_begin_step == state.global_step or state.global_step == 0: + self._gc() def on_step_end( self, args, state, control, **kwargs # pylint: disable=unused-argument ): - if self.gc_steps > 0 and state.global_step % self.gc_steps == 0: - torch.cuda.empty_cache() - gc.collect() + if control.should_evaluate: + # automatically GC before evals so the eval memory spike from the CEL doesn't OOM the trainer + self._gc() + # also GC on the start of the next step after the eval + self.next_gc_on_begin_step = state.global_step + 1 + elif self.gc_steps > 0 and state.global_step % self.gc_steps == 0: + self._gc() + elif ( + args.save_strategy == SaveStrategy.STEPS + and state.save_steps > 0 + and state.global_step % state.save_steps == 0 + ): + # gc on save steps in case anything is loaded to CPU RAM like offloaded tensors + self._gc() + elif state.global_step >= state.max_steps: + if args.save_strategy == SaveStrategy.STEPS: + # gc on save steps in case anything is loaded to CPU RAM like offloaded tensors + self._gc() def on_epoch_end( self, args, state, control, **kwargs # pylint: disable=unused-argument ): - torch.cuda.empty_cache() - gc.collect() + self._gc() def colab_inference_post_train_callback(trainer: Trainer): diff --git a/src/axolotl/utils/callbacks/comet_.py b/src/axolotl/utils/callbacks/comet_.py index b29f997a8..7dce95145 100644 --- a/src/axolotl/utils/callbacks/comet_.py +++ b/src/axolotl/utils/callbacks/comet_.py @@ -1,17 +1,17 @@ """Comet module for trainer callbacks""" -import 
logging from typing import TYPE_CHECKING import comet_ml from transformers import TrainerCallback, TrainerControl, TrainerState from axolotl.utils.distributed import is_main_process +from axolotl.utils.logging import get_logger if TYPE_CHECKING: - from axolotl.core.trainer_builder import AxolotlTrainingArguments + from axolotl.core.training_args import AxolotlTrainingArguments -LOG = logging.getLogger("axolotl.callbacks") +LOG = get_logger(__name__) class SaveAxolotlConfigtoCometCallback(TrainerCallback): diff --git a/src/axolotl/utils/callbacks/lisa.py b/src/axolotl/utils/callbacks/lisa.py index e226471b1..348cdf2da 100644 --- a/src/axolotl/utils/callbacks/lisa.py +++ b/src/axolotl/utils/callbacks/lisa.py @@ -6,17 +6,18 @@ Arxiv: https://arxiv.org/abs/2403.17919 License: Apache 2.0 """ -import logging from functools import reduce from typing import TYPE_CHECKING import numpy as np from transformers import TrainerCallback -if TYPE_CHECKING: - from axolotl.core.trainer_builder import AxolotlTrainer +from axolotl.utils.logging import get_logger -LOG = logging.getLogger("axolotl.callbacks.lisa") +if TYPE_CHECKING: + from axolotl.core.trainers import AxolotlTrainer + +LOG = get_logger(__name__) def lisa_callback_factory(trainer: "AxolotlTrainer"): diff --git a/src/axolotl/utils/callbacks/mlflow_.py b/src/axolotl/utils/callbacks/mlflow_.py index 15ca1ca47..ac72f5e6d 100644 --- a/src/axolotl/utils/callbacks/mlflow_.py +++ b/src/axolotl/utils/callbacks/mlflow_.py @@ -1,6 +1,5 @@ """MLFlow module for trainer callbacks""" -import logging import os from shutil import copyfile from tempfile import NamedTemporaryFile @@ -10,11 +9,12 @@ import mlflow from transformers import TrainerCallback, TrainerControl, TrainerState from axolotl.utils.distributed import is_main_process +from axolotl.utils.logging import get_logger if TYPE_CHECKING: - from axolotl.core.trainer_builder import AxolotlTrainingArguments + from axolotl.core.training_args import AxolotlTrainingArguments -LOG = logging.getLogger("axolotl.callbacks") +LOG = get_logger(__name__) def should_log_artifacts() -> bool: diff --git a/src/axolotl/utils/callbacks/models.py b/src/axolotl/utils/callbacks/models.py new file mode 100644 index 000000000..5a20d70d9 --- /dev/null +++ b/src/axolotl/utils/callbacks/models.py @@ -0,0 +1,23 @@ +"""Helper functions for model classes""" + +from typing import Tuple + +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + + +def get_causal_lm_model_cls_prefix(model_type: str) -> Tuple[str, str]: + if model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: + causal_lm_cls = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[model_type] + causal_lm_cls_prefix = causal_lm_cls + for suffix in [ + "ForCausalLM", + "ForConditionalGeneration", + "LMHeadModel", + "GenerationDecoder", + ]: + causal_lm_cls_prefix = causal_lm_cls_prefix.replace(suffix, "") + return causal_lm_cls_prefix, causal_lm_cls + causal_lm_cls_prefix = "".join( + [part.capitalize() for part in model_type.split("_")] + ) + return causal_lm_cls_prefix, f"{causal_lm_cls_prefix}ForCausalLM" diff --git a/src/axolotl/utils/callbacks/profiler.py b/src/axolotl/utils/callbacks/profiler.py index 36604813f..d26b7f9dd 100644 --- a/src/axolotl/utils/callbacks/profiler.py +++ b/src/axolotl/utils/callbacks/profiler.py @@ -19,9 +19,27 @@ class PytorchProfilerCallback(TrainerCallback): PyTorch Profiler callback to create snapshots of GPU memory usage at specified steps. 
""" - def __init__(self, steps_to_profile: int = 5): - self.steps_to_profile = steps_to_profile - if self.steps_to_profile: + def __init__(self, steps_to_profile: int = 5, profiler_steps_start: int = 0): + # steps are 0 indexed, so to start at 0-th step, we start at beginning of first step, + # and finish at end of last step, so 5 steps_to_profile is steps [0, 1, 2, 3, 4] + self.profiler_steps_end = profiler_steps_start + steps_to_profile - 1 + if profiler_steps_start == 0: + # start recording memory allocations before everything is allocated, because if we start + # at the beginning of step 0, we won't have any memory allocations in the traces + torch.cuda.memory._record_memory_history( # pylint: disable=protected-access + enabled="all" + ) + profiler_steps_start = -1 + self.profiler_steps_start = profiler_steps_start + + def on_step_begin( # pylint: disable=unused-argument + self, + args: TrainingArguments, # pylint: disable=unused-argument + state: TrainerState, + control: TrainerControl, # pylint: disable=unused-argument + **kwargs, # pylint: disable=unused-argument + ): + if state.global_step == self.profiler_steps_start: torch.cuda.memory._record_memory_history( # pylint: disable=protected-access enabled="all" ) @@ -33,7 +51,28 @@ class PytorchProfilerCallback(TrainerCallback): control: TrainerControl, # pylint: disable=unused-argument **kwargs, # pylint: disable=unused-argument ): - if state.global_step == self.steps_to_profile: + if state.global_step == self.profiler_steps_end: + snapshot = torch.cuda.memory._snapshot() # pylint: disable=protected-access + with open(Path(args.output_dir) / "snapshot.pickle", "wb") as fout: + dump(snapshot, fout) + + # tell CUDA to stop recording memory allocations now + torch.cuda.memory._record_memory_history( # pylint: disable=protected-access + enabled=None + ) + + def on_train_end( # pylint: disable=unused-argument + self, + args: TrainingArguments, # pylint: disable=unused-argument + state: TrainerState, + control: TrainerControl, # pylint: disable=unused-argument + **kwargs, # pylint: disable=unused-argument + ): + # make sure to record if we happen to have more steps than steps to profile + if ( + state.global_step >= self.profiler_steps_start + and state.global_step < self.profiler_steps_end + ): snapshot = torch.cuda.memory._snapshot() # pylint: disable=protected-access with open(Path(args.output_dir) / "snapshot.pickle", "wb") as fout: dump(snapshot, fout) diff --git a/src/axolotl/utils/callbacks/qat.py b/src/axolotl/utils/callbacks/qat.py new file mode 100644 index 000000000..cf4d9a937 --- /dev/null +++ b/src/axolotl/utils/callbacks/qat.py @@ -0,0 +1,50 @@ +"""QAT Callback for HF Causal Trainer""" + +from functools import partial + +from torch import nn +from torchao.quantization.qat.embedding import FakeQuantizedEmbedding +from torchao.quantization.qat.linear import FakeQuantizedLinear +from transformers import TrainerCallback + +from axolotl.utils.logging import get_logger +from axolotl.utils.schemas.quantization import QATConfig + +LOG = get_logger(__name__) + + +def toggle_fake_quant(mod: nn.Module, enable: bool): + """ + Toggle fake quantization for any fake quantized linear or embedding layers in the model. + + Args: + mod: The module to toggle fake quantization for. + enable: Whether to enable or disable fake quantization. 
+ """ + if isinstance(mod, (FakeQuantizedLinear, FakeQuantizedEmbedding)): + if ( + isinstance(mod, FakeQuantizedLinear) + and mod.activation_fake_quantizer is not None + ): + mod.activation_fake_quantizer.enabled = enable + mod.weight_fake_quantizer.enabled = enable + + +class QATCallback(TrainerCallback): + """ + Callback to toggle fake quantization for the model. + """ + + def __init__(self, cfg: QATConfig): + self.cfg = cfg + + def on_step_begin( + self, args, state, control, model, **kwargs + ): # pylint: disable=unused-argument + if self.cfg.fake_quant_after_n_steps is not None: + if state.global_step == 0: + LOG.info(f"Disabling fake quantization at step {state.global_step}") + model.apply(partial(toggle_fake_quant, enable=False)) + elif state.global_step == self.cfg.fake_quant_after_n_steps: + LOG.info(f"Enabling fake quantization at step {state.global_step}") + model.apply(partial(toggle_fake_quant, enable=True)) diff --git a/src/axolotl/utils/chat_templates.py b/src/axolotl/utils/chat_templates.py deleted file mode 100644 index fb2134852..000000000 --- a/src/axolotl/utils/chat_templates.py +++ /dev/null @@ -1,142 +0,0 @@ -""" -This module provides functionality for selecting chat templates based on user choices. -These templates are used for formatting messages in a conversation. -""" - -import logging -from typing import TYPE_CHECKING, Any, Dict, Optional - -if TYPE_CHECKING: - from transformers import PreTrainedTokenizerBase - -LOG = logging.getLogger("axolotl.utils.chat_templates") - -_JINJA_TEMPALTE_CHOICE = "jinja" -_DEFAULT_TEMPLATE_CHOICE = "tokenizer_default" -_DEFAULT_FALLBACK_CHATML_TEMPLATE_CHOICE_PREFIX = "tokenizer_default_fallback_" - -_CHAT_TEMPLATES = { - "alpaca": "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'system' and loop.first %}{{ message['content'] }}{% elif message['role'] == 'user' %}{{ '### Instruction:\n' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '### Response:\n' + message['content'] + eos_token }}{% endif %}{% if not loop.last %}{{ '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\n### Response:\n' }}{% endif %}", - "mistral_v1": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", # Mistral 7B V1, Mistral 7B V2, Mixtral 8x7B V1... - "mistral_v2v3": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", # V3: Mistral 7B V3, Small, Large... 
- "mistral_v3_tekken": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST]' + message['content'] + '[/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", # V3-Tekken: Nemo, Pixtral... - "mistral_v7_tekken": "{%- set today = strftime_now(\"%Y-%m-%d\") %}\n{%- set default_system_message = \"You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\\nYour knowledge base was last updated on 2023-10-01. The current date is \" + today + \".\\n\\nWhen you're not sure about some information, you say that you don't have the information and don't make up anything.\\nIf the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \\\"What are some good restaurants around me?\\\" => \\\"Where are you?\\\" or \\\"When is the next flight to Tokyo\\\" => \\\"Where do you travel from?\\\")\" %}\n\n{{- bos_token }}\n\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set system_message = default_system_message %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}\n\n{%- for message in loop_messages %}\n {%- if message['role'] == 'user' %}\n\t {%- if message['content'] is string %}\n {{- '[INST]' + message['content'] + '[/INST]' }}\n\t {%- else %}\n\t\t {{- '[INST]' }}\n\t\t {%- for block in message['content'] %}\n\t\t\t {%- if block['type'] == 'text' %}\n\t\t\t\t {{- block['text'] }}\n\t\t\t {%- elif block['type'] == 'image' or block['type'] == 'image_url' %}\n\t\t\t\t {{- '[IMG]' }}\n\t\t\t\t{%- else %}\n\t\t\t\t {{- raise_exception('Only text and image blocks are supported in message content!') }}\n\t\t\t\t{%- endif %}\n\t\t\t{%- endfor %}\n\t\t {{- '[/INST]' }}\n\t\t{%- endif %}\n {%- elif message['role'] == 'system' %}\n {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }}\n {%- elif message['role'] == 'assistant' %}\n {%- if message['content'] is string %}\n {{- message['content'] }}\n {%- elif message['content'] is iterable %} \n\t\t {%- for block in message['content'] %}\n\t\t\t {%- if block['type'] == 'text' %}\n\t\t\t\t {{- block['text'] }}\n\t\t\t {%- else %}\n\t\t\t\t {{- raise_exception('Only text blocks are supported in assistant message content!') }} {%- endif %}\n\t\t\t \n\t\t\t{%- endfor %} {{- eos_token }} {%- else %}\n {{- raise_exception('Unsupported assistant message content format!') }} \n{%- endif %} \n{%- else %}\n {{- raise_exception('Only user, system and assistant roles are supported!') }}\n {%- endif %}\n{%- endfor %}", - "chatml": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", - "gemma": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif 
%}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", - "gemma3": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'model\n'}}\n{%- endif -%}\n", - "cohere": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}", - "llama3": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", - "llama3_2_vision": '{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now("%d %b %Y") %}\n {%- else %}\n {%- set date_string = "26 Jul 2024" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0][\'role\'] == \'system\' %}\n {%- set system_message = messages[0][\'content\']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = "" %}\n{%- endif %}\n\n{#- Find out if there are any images #}\n{% set image_ns = namespace(has_images=false) %} \n{%- for message in messages %}\n {%- for content in message[\'content\'] %}\n {%- if content[\'type\'] == \'image\' %}\n {%- set image_ns.has_images = true %}\n {%- endif %}\n {%- endfor %}\n{%- endfor %}\n\n{#- Error out if there are images and system message #}\n{%- if image_ns.has_images and not system_message == "" %}\n {{- raise_exception("Prompting with images is incompatible with system messages.") }}\n{%- endif %}\n\n{#- System message if there are no images #}\n{%- if not image_ns.has_images %}\n {{- "<|start_header_id|>system<|end_header_id|>\\n\\n" }}\n {%- if tools is not none %}\n {{- "Environment: ipython\\n" }}\n {%- endif %}\n {{- "Cutting Knowledge Date: December 2023\\n" }}\n {{- "Today Date: " + date_string + "\\n\\n" }}\n {%- if tools is not none and not tools_in_user_message %}\n {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." 
}}\n {{- \'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.\' }}\n {{- "Do not use variables.\\n\\n" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- "\\n\\n" }}\n {%- endfor %}\n {%- endif %}\n {{- system_message }}\n {{- "<|eot_id|>" }}\n{%- endif %}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0][\'content\']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception("Cannot put tools in the first user message when there\'s no first user message!") }}\n{%- endif %}\n {{- \'<|start_header_id|>user<|end_header_id|>\\n\\n\' -}}\n {{- "Given the following functions, please respond with a JSON for a function call " }}\n {{- "with its proper arguments that best answers the given prompt.\\n\\n" }}\n {{- \'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.\' }}\n {{- "Do not use variables.\\n\\n" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- "\\n\\n" }}\n {%- endfor %}\n {{- first_user_message + "<|eot_id|>"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == \'ipython\' or message.role == \'tool\' or \'tool_calls\' in message) %}\n {{- \'<|start_header_id|>\' + message[\'role\'] + \'<|end_header_id|>\\n\\n\' }}\n {%- if message[\'content\'] is string %}\n {{- message[\'content\'] }}\n {%- else %}\n {%- for content in message[\'content\'] %}\n {%- if content[\'type\'] == \'image\' %}\n {{- \'<|image|>\' }}\n {%- elif content[\'type\'] == \'text\' %}\n {{- content[\'text\'] }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- \'<|eot_id|>\' }}\n {%- elif \'tool_calls\' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception("This model only supports single tool-calls at once!") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- \'<|start_header_id|>assistant<|end_header_id|>\\n\\n\' -}}\n {{- \'{"name": "\' + tool_call.name + \'", \' }}\n {{- \'"parameters": \' }}\n {{- tool_call.arguments | tojson }}\n {{- "}" }}\n {{- "<|eot_id|>" }}\n {%- elif message.role == "tool" or message.role == "ipython" %}\n {{- "<|start_header_id|>ipython<|end_header_id|>\\n\\n" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- "<|eot_id|>" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- \'<|start_header_id|>assistant<|end_header_id|>\\n\\n\' }}\n{%- endif %}\n', - "llama4": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. 
#}\n{%- if messages[0]['role'] == 'system' %} \n {%- if messages[0]['content'] is string %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- else %}\n {#- FIXME: The processor requires an array, always. #}\n {%- set system_message = messages[0]['content'][0]['text']|trim %}\n {%- endif %}\n {%- set messages = messages[1:] %}\n {%- set user_supplied_system_message = true %}\n{%- else %}\n {%- set system_message = \"\" %}\n {%- set user_supplied_system_message = false %}\n{%- endif %}\n\n{#- System message if the user supplied one #}\n{%- if user_supplied_system_message %}\n {{- \"<|header_start|>system<|header_end|>\\n\\n\" }}\n {%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n {%- endif %}\n {%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {%- endif %}\n {{- system_message }}\n {{- \"<|eot|>\" }}\n{%- endif %}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|header_start|>user<|header_end|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|header_start|>' + message['role'] + '<|header_end|>\\n\\n' }}\n {%- if message['content'] is string %}\n {{- message['content'] }}\n {%- else %}\n {%- for content in message['content'] %}\n {%- if content['type'] == 'image' %}\n {{- '<|image|>' }}\n {%- elif content['type'] == 'text' %}\n {{- content['text'] }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- \"<|eot|>\" }}\n {%- elif 'tool_calls' in message and message.tool_calls|length > 0 %}\n {{- '<|header_start|>assistant<|header_end|>\\n\\n' -}}\n {{- '<|python_start|>' }}\n {%- if message['content'] is string %}\n {{- message['content'] }}\n {%- else %}\n {%- for content in message['content'] %}\n {%- if content['type'] == 'image' %}\n {{- '<|image|>' }}\n {%- elif content['type'] == 'text' %}\n {{- content['text'] }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|python_end|>' }}\n {%- for tool_call in message.tool_calls %}\n {{- '{\"name\": \"' + tool_call.function.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.function.arguments | tojson }}\n {{- \"}\" }}\n {%- endfor %}\n {{- \"<|eot|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|header_start|>ipython<|header_end|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|header_start|>assistant<|header_end|>\\n\\n' }}\n{%- endif %}\n", - "llava": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}", - "phi_3": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}", - "phi_35": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}", - "deepseek_v2": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' 
%}{{ '<|User|>' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '<|Assistant|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|Assistant|>' }}{% endif %}", - "deepseek_v3": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '
' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}", - "jamba": '{# Variables #}\n{% set ns = namespace(message_count=0, is_last_checked_defined=False) %}\n{##}\n{% set bom_str = bom_str or "<|bom|>" %}\n{% set eom_str = eom_str or "<|eom|>" %}\n{% set default_system_message = "" %}\n{##}\n{% set documents_prefix = "" %}\n{% set documents_suffix = "" %}\n{% set tool_definitions_prefix = "" %}\n{% set tool_definitions_suffix = "" %}\n{% set active_modes_prefix = "" %}\n{% set active_modes_suffix = "" %}\n{##}\n{% set tool_calls_prefix = "" %}\n{% set tool_calls_suffix = "" %}\n{% set citations_prefix = "" %}\n{% set citations_suffix = "" %}\n{##}\n{% if add_generation_prompt is not defined %}\n {% set add_generation_prompt = True %}\n{% endif %}\n{% set role_to_predict = role_to_predict or "assistant" %}\n{% if messages|length > 0 and messages[0].role == "system" %}\n {% set system_message = messages[0].content %}\n {% set loop_messages = messages[1:] %}\n{% else %}\n {% set system_message = default_system_message %}\n {% set loop_messages = messages %}\n{% endif %}\n{##}\n{##}\n{# Macros #}\n{% macro handle_tool_definitions(tools) %}\n {{- tool_definitions_prefix -}}\n {{- "\\n# Tools" -}}\n {{- "\\n\\n## Functions" -}}\n {% for tool in tools %}\n {% set _ = is_param_set(tool, field="type") %}\n {% set is_tool_type_set = ns.is_last_checked_defined %}\n {% if is_tool_type_set %}\n {% if tool.type == "function" %}\n {% set tool = tool.function %}\n {% else %}\n {{ raise_exception("Currently, the only supported tool type is `function`") }}\n {% endif %}\n {% endif %}\n {{- "\\n\\n" + (tool|tojson(indent=2)) -}}\n {% endfor %}\n {{- "\\n" + tool_definitions_suffix -}}\n{% endmacro %}\n{##}\n{% macro handle_first_system_message(system_message, tools) %}\n {{- bom_str + handle_role("system") -}}\n {% set _ = is_param_set(system_message) %}\n {% set is_system_message_set = ns.is_last_checked_defined %}\n {% if is_system_message_set %}\n {{- system_message -}}\n {% endif %}\n {% set _ = is_param_set(tools, is_list=True) %}\n {% set is_tools_set = ns.is_last_checked_defined %}\n {% if is_tools_set %}\n {% if system_message %}\n {{- "\\n\\n" -}}\n {% endif %}\n {{- handle_tool_definitions(tools) -}}\n {% endif %}\n {% set ns.message_count = ns.message_count + 1 %}\n{% endmacro %}\n{##}\n{% macro handle_tool_calls(tool_calls) %}\n {{- tool_calls_prefix + "[\\n" -}}\n {% for tool_call in tool_calls %}\n {% set _ = is_param_set(tool_call, field="function") %}\n {% set is_tool_call_function_set = ns.is_last_checked_defined %}\n {% if is_tool_call_function_set %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {% set arguments = tool_call.arguments %}\n {% if arguments is not string %}\n {%- set arguments = arguments|tojson -%}\n {%- endif %}\n {{ "{\\"name\\": \\"" + tool_call.name + "\\", \\"arguments\\": " + arguments + "}" -}}\n {% if not loop.last %}\n {{- "," }}\n {% endif %}\n {% endfor %}\n {{- "\\n]" + tool_calls_suffix 
-}}\n{% endmacro %}\n{##}\n{% macro handle_documents(documents) %}\n {{- documents_prefix -}}\n {{- "\\n# Documents" -}}\n {{- "\\n\\nYou can use the following documents for reference:" -}}\n {% for doc in documents %}\n {{- "\\n\\n## Document ID: " + loop.index0|string -}}\n {% set _ = is_param_set(doc, field="title") %}\n {% set is_doc_title_set = ns.is_last_checked_defined %}\n {% if is_doc_title_set %}\n {{- "\\nTitle: " + doc.title -}}\n {% endif %}\n {% for key, value in doc.items() %}\n {% if key not in ["title", "text"] %}\n {{- "\\n" + key|title + ": " + value|string -}}\n {% endif %}\n {% endfor %}\n {{- "\\nText: " + doc.text -}}\n {% endfor %}\n {{- "\\n" + documents_suffix -}}\n{% endmacro %}\n{##}\n{% macro handle_knobs(knobs) %}\n {{- active_modes_prefix -}}\n {{- "\\n# Active Modes" -}}\n {{ "\\n\\nThe following modes configure the format or style of your responses. You should adhere to all currently" -}}\n {{ " active modes simultaneously." -}}\n {% if knobs.citation_mode == "fast" %}\n {{- "\\n\\n## Citation Mode" -}}\n {{- "\\n\\nProvide a list of references only for the documents you base your response on. Format your response" -}}\n {{ " with the original answer followed by a citation section. Use this template:" -}}\n {{ " `{answer}" + citations_prefix + "DOCUMENT_IDS" + citations_suffix + "`, where DOCUMENT_IDS are the relevant document numbers" -}}\n {{ " (e.g. [2, 5, 9]), or [] if the answer cannot be supported by the provided documents." -}}\n {% endif %}\n {% if knobs.response_format == "json_object" %}\n {{- "\\n\\n## JSON Mode" -}}\n {{ "\\n\\nProvide your response in JSON format. Adhere strictly to any schema given by the user." -}}\n {{ " If an appropriate JSON format exists, use it without modification." -}}\n {% endif %}\n {{- "\\n" + active_modes_suffix -}}\n{% endmacro %}\n{##}\n{% macro get_last_user_index(messages) %}\n {% set ns.last_user_index = 0 %}\n {% for message in messages %}\n {% if message.role == \'user\' %}\n {% set ns.last_user_index = loop.index0 %}\n {% endif %}\n {% endfor %}\n {{- ns.last_user_index -}}\n{% endmacro %}\n{##}\n{% macro handle_last_system_message(documents, knobs, use_documents, use_knobs) %}\n {{- bom_str + handle_role("system") -}}\n {% set macros_to_call = [] %}\n {% set params_for_macros = [] %}\n {% if use_documents %}\n {% set macros_to_call = macros_to_call + [handle_documents] %}\n {% set params_for_macros = params_for_macros + [[documents]] %}\n {% endif %}\n {% if use_knobs %}\n {% set macros_to_call = macros_to_call + [handle_knobs] %}\n {% set params_for_macros = params_for_macros + [[knobs]] %}\n {% endif %}\n {% for i in range(macros_to_call|length) %}\n {% if i > 0 %}\n {{- "\\n\\n" -}}\n {% endif %}\n {{- macros_to_call[i](*params_for_macros[i]) -}}\n {% endfor %}\n {% set ns.message_count = ns.message_count + 1 %}\n{% endmacro %}\n{##}\n{% macro handle_role(role, add_space=True) %}\n {{- "<|" + role + "|>" -}}\n {% if add_space %}\n {{- " " -}}\n {% endif %}\n{% endmacro %}\n{##}\n{% macro is_param_set(param, field=none, is_list=False) %}\n {% if field is not none %}\n {% if field in param %}\n {% set param = param[field] %}\n {% else %}\n {% set param = none %}\n {% endif %}\n {% endif %}\n {% set is_defined = param is defined and param is not none %}\n {% if is_list %}\n {% set ns.is_last_checked_defined = is_defined and param|length > 0 %}\n {% else %}\n {% set ns.is_last_checked_defined = is_defined %}\n {% endif %}\n{% endmacro %}\n{##}\n{##}\n{# Template #}\n{{- "<|startoftext|>" -}}\n{% set _ = 
is_param_set(system_message) %}\n{% set is_system_message_set = ns.is_last_checked_defined %}\n{% set _ = is_param_set(tools, is_list=True) %}\n{% set is_tools_set = ns.is_last_checked_defined %}\n{% set has_system_message = (is_system_message_set or is_tools_set) %}\n{% if has_system_message %}\n {{- handle_first_system_message(system_message, tools) -}}\n{% endif %}\n{% set last_user_index = get_last_user_index(loop_messages)|int %}\n{% for message in loop_messages %}\n {% if loop.index0 == last_user_index %}\n {% set _ = is_param_set(documents, is_list=True) %}\n {% set use_documents = ns.is_last_checked_defined %}\n {% set _ = is_param_set(knobs) %}\n {% set use_knobs = ns.is_last_checked_defined and knobs.is_set %}\n {% set add_last_system_message = use_documents or use_knobs %}\n {% if add_last_system_message %}\n {% if ns.message_count > 0 %}\n {{- eom_str -}}\n {% endif %}\n {{- handle_last_system_message(documents, knobs, use_documents, use_knobs) -}}\n {% endif %}\n {% endif %}\n {% set role = message.role %}\n {% set _ = is_param_set(message, field="name") %}\n {% set is_message_name_set = ns.is_last_checked_defined %}\n {% if is_message_name_set %}\n {% set message_prefix = handle_role(role) + "(" + message.name + ")" %}\n {% else %}\n {% set message_prefix = handle_role(role) %}\n {% endif %}\n {% set content = (message.content or "") %}\n {% if content is not string %}\n {% set content = content|tojson %}\n {% endif %}\n {% if ns.message_count > 0 %}\n {{- eom_str -}}\n {% endif %}\n {{- bom_str + message_prefix + content -}}\n {% set _ = is_param_set(message, field="tool_calls", is_list=True) %}\n {% set is_tool_calls_set = ns.is_last_checked_defined %}\n {% if role == "assistant" and is_tool_calls_set %}\n {{- handle_tool_calls(message.tool_calls) -}}\n {% endif %}\n {% set _ = is_param_set(message, field="citations", is_list=True) %}\n {% set is_citations_set = ns.is_last_checked_defined %}\n {% if role == "assistant" and is_citations_set %}\n {{- citations_prefix + message.citations|map(attribute="document_id")|list|string + citations_suffix -}}\n {% endif %}\n {% set ns.message_count = ns.message_count + 1 %}\n{% endfor %}\n{% if add_generation_prompt %}\n {% if ns.message_count > 0 %}\n {{- eom_str -}}\n {% endif %}\n {{- bom_str + handle_role(role_to_predict, add_space=False) -}}\n {% set _ = is_param_set(generation_preamble) %}\n {% set is_generation_preamble_set = ns.is_last_checked_defined %}\n {% if is_generation_preamble_set and generation_preamble.strip() != "" %}\n {{- " " + generation_preamble -}}\n {% endif %}\n {% set ns.message_count = ns.message_count + 1 %}\n{% else %}\n {% if ns.message_count > 0 %}\n {{- eom_str -}}\n {% endif %}\n{% endif %}\n', - "qwen_25": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' 
 }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n\", - "qwen3": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content =
message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}", - "exaone": "{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{{ '[|system|][|endofturn|]\n' }}{% endif %}{{ '[|' + message['role'] + '|]' + message['content'] }}{% if message['role'] == 'user' %}{{ '\n' }}{% else %}{{ '[|endofturn|]\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '[|assistant|]' }}{% endif %}", - "metharme": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'Enter RP mode. You shall reply to the user while staying in character. Your responses must be detailed, creative, immersive, and drive the scenario forward.'
%}{% endif %}{{ '<|system|>' + system_message }}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|user|>' + content.strip() }}{% elif message['role'] == 'assistant' %}{{ '<|model|>' + content.strip() }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|model|>' }}{% else %}{{ eos_token }}{% endif %}", - "pixtral": '{%- if messages[0]["role"] == "system" %}\n {%- set system_message = messages[0]["content"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message[\'role\'] == \'user\') != (loop.index0 % 2 == 0) %}\n {{- raise_exception(\'After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\') }}\n {%- endif %}\n {%- if message["role"] == "user" %}\n {%- if loop.last and system_message is defined %}\n {{- "[INST]" + system_message + "\n\n" }}\n {%- else %}\n {{- "[INST]" }}\n {%- endif %}\n {%- if message["content"] is not string %}\n {%- for chunk in message["content"] %}\n {%- if chunk["type"] == "text" %}\n {{- chunk["text"] }}\n {%- elif chunk["type"] == "image" %}\n {{- "[IMG]" }}\n {%- else %}\n {{- raise_exception("Unrecognized content type!") }}\n {%- endif %}\n {%- endfor %}\n {%- else %}\n {{- message["content"] }}\n {%- endif %}\n {{- "[/INST]" }}\n {%- elif message["role"] == "assistant" %}\n {%- if message["content"] is not string %}\n {%- for chunk in message["content"] %}\n {%- if chunk["type"] == "text" %}\n {{- chunk["text"] }}\n {%- elif chunk["type"] == "image" %}\n {{- "[IMG]" }}\n {%- else %}\n {{- raise_exception("Unrecognized content type!") }}\n{%- endif %}\n{%- endfor %}\n{{- eos_token }}\n{%- else %}\n{{- message["content"] + eos_token }}\n{%- endif %}\n {%- else %}\n {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }}\n {%- endif %}\n{%- endfor %}', - "qwen2_vl": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}", -} - - -def get_chat_template( - user_choice: str, - jinja_template: Optional[str] = None, - tokenizer: Optional["PreTrainedTokenizerBase"] = None, -) -> str: - """ - Finds the correct chat_template based on the user's choice, jinja_template, and tokenizer. - - Args: - user_choice (str): The user's choice of template. - jinja_template (Optional[str], optional): The jinja template string. Defaults to None. - tokenizer (Optional[PreTrainedTokenizerBase], optional): The tokenizer. 
Defaults to None. - - Returns: - str: The chosen template string. - - Raises: - ValueError: If the user_choice is not found in the templates. - """ - if user_choice == _JINJA_TEMPALTE_CHOICE: - if not jinja_template: - raise ValueError( - f"`jinja_template` cannot be None when `chat_template` choice is {_JINJA_TEMPALTE_CHOICE}" - ) - return jinja_template - - if user_choice == _DEFAULT_TEMPLATE_CHOICE: - if not tokenizer: - raise ValueError( - f"`tokenizer` cannot be None when chat_template choice is {_DEFAULT_TEMPLATE_CHOICE}" - ) - if not tokenizer.chat_template: - raise ValueError( - f"`chat_template choice is {_DEFAULT_TEMPLATE_CHOICE} but tokenizer's chat_template is null. " - f"Please add a chat_template in tokenizer config" - ) - return tokenizer.chat_template # type: ignore - - if user_choice.startswith(_DEFAULT_FALLBACK_CHATML_TEMPLATE_CHOICE_PREFIX): - if not tokenizer: - raise ValueError( - f"`tokenizer` cannot be None when chat_template choice starts with {_DEFAULT_FALLBACK_CHATML_TEMPLATE_CHOICE_PREFIX}" - ) - if tokenizer.chat_template: - return tokenizer.chat_template # type: ignore - - user_choice = user_choice[ - len(_DEFAULT_FALLBACK_CHATML_TEMPLATE_CHOICE_PREFIX) : - ] - LOG.warning( - f"No chat template found on tokenizer, falling back to {user_choice}. It is recommended to set --train_on_inputs to True for the model to learn this chat template." - ) - - if user_choice in _CHAT_TEMPLATES: - return _CHAT_TEMPLATES[user_choice] - - raise ValueError(f"Template '{user_choice}' not found.") - - -def extract_chat_template_args(cfg, ds_cfg: Optional[Dict[str, Any]] = None): - if ds_cfg and ds_cfg.get("chat_template"): - chat_template_choice = ds_cfg.get("chat_template") or _DEFAULT_TEMPLATE_CHOICE - chat_template_jinja = ds_cfg.get("chat_template_jinja") - else: - chat_template_choice = cfg.get("chat_template") or _DEFAULT_TEMPLATE_CHOICE - chat_template_jinja = cfg.get("chat_template_jinja") - return chat_template_choice, chat_template_jinja - - -def get_chat_template_from_config( - cfg, - ds_cfg: Optional[Dict[str, Any]] = None, - tokenizer: Optional["PreTrainedTokenizerBase"] = None, -) -> str: - chat_template_choice, chat_template_jinja = extract_chat_template_args( - cfg=cfg, ds_cfg=ds_cfg - ) - return get_chat_template( - user_choice=chat_template_choice, - jinja_template=chat_template_jinja, - tokenizer=tokenizer, - ) - - -def register_chat_template(template_name: str, chat_template: str): - """ - Registers chat templates. - - Args: - template_name (str): The name of the template. - chat_template (str): The template string. - """ - - if template_name in _CHAT_TEMPLATES: - raise ValueError(f"Template '{template_name}' already exists.") - - _CHAT_TEMPLATES[template_name] = chat_template diff --git a/src/axolotl/utils/chat_templates/__init__.py b/src/axolotl/utils/chat_templates/__init__.py new file mode 100644 index 000000000..337417c7d --- /dev/null +++ b/src/axolotl/utils/chat_templates/__init__.py @@ -0,0 +1,20 @@ +""" +This module provides functionality for selecting chat templates based on user choices. +These templates are used for formatting messages in a conversation. 
+""" + +from .base import ( + _CHAT_TEMPLATES, + extract_chat_template_args, + get_chat_template, + get_chat_template_from_config, + register_chat_template, +) + +__all__ = [ + "get_chat_template", + "extract_chat_template_args", + "get_chat_template_from_config", + "register_chat_template", + "_CHAT_TEMPLATES", +] diff --git a/src/axolotl/utils/chat_templates/base.py b/src/axolotl/utils/chat_templates/base.py new file mode 100644 index 000000000..11d15fc1d --- /dev/null +++ b/src/axolotl/utils/chat_templates/base.py @@ -0,0 +1,125 @@ +""" +utility functions for chat templates +""" + +import os +from typing import TYPE_CHECKING, Any, Dict, Optional + +from axolotl.utils.logging import get_logger + +if TYPE_CHECKING: + from transformers import PreTrainedTokenizerBase + +LOG = get_logger("axolotl.utils.chat_templates") + +_JINJA_TEMPLATE_CHOICE = "jinja" +_DEFAULT_TEMPLATE_CHOICE = "tokenizer_default" +_DEFAULT_FALLBACK_CHATML_TEMPLATE_CHOICE_PREFIX = "tokenizer_default_fallback_" + +TEMPLATE_DIR = os.path.join(os.path.dirname(__file__), "templates") +_CHAT_TEMPLATES: dict[str, str] = {} +for filename in [f for f in os.listdir(TEMPLATE_DIR) if f.endswith(".jinja")]: + with open(os.path.join(TEMPLATE_DIR, filename), "r", encoding="utf-8") as f: + _CHAT_TEMPLATES[filename[:-6]] = f.read() + + +def get_chat_template( + user_choice: str, + jinja_template: str | None = None, + tokenizer: Optional["PreTrainedTokenizerBase"] = None, +) -> str: + """ + Finds the correct chat_template based on the user's choice, jinja_template, and tokenizer. + + Args: + user_choice (str): The user's choice of template. + jinja_template (str, optional): The jinja template string or Path to a valid jinja template file. Defaults to None. + tokenizer (PreTrainedTokenizerBase, optional): The tokenizer. Defaults to None. + + Returns: + str: The chosen template string. + + Raises: + ValueError: If the user_choice is not found in the templates. + """ + if user_choice == _JINJA_TEMPLATE_CHOICE: + if not jinja_template: + raise ValueError( + f"`jinja_template` cannot be None when `chat_template` choice is {_JINJA_TEMPLATE_CHOICE}" + ) + if os.path.exists(jinja_template) and os.path.isfile(jinja_template): + with open(jinja_template, "r", encoding="utf-8") as file: + jinja_template = file.read() + return jinja_template + + if user_choice == _DEFAULT_TEMPLATE_CHOICE: + if not tokenizer: + raise ValueError( + f"`tokenizer` cannot be None when chat_template choice is {_DEFAULT_TEMPLATE_CHOICE}" + ) + if not tokenizer.chat_template: + raise ValueError( + f"`chat_template choice is {_DEFAULT_TEMPLATE_CHOICE} but tokenizer's chat_template is null. " + f"Please add a chat_template in tokenizer config" + ) + return tokenizer.chat_template # type: ignore + + if user_choice.startswith(_DEFAULT_FALLBACK_CHATML_TEMPLATE_CHOICE_PREFIX): + if not tokenizer: + raise ValueError( + f"`tokenizer` cannot be None when chat_template choice starts with {_DEFAULT_FALLBACK_CHATML_TEMPLATE_CHOICE_PREFIX}" + ) + if tokenizer.chat_template: + return tokenizer.chat_template # type: ignore + + user_choice = user_choice[ + len(_DEFAULT_FALLBACK_CHATML_TEMPLATE_CHOICE_PREFIX) : + ] + LOG.warning( + f"No chat template found on tokenizer, falling back to {user_choice}. It is recommended to set --train_on_inputs to True for the model to learn this chat template." 
+ ) + + if user_choice in _CHAT_TEMPLATES: + return _CHAT_TEMPLATES[user_choice] + + raise ValueError(f"Template '{user_choice}' not found.") + + +def extract_chat_template_args(cfg, ds_cfg: Dict[str, Any] | None = None): + if ds_cfg and ds_cfg.get("chat_template"): + chat_template_choice = ds_cfg.get("chat_template") or _DEFAULT_TEMPLATE_CHOICE + chat_template_jinja = ds_cfg.get("chat_template_jinja") + else: + chat_template_choice = cfg.get("chat_template") or _DEFAULT_TEMPLATE_CHOICE + chat_template_jinja = cfg.get("chat_template_jinja") + return chat_template_choice, chat_template_jinja + + +def get_chat_template_from_config( + cfg, + ds_cfg: Dict[str, Any] | None = None, + tokenizer: Optional["PreTrainedTokenizerBase"] = None, +) -> str: + chat_template_choice, chat_template_jinja = extract_chat_template_args( + cfg=cfg, ds_cfg=ds_cfg + ) + return get_chat_template( + user_choice=chat_template_choice, + jinja_template=chat_template_jinja, + tokenizer=tokenizer, + ) + + +def register_chat_template(template_name: str, chat_template: str): + """ + Registers chat templates. + + Args: + template_name (str): The name of the template. + chat_template (str): The template string. + """ + + if template_name in _CHAT_TEMPLATES: + raise ValueError(f"Template '{template_name}' already exists.") + + _CHAT_TEMPLATES[template_name] = chat_template diff --git a/src/axolotl/utils/chat_templates/templates/alpaca.jinja b/src/axolotl/utils/chat_templates/templates/alpaca.jinja new file mode 100644 index 000000000..5e9d63c42 --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/alpaca.jinja @@ -0,0 +1,8 @@ +{{ bos_token }}{% for message in messages %}{% if message['role'] == 'system' and loop.first %}{{ message['content'] }}{% elif message['role'] == 'user' %}{{ '### Instruction: +' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '### Response: +' + message['content'] + eos_token }}{% endif %}{% if not loop.last %}{{ ' + +' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' + +### Response: +' }}{% endif %} diff --git a/src/axolotl/utils/chat_templates/templates/aya.jinja b/src/axolotl/utils/chat_templates/templates/aya.jinja new file mode 100644 index 000000000..97e54d4b1 --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/aya.jinja @@ -0,0 +1 @@ +{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Aya, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %} diff --git a/src/axolotl/utils/chat_templates/templates/chatml.jinja b/src/axolotl/utils/chat_templates/templates/chatml.jinja new file mode 100644 index 000000000..2116e45ca --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/chatml.jinja @@ -0,0 +1,4 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + ' +' + message['content'] + '<|im_end|>' + ' +'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant +' }}{% endif %} diff --git a/src/axolotl/utils/chat_templates/templates/cohere.jinja b/src/axolotl/utils/chat_templates/templates/cohere.jinja new file mode 100644 index 000000000..638ce5ef2 --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/cohere.jinja @@ -0,0 +1 @@ +{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %} diff --git a/src/axolotl/utils/chat_templates/templates/command_a.jinja b/src/axolotl/utils/chat_templates/templates/command_a.jinja new file mode 100644 index 000000000..ef0594172 --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/command_a.jinja @@ -0,0 +1,210 @@ +{{ bos_token }}{% if documents %} +{% set tools = [] %} +{%- macro document_turn(documents) -%} +{# format documents into chat turn #} +<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|>I will look through the document to address the users needs.<|END_THINKING|><|START_ACTION|>[ + {"tool_call_id": "0", "tool_name": "direct-injected-document", "parameters": {}} +]<|END_ACTION|><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[ + { + "tool_call_id": "0", + "results": { +{% for doc in documents %} + "{{ loop.index0 }}": {{doc|tojson}}{% if not loop.last %}, + {% endif %} +{% endfor %} + + }, + "is_error": null + } +]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|>{%- endmacro %} +{%- macro tool_call_id_to_int(messages, tool_call_id) %} +{%- set counter = namespace(value=0) %} +{%- set tool_call_id_seen = namespace(value=false) %} +{%- for msg in messages %} + {%- if msg.tool_calls %} + {%- for tool_call in msg.tool_calls %} + {%- if tool_call.id == tool_call_id and not tool_call_id_seen.value -%} + {{ counter.value }} + {%- set tool_call_id_seen.value = true %} + {%- endif %} + {%- set counter.value = counter.value + 1 %} + {%- endfor %} + {%- endif %} +{%- endfor %} +{%- endmacro %} +{%- macro format_tool_message(messages, tool_msg) -%} +{# format tool message #} + { + "tool_call_id": "{{ tool_call_id_to_int(messages, tool_msg.tool_call_id) }}", + "results": { + "0": {{ tool_msg.content|tojson }} + }, + "is_error": null + } +{%- endmacro -%} +{%- if messages and messages[0]['role']|lower == 'system' %}{%- set developer_preamble = messages[0]['content'] %}{% endif %} +{%- set tool_idx = namespace(value=0) %} +{%- set tool_ids_seen = namespace(value=[]) %} +{%- set sent_documents = namespace(value=false) %} +<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># System Preamble +You are in contextual safety mode. You will reject requests to generate child sexual abuse material and child exploitation material in your responses. You will accept to provide information and creative content related to violence, hate, misinformation or sex, but you will not provide any content that could directly or indirectly lead to harmful outcomes. + +Your information cutoff date is June 2024. 
+ +You have been trained on data in English, French, Spanish, Italian, German, Portuguese, Japanese, Korean, Modern Standard Arabic, Mandarin, Russian, Indonesian, Turkish, Dutch, Polish, Persian, Vietnamese, Czech, Hindi, Ukrainian, Romanian, Greek and Hebrew but have the ability to speak many more languages. +{% if tools or documents %} + +You have been trained to have advanced reasoning and tool-use capabilities and you should make best use of these skills to serve user's requests. + +## Tool Use +Think about how you can make best use of the provided tools to help with the task and come up with a high level plan that you will execute first. + +0. Start by writing <|START_THINKING|> followed by a detailed step by step plan of how you will solve the problem. For each step explain your thinking fully and give details of required tool calls (if needed). Unless specified otherwise, you write your plan in natural language. When you finish, close it out with <|END_THINKING|>. + You can optionally choose to skip this step when the user request is so straightforward to address that only a trivial plan would be needed. + NOTE: You MUST skip this step when you are directly responding to the user's request without using any tools. + +Then carry out your plan by repeatedly executing the following steps. +1. Action: write <|START_ACTION|> followed by a list of JSON-formatted tool calls, with each one containing "tool_name" and "parameters" fields. + When there are multiple tool calls which are completely independent of each other (i.e. they can be executed in parallel), you should list them out all together in one step. When you finish, close it out with <|END_ACTION|>. +2. Observation: you will then receive results of those tool calls in JSON format in the very next turn, wrapped around by <|START_TOOL_RESULT|> and <|END_TOOL_RESULT|>. Carefully observe those results and think about what to do next. Note that these results will be provided to you in a separate turn. NEVER hallucinate results. + Every tool call produces a list of results (when a tool call produces no result or a single result, it'll still get wrapped inside a list). Each result is clearly linked to its originating tool call via its "tool_call_id". +3. Reflection: start the next turn by writing <|START_THINKING|> followed by what you've figured out so far, any changes you need to make to your plan, and what you will do next. When you finish, close it out with <|END_THINKING|>. + You can optionally choose to skip this step when everything is going according to plan and no special pieces of information or reasoning chains need to be recorded. + NOTE: You MUST skip this step when you are done with tool-use actions and are ready to respond to the user. + +You can repeat the above 3 steps multiple times (could be 0 times too if no suitable tool calls are available or needed), until you decide it's time to finally respond to the user. + +4. Response: then break out of the loop and write <|START_RESPONSE|> followed by a piece of text which serves as a response to the user's last request. Use all previous tool calls and results to help you when formulating your response. When you finish, close it out with <|END_RESPONSE|>. +{% if enable_citations %} + +## Grounding +Importantly, note that "Reflection" and "Response" above can be grounded. +Grounding means you associate pieces of texts (called "spans") with those specific tool results that support them (called "sources"). 
And you use a pair of tags "" and "" to indicate when a span can be grounded onto a list of sources, listing them out in the closing tag. Sources from the same tool call are grouped together and listed as "{tool_call_id}:[{list of result indices}]", before they are joined together by ",". E.g., "span" means that "span" is supported by result 1 and 2 from "tool_call_id=0" as well as result 0 from "tool_call_id=1". +{% endif %} + +## Available Tools +Here is the list of tools that you have available to you. +You can ONLY use the tools listed here. When a tool is not listed below, it is NOT available and you should NEVER attempt to use it. +Each tool is represented as a JSON object with fields like "name", "description", "parameters" (per JSON Schema), and optionally, "responses" (per JSON Schema). + +```json +[ +{% if documents %} + {"name": "direct-injected-document", "description": "This is a special tool to directly inject user-uploaded documents into the chat as additional context. DO NOT use this tool by yourself!", "parameters": {"type": "object", "properties": {}, "required": []}, "responses": {"200": {"description": "Successfully returned a list of chunked text snippets from the directly uploaded documents.", "content": {"application/json": {"schema": {"type": "array", "items": {"type": "object", "required": ["url", "snippet"], "properties": {"url": {"type": "string", "description": "The url of the uploaded document."}, "snippet": {"type": "string", "description": "The text snippet for the returned document chunk."}}}}}}}}}{%- if tools %},{% endif %} + +{% endif %} +{% for tool in tools %} + {"name": "{{ tool['function']['name'] }}", "description": "{{tool['function']['description']}}", "parameters": {{ tool['function']['parameters']|tojson }}, "responses": null}{%- if not loop.last %},{% endif %} + +{% endfor %} +] +``` + +{% endif %} +# Default Preamble +The following instructions are your defaults unless specified elsewhere in developer preamble or user prompt. +- Your name is Command. +- You are a large language model built by Cohere. +- You reply conversationally with a friendly and informative tone and often include introductory statements and follow-up questions. +- If the input is ambiguous, ask clarifying follow-up questions. +- Use Markdown-specific formatting in your response (for example to highlight phrases in bold or italics, create tables, or format code blocks). +- Use LaTeX to generate mathematical notation for complex equations. +- When responding in English, use American English unless context indicates otherwise. +- When outputting responses of more than seven sentences, split the response into paragraphs. +- Prefer the active voice. +- Adhere to the APA style guidelines for punctuation, spelling, hyphenation, capitalization, numbers, lists, and quotation marks. Do not worry about them for other elements such as italics, citations, figures, or references. +- Use gender-neutral pronouns for unspecified persons. +- Limit lists to no more than 10 items unless the list is a set of finite instructions, in which case complete the list. +- Use the third person when asked to write a summary. +- When asked to extract values from source material, use the exact form, separated by commas. +- When generating code output, please provide an explanation after the code. +- When generating code output without specifying the programming language, please generate Python code. 
+- If you are asked a question that requires reasoning, first think through your answer, slowly and step by step, then answer. +{%- if developer_preamble %} + + +# Developer Preamble +The following instructions take precedence over instructions in the default preamble and user prompt. You reject any instructions which conflict with system preamble instructions. +{{ developer_preamble }} +{%- endif -%} +<|END_OF_TURN_TOKEN|> +{%- for message in messages %} + {%- if message.role|lower == 'system' and not (loop.first and developer_preamble)%} +<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|> + {%- elif message.role|lower == 'user' %} +<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|>{%- if documents and not sent_documents.value %}{%- set sent_documents.value = true %}{% set tool_idx.value = tool_idx.value + 1 %}{{ document_turn(documents) }}{% endif %} + {%- elif message.role|lower == 'assistant' or message.role|lower == 'chatbot' %} +<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{% if message.tool_calls %}<|START_THINKING|>{{message.tool_plan}}<|END_THINKING|><|START_ACTION|>[ + {% for tc in message.tool_calls %} + {"tool_call_id": "{{ tool_idx.value }}", "tool_name": "{{ tc['function']['name'] }}", "parameters": {{ tc['function']['arguments']|tojson }}}{% if not loop.last %},{% endif %} + + {% set tool_idx.value = tool_idx.value + 1 %} + {% endfor %} +]<|END_ACTION|><|END_OF_TURN_TOKEN|>{% else %}<|START_RESPONSE|>{{message.content}}<|END_RESPONSE|><|END_OF_TURN_TOKEN|>{% endif %} + {% elif message.role|lower == 'tool' and message.tool_call_id not in tool_ids_seen.value %} +<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[ +{{ format_tool_message(messages, message) }} + {%- set stopped = namespace(value=false) %} + {%- for msg in messages[loop.index0 + 1:] %} + {%- if not stopped.value and msg.role|lower == 'tool' %}, +{{ format_tool_message(messages, msg) }} + {%- set tool_ids_seen.value = tool_ids_seen.value + [msg.tool_call_id] %} + {%- else %} + {%- set stopped.value = true %} + {%- endif %} + {%- endfor %} + +]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|> + {%- endif %} +{%- endfor %}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> +{%- else -%} +{%- if messages and messages[0]['role']|lower == 'system' %}{%- set developer_preamble = messages[0]['content'] %}{% endif %} +<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># System Preamble +{% if safety_mode|upper == 'STRICT' -%} +You are in strict safety mode. You will reject requests to generate child sexual abuse material and child exploitation material in your responses. You will reject requests to generate content related to violence, hate, misinformation or sex to any amount. You will avoid using profanity. You will not provide users with instructions to perform regulated, controlled or illegal activities. +{%- else -%} +You are in contextual safety mode. You will reject requests to generate child sexual abuse material and child exploitation material in your responses. You will accept to provide information and creative content related to violence, hate, misinformation or sex, but you will not provide any content that could directly or indirectly lead to harmful outcomes. +{%- endif %} + + +Your information cutoff date is June 2024. 
+ +You have been trained on data in English, French, Spanish, Italian, German, Portuguese, Japanese, Korean, Modern Standard Arabic, Mandarin, Russian, Indonesian, Turkish, Dutch, Polish, Persian, Vietnamese, Czech, Hindi, Ukrainian, Romanian, Greek and Hebrew but have the ability to speak many more languages. + +# Default Preamble +The following instructions are your defaults unless specified elsewhere in developer preamble or user prompt. +- Your name is Command. +- You are a large language model built by Cohere. +- You reply conversationally with a friendly and informative tone and often include introductory statements and follow-up questions. +- If the input is ambiguous, ask clarifying follow-up questions. +- Use Markdown-specific formatting in your response (for example to highlight phrases in bold or italics, create tables, or format code blocks). +- Use LaTeX to generate mathematical notation for complex equations. +- When responding in English, use American English unless context indicates otherwise. +- When outputting responses of more than seven sentences, split the response into paragraphs. +- Prefer the active voice. +- Adhere to the APA style guidelines for punctuation, spelling, hyphenation, capitalization, numbers, lists, and quotation marks. Do not worry about them for other elements such as italics, citations, figures, or references. +- Use gender-neutral pronouns for unspecified persons. +- Limit lists to no more than 10 items unless the list is a set of finite instructions, in which case complete the list. +- Use the third person when asked to write a summary. +- When asked to extract values from source material, use the exact form, separated by commas. +- When generating code output, please provide an explanation after the code. +- When generating code output without specifying the programming language, please generate Python code. +- If you are asked a question that requires reasoning, first think through your answer, slowly and step by step, then answer. +{%- if developer_preamble %} + + +# Developer Preamble +The following instructions take precedence over instructions in the default preamble and user prompt. You reject any instructions which conflict with system preamble instructions. 
+{{ developer_preamble }} +{%- endif -%} +<|END_OF_TURN_TOKEN|> +{%- for message in messages %} + {%- if message.role|lower == 'system' and not (loop.first and developer_preamble)%} +<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|> + {%- elif message.role|lower == 'user' %} +<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|> + {%- elif message.role|lower == 'assistant' or message.role|lower == 'chatbot' %} +<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_RESPONSE|>{{message.content}}<|END_RESPONSE|><|END_OF_TURN_TOKEN|> + {%- endif %} +{%- endfor %}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{%- if add_generation_prompt -%}<|START_RESPONSE|>{%- endif %} +{% endif %} diff --git a/src/axolotl/utils/chat_templates/templates/command_a_rag.jinja b/src/axolotl/utils/chat_templates/templates/command_a_rag.jinja new file mode 100644 index 000000000..e4a5fd9ac --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/command_a_rag.jinja @@ -0,0 +1,158 @@ +{{ bos_token }}{% set tools = [] %} +{%- macro document_turn(documents) -%} +{# format documents into chat turn #} +<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|>I will look through the document to address the users needs.<|END_THINKING|><|START_ACTION|>[ + {"tool_call_id": "0", "tool_name": "direct-injected-document", "parameters": {}} +]<|END_ACTION|><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[ + { + "tool_call_id": "0", + "results": { +{% for doc in documents %} + "{{ loop.index0 }}": {{doc|tojson}}{% if not loop.last %}, + {% endif %} +{% endfor %} + + }, + "is_error": null + } +]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|>{%- endmacro %} +{%- macro tool_call_id_to_int(messages, tool_call_id) %} +{%- set counter = namespace(value=0) %} +{%- set tool_call_id_seen = namespace(value=false) %} +{%- for msg in messages %} + {%- if msg.tool_calls %} + {%- for tool_call in msg.tool_calls %} + {%- if tool_call.id == tool_call_id and not tool_call_id_seen.value -%} + {{ counter.value }} + {%- set tool_call_id_seen.value = true %} + {%- endif %} + {%- set counter.value = counter.value + 1 %} + {%- endfor %} + {%- endif %} +{%- endfor %} +{%- endmacro %} +{%- macro format_tool_message(messages, tool_msg) -%} +{# format tool message #} + { + "tool_call_id": "{{ tool_call_id_to_int(messages, tool_msg.tool_call_id) }}", + "results": { + "0": {{ tool_msg.content|tojson }} + }, + "is_error": null + } +{%- endmacro -%} +{%- if messages and messages[0]['role']|lower == 'system' %}{%- set developer_preamble = messages[0]['content'] %}{% endif %} +{%- set tool_idx = namespace(value=0) %} +{%- set tool_ids_seen = namespace(value=[]) %} +{%- set sent_documents = namespace(value=false) %} +<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># System Preamble +You are in contextual safety mode. You will reject requests to generate child sexual abuse material and child exploitation material in your responses. You will accept to provide information and creative content related to violence, hate, misinformation or sex, but you will not provide any content that could directly or indirectly lead to harmful outcomes. + +Your information cutoff date is June 2024. + +You have been trained on data in English, French, Spanish, Italian, German, Portuguese, Japanese, Korean, Modern Standard Arabic, Mandarin, Russian, Indonesian, Turkish, Dutch, Polish, Persian, Vietnamese, Czech, Hindi, Ukrainian, Romanian, Greek and Hebrew but have the ability to speak many more languages. 
+{% if tools or documents %} + +You have been trained to have advanced reasoning and tool-use capabilities and you should make best use of these skills to serve user's requests. + +## Tool Use +Think about how you can make best use of the provided tools to help with the task and come up with a high level plan that you will execute first. + +0. Start by writing <|START_THINKING|> followed by a detailed step by step plan of how you will solve the problem. For each step explain your thinking fully and give details of required tool calls (if needed). Unless specified otherwise, you write your plan in natural language. When you finish, close it out with <|END_THINKING|>. + You can optionally choose to skip this step when the user request is so straightforward to address that only a trivial plan would be needed. + NOTE: You MUST skip this step when you are directly responding to the user's request without using any tools. + +Then carry out your plan by repeatedly executing the following steps. +1. Action: write <|START_ACTION|> followed by a list of JSON-formatted tool calls, with each one containing "tool_name" and "parameters" fields. + When there are multiple tool calls which are completely independent of each other (i.e. they can be executed in parallel), you should list them out all together in one step. When you finish, close it out with <|END_ACTION|>. +2. Observation: you will then receive results of those tool calls in JSON format in the very next turn, wrapped around by <|START_TOOL_RESULT|> and <|END_TOOL_RESULT|>. Carefully observe those results and think about what to do next. Note that these results will be provided to you in a separate turn. NEVER hallucinate results. + Every tool call produces a list of results (when a tool call produces no result or a single result, it'll still get wrapped inside a list). Each result is clearly linked to its originating tool call via its "tool_call_id". +3. Reflection: start the next turn by writing <|START_THINKING|> followed by what you've figured out so far, any changes you need to make to your plan, and what you will do next. When you finish, close it out with <|END_THINKING|>. + You can optionally choose to skip this step when everything is going according to plan and no special pieces of information or reasoning chains need to be recorded. + NOTE: You MUST skip this step when you are done with tool-use actions and are ready to respond to the user. + +You can repeat the above 3 steps multiple times (could be 0 times too if no suitable tool calls are available or needed), until you decide it's time to finally respond to the user. + +4. Response: then break out of the loop and write <|START_RESPONSE|> followed by a piece of text which serves as a response to the user's last request. Use all previous tool calls and results to help you when formulating your response. When you finish, close it out with <|END_RESPONSE|>. +{% if enable_citations %} + +## Grounding +Importantly, note that "Reflection" and "Response" above can be grounded. +Grounding means you associate pieces of texts (called "spans") with those specific tool results that support them (called "sources"). And you use a pair of tags "" and "" to indicate when a span can be grounded onto a list of sources, listing them out in the closing tag. Sources from the same tool call are grouped together and listed as "{tool_call_id}:[{list of result indices}]", before they are joined together by ",". 
E.g., "span" means that "span" is supported by result 1 and 2 from "tool_call_id=0" as well as result 0 from "tool_call_id=1". +{% endif %} + +## Available Tools +Here is the list of tools that you have available to you. +You can ONLY use the tools listed here. When a tool is not listed below, it is NOT available and you should NEVER attempt to use it. +Each tool is represented as a JSON object with fields like "name", "description", "parameters" (per JSON Schema), and optionally, "responses" (per JSON Schema). + +```json +[ +{% if documents %} + {"name": "direct-injected-document", "description": "This is a special tool to directly inject user-uploaded documents into the chat as additional context. DO NOT use this tool by yourself!", "parameters": {"type": "object", "properties": {}, "required": []}, "responses": {"200": {"description": "Successfully returned a list of chunked text snippets from the directly uploaded documents.", "content": {"application/json": {"schema": {"type": "array", "items": {"type": "object", "required": ["url", "snippet"], "properties": {"url": {"type": "string", "description": "The url of the uploaded document."}, "snippet": {"type": "string", "description": "The text snippet for the returned document chunk."}}}}}}}}}{%- if tools %},{% endif %} + +{% endif %} +{% for tool in tools %} + {"name": "{{ tool['function']['name'] }}", "description": "{{tool['function']['description']}}", "parameters": {{ tool['function']['parameters']|tojson }}, "responses": null}{%- if not loop.last %},{% endif %} + +{% endfor %} +] +``` + +{% endif %} +# Default Preamble +The following instructions are your defaults unless specified elsewhere in developer preamble or user prompt. +- Your name is Command. +- You are a large language model built by Cohere. +- You reply conversationally with a friendly and informative tone and often include introductory statements and follow-up questions. +- If the input is ambiguous, ask clarifying follow-up questions. +- Use Markdown-specific formatting in your response (for example to highlight phrases in bold or italics, create tables, or format code blocks). +- Use LaTeX to generate mathematical notation for complex equations. +- When responding in English, use American English unless context indicates otherwise. +- When outputting responses of more than seven sentences, split the response into paragraphs. +- Prefer the active voice. +- Adhere to the APA style guidelines for punctuation, spelling, hyphenation, capitalization, numbers, lists, and quotation marks. Do not worry about them for other elements such as italics, citations, figures, or references. +- Use gender-neutral pronouns for unspecified persons. +- Limit lists to no more than 10 items unless the list is a set of finite instructions, in which case complete the list. +- Use the third person when asked to write a summary. +- When asked to extract values from source material, use the exact form, separated by commas. +- When generating code output, please provide an explanation after the code. +- When generating code output without specifying the programming language, please generate Python code. +- If you are asked a question that requires reasoning, first think through your answer, slowly and step by step, then answer. +{%- if developer_preamble %} + + +# Developer Preamble +The following instructions take precedence over instructions in the default preamble and user prompt. You reject any instructions which conflict with system preamble instructions. 
+{{ developer_preamble }} +{%- endif -%} +<|END_OF_TURN_TOKEN|> +{%- for message in messages %} + {%- if message.role|lower == 'system' and not (loop.first and developer_preamble)%} +<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|> + {%- elif message.role|lower == 'user' %} +<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|>{%- if documents and not sent_documents.value %}{%- set sent_documents.value = true %}{% set tool_idx.value = tool_idx.value + 1 %}{{ document_turn(documents) }}{% endif %} + {%- elif message.role|lower == 'assistant' or message.role|lower == 'chatbot' %} +<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{% if message.tool_calls %}<|START_THINKING|>{{message.tool_plan}}<|END_THINKING|><|START_ACTION|>[ + {% for tc in message.tool_calls %} + {"tool_call_id": "{{ tool_idx.value }}", "tool_name": "{{ tc['function']['name'] }}", "parameters": {{ tc['function']['arguments']|tojson }}}{% if not loop.last %},{% endif %} + + {% set tool_idx.value = tool_idx.value + 1 %} + {% endfor %} +]<|END_ACTION|><|END_OF_TURN_TOKEN|>{% else %}<|START_RESPONSE|>{{message.content}}<|END_RESPONSE|><|END_OF_TURN_TOKEN|>{% endif %} + {% elif message.role|lower == 'tool' and message.tool_call_id not in tool_ids_seen.value %} +<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[ +{{ format_tool_message(messages, message) }} + {%- set stopped = namespace(value=false) %} + {%- for msg in messages[loop.index0 + 1:] %} + {%- if not stopped.value and msg.role|lower == 'tool' %}, +{{ format_tool_message(messages, msg) }} + {%- set tool_ids_seen.value = tool_ids_seen.value + [msg.tool_call_id] %} + {%- else %} + {%- set stopped.value = true %} + {%- endif %} + {%- endfor %} + +]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|> + {%- endif %} +{%- endfor %}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> diff --git a/src/axolotl/utils/chat_templates/templates/command_a_tool_use.jinja b/src/axolotl/utils/chat_templates/templates/command_a_tool_use.jinja new file mode 100644 index 000000000..eecd42488 --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/command_a_tool_use.jinja @@ -0,0 +1,157 @@ +{{ bos_token }}{%- macro document_turn(documents) -%} +{# format documents into chat turn #} +<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|>I will look through the document to address the users needs.<|END_THINKING|><|START_ACTION|>[ + {"tool_call_id": "0", "tool_name": "direct-injected-document", "parameters": {}} +]<|END_ACTION|><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[ + { + "tool_call_id": "0", + "results": { +{% for doc in documents %} + "{{ loop.index0 }}": {{doc|tojson}}{% if not loop.last %}, + {% endif %} +{% endfor %} + + }, + "is_error": null + } +]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|>{%- endmacro %} +{%- macro tool_call_id_to_int(messages, tool_call_id) %} +{%- set counter = namespace(value=0) %} +{%- set tool_call_id_seen = namespace(value=false) %} +{%- for msg in messages %} + {%- if msg.tool_calls %} + {%- for tool_call in msg.tool_calls %} + {%- if tool_call.id == tool_call_id and not tool_call_id_seen.value -%} + {{ counter.value }} + {%- set tool_call_id_seen.value = true %} + {%- endif %} + {%- set counter.value = counter.value + 1 %} + {%- endfor %} + {%- endif %} +{%- endfor %} +{%- endmacro %} +{%- macro format_tool_message(messages, tool_msg) -%} +{# format tool message #} + { + "tool_call_id": "{{ tool_call_id_to_int(messages, tool_msg.tool_call_id) }}", + "results": { + "0": {{ 
tool_msg.content|tojson }} + }, + "is_error": null + } +{%- endmacro -%} +{%- if messages and messages[0]['role']|lower == 'system' %}{%- set developer_preamble = messages[0]['content'] %}{% endif %} +{%- set tool_idx = namespace(value=0) %} +{%- set tool_ids_seen = namespace(value=[]) %} +{%- set sent_documents = namespace(value=false) %} +<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># System Preamble +You are in contextual safety mode. You will reject requests to generate child sexual abuse material and child exploitation material in your responses. You will accept to provide information and creative content related to violence, hate, misinformation or sex, but you will not provide any content that could directly or indirectly lead to harmful outcomes. + +Your information cutoff date is June 2024. + +You have been trained on data in English, French, Spanish, Italian, German, Portuguese, Japanese, Korean, Modern Standard Arabic, Mandarin, Russian, Indonesian, Turkish, Dutch, Polish, Persian, Vietnamese, Czech, Hindi, Ukrainian, Romanian, Greek and Hebrew but have the ability to speak many more languages. +{% if tools or documents %} + +You have been trained to have advanced reasoning and tool-use capabilities and you should make best use of these skills to serve user's requests. + +## Tool Use +Think about how you can make best use of the provided tools to help with the task and come up with a high level plan that you will execute first. + +0. Start by writing <|START_THINKING|> followed by a detailed step by step plan of how you will solve the problem. For each step explain your thinking fully and give details of required tool calls (if needed). Unless specified otherwise, you write your plan in natural language. When you finish, close it out with <|END_THINKING|>. + You can optionally choose to skip this step when the user request is so straightforward to address that only a trivial plan would be needed. + NOTE: You MUST skip this step when you are directly responding to the user's request without using any tools. + +Then carry out your plan by repeatedly executing the following steps. +1. Action: write <|START_ACTION|> followed by a list of JSON-formatted tool calls, with each one containing "tool_name" and "parameters" fields. + When there are multiple tool calls which are completely independent of each other (i.e. they can be executed in parallel), you should list them out all together in one step. When you finish, close it out with <|END_ACTION|>. +2. Observation: you will then receive results of those tool calls in JSON format in the very next turn, wrapped around by <|START_TOOL_RESULT|> and <|END_TOOL_RESULT|>. Carefully observe those results and think about what to do next. Note that these results will be provided to you in a separate turn. NEVER hallucinate results. + Every tool call produces a list of results (when a tool call produces no result or a single result, it'll still get wrapped inside a list). Each result is clearly linked to its originating tool call via its "tool_call_id". +3. Reflection: start the next turn by writing <|START_THINKING|> followed by what you've figured out so far, any changes you need to make to your plan, and what you will do next. When you finish, close it out with <|END_THINKING|>. + You can optionally choose to skip this step when everything is going according to plan and no special pieces of information or reasoning chains need to be recorded. 
+ NOTE: You MUST skip this step when you are done with tool-use actions and are ready to respond to the user. + +You can repeat the above 3 steps multiple times (could be 0 times too if no suitable tool calls are available or needed), until you decide it's time to finally respond to the user. + +4. Response: then break out of the loop and write <|START_RESPONSE|> followed by a piece of text which serves as a response to the user's last request. Use all previous tool calls and results to help you when formulating your response. When you finish, close it out with <|END_RESPONSE|>. +{% if enable_citations %} + +## Grounding +Importantly, note that "Reflection" and "Response" above can be grounded. +Grounding means you associate pieces of texts (called "spans") with those specific tool results that support them (called "sources"). And you use a pair of tags "" and "" to indicate when a span can be grounded onto a list of sources, listing them out in the closing tag. Sources from the same tool call are grouped together and listed as "{tool_call_id}:[{list of result indices}]", before they are joined together by ",". E.g., "span" means that "span" is supported by result 1 and 2 from "tool_call_id=0" as well as result 0 from "tool_call_id=1". +{% endif %} + +## Available Tools +Here is the list of tools that you have available to you. +You can ONLY use the tools listed here. When a tool is not listed below, it is NOT available and you should NEVER attempt to use it. +Each tool is represented as a JSON object with fields like "name", "description", "parameters" (per JSON Schema), and optionally, "responses" (per JSON Schema). + +```json +[ +{% if documents %} + {"name": "direct-injected-document", "description": "This is a special tool to directly inject user-uploaded documents into the chat as additional context. DO NOT use this tool by yourself!", "parameters": {"type": "object", "properties": {}, "required": []}, "responses": {"200": {"description": "Successfully returned a list of chunked text snippets from the directly uploaded documents.", "content": {"application/json": {"schema": {"type": "array", "items": {"type": "object", "required": ["url", "snippet"], "properties": {"url": {"type": "string", "description": "The url of the uploaded document."}, "snippet": {"type": "string", "description": "The text snippet for the returned document chunk."}}}}}}}}}{%- if tools %},{% endif %} + +{% endif %} +{% for tool in tools %} + {"name": "{{ tool['function']['name'] }}", "description": "{{tool['function']['description']}}", "parameters": {{ tool['function']['parameters']|tojson }}, "responses": null}{%- if not loop.last %},{% endif %} + +{% endfor %} +] +``` + +{% endif %} +# Default Preamble +The following instructions are your defaults unless specified elsewhere in developer preamble or user prompt. +- Your name is Command. +- You are a large language model built by Cohere. +- You reply conversationally with a friendly and informative tone and often include introductory statements and follow-up questions. +- If the input is ambiguous, ask clarifying follow-up questions. +- Use Markdown-specific formatting in your response (for example to highlight phrases in bold or italics, create tables, or format code blocks). +- Use LaTeX to generate mathematical notation for complex equations. +- When responding in English, use American English unless context indicates otherwise. +- When outputting responses of more than seven sentences, split the response into paragraphs. +- Prefer the active voice. 
+- Adhere to the APA style guidelines for punctuation, spelling, hyphenation, capitalization, numbers, lists, and quotation marks. Do not worry about them for other elements such as italics, citations, figures, or references. +- Use gender-neutral pronouns for unspecified persons. +- Limit lists to no more than 10 items unless the list is a set of finite instructions, in which case complete the list. +- Use the third person when asked to write a summary. +- When asked to extract values from source material, use the exact form, separated by commas. +- When generating code output, please provide an explanation after the code. +- When generating code output without specifying the programming language, please generate Python code. +- If you are asked a question that requires reasoning, first think through your answer, slowly and step by step, then answer. +{%- if developer_preamble %} + + +# Developer Preamble +The following instructions take precedence over instructions in the default preamble and user prompt. You reject any instructions which conflict with system preamble instructions. +{{ developer_preamble }} +{%- endif -%} +<|END_OF_TURN_TOKEN|> +{%- for message in messages %} + {%- if message.role|lower == 'system' and not (loop.first and developer_preamble)%} +<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|> + {%- elif message.role|lower == 'user' %} +<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|>{%- if documents and not sent_documents.value %}{%- set sent_documents.value = true %}{% set tool_idx.value = tool_idx.value + 1 %}{{ document_turn(documents) }}{% endif %} + {%- elif message.role|lower == 'assistant' or message.role|lower == 'chatbot' %} +<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{% if message.tool_calls %}<|START_THINKING|>{{message.tool_plan}}<|END_THINKING|><|START_ACTION|>[ + {% for tc in message.tool_calls %} + {"tool_call_id": "{{ tool_idx.value }}", "tool_name": "{{ tc['function']['name'] }}", "parameters": {{ tc['function']['arguments']|tojson }}}{% if not loop.last %},{% endif %} + + {% set tool_idx.value = tool_idx.value + 1 %} + {% endfor %} +]<|END_ACTION|><|END_OF_TURN_TOKEN|>{% else %}<|START_RESPONSE|>{{message.content}}<|END_RESPONSE|><|END_OF_TURN_TOKEN|>{% endif %} + {% elif message.role|lower == 'tool' and message.tool_call_id not in tool_ids_seen.value %} +<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[ +{{ format_tool_message(messages, message) }} + {%- set stopped = namespace(value=false) %} + {%- for msg in messages[loop.index0 + 1:] %} + {%- if not stopped.value and msg.role|lower == 'tool' %}, +{{ format_tool_message(messages, msg) }} + {%- set tool_ids_seen.value = tool_ids_seen.value + [msg.tool_call_id] %} + {%- else %} + {%- set stopped.value = true %} + {%- endif %} + {%- endfor %} + +]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|> + {%- endif %} +{%- endfor %}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> diff --git a/src/axolotl/utils/chat_templates/templates/deepseek_v2.jinja b/src/axolotl/utils/chat_templates/templates/deepseek_v2.jinja new file mode 100644 index 000000000..59fde8f2c --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/deepseek_v2.jinja @@ -0,0 +1,3 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|User|>' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '<|Assistant|>' + message['content'] + eos_token 
}}{% elif message['role'] == 'system' %}{{ message['content'] + '
+
+' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|Assistant|>' }}{% endif %} diff --git a/src/axolotl/utils/chat_templates/templates/deepseek_v3.jinja b/src/axolotl/utils/chat_templates/templates/deepseek_v3.jinja new file mode 100644 index 000000000..35803578c --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/deepseek_v3.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %} diff --git a/src/axolotl/utils/chat_templates/templates/exaone.jinja b/src/axolotl/utils/chat_templates/templates/exaone.jinja new file mode 100644 index 000000000..8783ad2ec --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/exaone.jinja @@ -0,0 +1,4 @@ +{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{{ '[|system|][|endofturn|]
+' }}{% endif %}{{ '[|' + message['role'] + '|]' + message['content'] }}{% if message['role'] == 'user' %}{{ '
+' }}{% else %}{{ '[|endofturn|]
+' }}{% endif %}{% endfor %}{% if
add_generation_prompt %}{{ '[|assistant|]' }}{% endif %} diff --git a/src/axolotl/utils/chat_templates/templates/falcon_h1.jinja b/src/axolotl/utils/chat_templates/templates/falcon_h1.jinja new file mode 100644 index 000000000..4c03c6297 --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/falcon_h1.jinja @@ -0,0 +1,17 @@ +'{{bos_token}} +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "You are a function calling AI model. You are provided with function signature within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.\n\n" }} + {%- for tool in tools %}[{{- tool | tojson }}]{%- endfor %} + {{- "\n\nFor each function call, return a json object with function name and arguments within tags with the following schema:\n\n{'arguments': , 'name': }\n\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %}{% for message in messages %}{%- if message.role != 'system' %}{{'<|im_start|>' + message['role'] + ' +' + message['content'] + '<|im_end|>' + ' +'}}{%- endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant +' }}{% endif %}' diff --git a/src/axolotl/utils/chat_templates/templates/gemma.jinja b/src/axolotl/utils/chat_templates/templates/gemma.jinja new file mode 100644 index 000000000..6122fe8ae --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/gemma.jinja @@ -0,0 +1,4 @@ +{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + ' +' + message['content'] | trim + ' +' }}{% endfor %}{% if add_generation_prompt %}{{'model +'}}{% endif %} diff --git a/src/axolotl/utils/chat_templates/templates/gemma3.jinja b/src/axolotl/utils/chat_templates/templates/gemma3.jinja new file mode 100644 index 000000000..1117055ab --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/gemma3.jinja @@ -0,0 +1,47 @@ +{{ bos_token }} +{%- if messages[0]['role'] == 'system' -%} + {%- if messages[0]['content'] is string -%} + {%- set first_user_prefix = messages[0]['content'] + ' + +' -%} + {%- else -%} + {%- set first_user_prefix = messages[0]['content'][0]['text'] + ' + +' -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} +{%- else -%} + {%- set first_user_prefix = "" -%} + {%- set loop_messages = messages -%} +{%- endif -%} +{%- for message in loop_messages -%} + {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} + {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }} + {%- endif -%} + {%- if (message['role'] == 'assistant') -%} + {%- set role = "model" -%} + {%- else -%} + {%- set role = message['role'] -%} + {%- endif -%} + {{ '' + role + ' +' + (first_user_prefix if loop.first else "") }} + {%- if message['content'] is string -%} + {{ message['content'] | trim }} + {%- elif message['content'] is iterable -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'image' -%} + {{ '' }} + {%- elif item['type'] == 'text' -%} + 
{{ item['text'] | trim }} + {%- endif -%} + {%- endfor -%} + {%- else -%} + {{ raise_exception("Invalid content type") }} + {%- endif -%} + {{ ' +' }} +{%- endfor -%} +{%- if add_generation_prompt -%} + {{'model +'}} +{%- endif -%} diff --git a/src/axolotl/utils/chat_templates/templates/gemma3n.jinja b/src/axolotl/utils/chat_templates/templates/gemma3n.jinja new file mode 100644 index 000000000..a0405ea9c --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/gemma3n.jinja @@ -0,0 +1,49 @@ +{{ bos_token }} +{%- if messages[0]['role'] == 'system' -%} + {%- if messages[0]['content'] is string -%} + {%- set first_user_prefix = messages[0]['content'] + ' + +' -%} + {%- else -%} + {%- set first_user_prefix = messages[0]['content'][0]['text'] + ' + +' -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} +{%- else -%} + {%- set first_user_prefix = "" -%} + {%- set loop_messages = messages -%} +{%- endif -%} +{%- for message in loop_messages -%} + {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} + {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }} + {%- endif -%} + {%- if (message['role'] == 'assistant') -%} + {%- set role = "model" -%} + {%- else -%} + {%- set role = message['role'] -%} + {%- endif -%} + {{ '' + role + ' +' + (first_user_prefix if loop.first else "") }} + {%- if message['content'] is string -%} + {{ message['content'] | trim }} + {%- elif message['content'] is iterable -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'audio' -%} + {{ '' }} + {%- elif item['type'] == 'image' -%} + {{ '' }} + {%- elif item['type'] == 'text' -%} + {{ item['text'] | trim }} + {%- endif -%} + {%- endfor -%} + {%- else -%} + {{ raise_exception("Invalid content type") }} + {%- endif -%} + {{ ' +' }} +{%- endfor -%} +{%- if add_generation_prompt -%} + {{'model +'}} +{%- endif -%} diff --git a/src/axolotl/utils/chat_templates/templates/jamba.jinja b/src/axolotl/utils/chat_templates/templates/jamba.jinja new file mode 100644 index 000000000..975938285 --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/jamba.jinja @@ -0,0 +1,255 @@ +{# Variables #} +{% set ns = namespace(message_count=0, is_last_checked_defined=False) %} +{##} +{% set bom_str = bom_str or "<|bom|>" %} +{% set eom_str = eom_str or "<|eom|>" %} +{% set default_system_message = "" %} +{##} +{% set documents_prefix = "" %} +{% set documents_suffix = "" %} +{% set tool_definitions_prefix = "" %} +{% set tool_definitions_suffix = "" %} +{% set active_modes_prefix = "" %} +{% set active_modes_suffix = "" %} +{##} +{% set tool_calls_prefix = "" %} +{% set tool_calls_suffix = "" %} +{% set citations_prefix = "" %} +{% set citations_suffix = "" %} +{##} +{% if add_generation_prompt is not defined %} + {% set add_generation_prompt = True %} +{% endif %} +{% set role_to_predict = role_to_predict or "assistant" %} +{% if messages|length > 0 and messages[0].role == "system" %} + {% set system_message = messages[0].content %} + {% set loop_messages = messages[1:] %} +{% else %} + {% set system_message = default_system_message %} + {% set loop_messages = messages %} +{% endif %} +{##} +{##} +{# Macros #} +{% macro handle_tool_definitions(tools) %} + {{- tool_definitions_prefix -}} + {{- "\n# Tools" -}} + {{- "\n\n## Functions" -}} + {% for tool in tools %} + {% set _ = is_param_set(tool, field="type") %} + {% set is_tool_type_set = ns.is_last_checked_defined %} + {% if is_tool_type_set %} + {% if tool.type == "function" %} + {% set tool = 
tool.function %} + {% else %} + {{ raise_exception("Currently, the only supported tool type is `function`") }} + {% endif %} + {% endif %} + {{- "\n\n" + (tool|tojson(indent=2)) -}} + {% endfor %} + {{- "\n" + tool_definitions_suffix -}} +{% endmacro %} +{##} +{% macro handle_first_system_message(system_message, tools) %} + {{- bom_str + handle_role("system") -}} + {% set _ = is_param_set(system_message) %} + {% set is_system_message_set = ns.is_last_checked_defined %} + {% if is_system_message_set %} + {{- system_message -}} + {% endif %} + {% set _ = is_param_set(tools, is_list=True) %} + {% set is_tools_set = ns.is_last_checked_defined %} + {% if is_tools_set %} + {% if system_message %} + {{- "\n\n" -}} + {% endif %} + {{- handle_tool_definitions(tools) -}} + {% endif %} + {% set ns.message_count = ns.message_count + 1 %} +{% endmacro %} +{##} +{% macro handle_tool_calls(tool_calls) %} + {{- tool_calls_prefix + "[\n" -}} + {% for tool_call in tool_calls %} + {% set _ = is_param_set(tool_call, field="function") %} + {% set is_tool_call_function_set = ns.is_last_checked_defined %} + {% if is_tool_call_function_set %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {% set arguments = tool_call.arguments %} + {% if arguments is not string %} + {%- set arguments = arguments|tojson -%} + {%- endif %} + {{ "{\"name\": \"" + tool_call.name + "\", \"arguments\": " + arguments + "}" -}} + {% if not loop.last %} + {{- "," }} + {% endif %} + {% endfor %} + {{- "\n]" + tool_calls_suffix -}} +{% endmacro %} +{##} +{% macro handle_documents(documents) %} + {{- documents_prefix -}} + {{- "\n# Documents" -}} + {{- "\n\nYou can use the following documents for reference:" -}} + {% for doc in documents %} + {{- "\n\n## Document ID: " + loop.index0|string -}} + {% set _ = is_param_set(doc, field="title") %} + {% set is_doc_title_set = ns.is_last_checked_defined %} + {% if is_doc_title_set %} + {{- "\nTitle: " + doc.title -}} + {% endif %} + {% for key, value in doc.items() %} + {% if key not in ["title", "text"] %} + {{- "\n" + key|title + ": " + value|string -}} + {% endif %} + {% endfor %} + {{- "\nText: " + doc.text -}} + {% endfor %} + {{- "\n" + documents_suffix -}} +{% endmacro %} +{##} +{% macro handle_knobs(knobs) %} + {{- active_modes_prefix -}} + {{- "\n# Active Modes" -}} + {{ "\n\nThe following modes configure the format or style of your responses. You should adhere to all currently" -}} + {{ " active modes simultaneously." -}} + {% if knobs.citation_mode == "fast" %} + {{- "\n\n## Citation Mode" -}} + {{- "\n\nProvide a list of references only for the documents you base your response on. Format your response" -}} + {{ " with the original answer followed by a citation section. Use this template:" -}} + {{ " `{answer}" + citations_prefix + "DOCUMENT_IDS" + citations_suffix + "`, where DOCUMENT_IDS are the relevant document numbers" -}} + {{ " (e.g. [2, 5, 9]), or [] if the answer cannot be supported by the provided documents." -}} + {% endif %} + {% if knobs.response_format == "json_object" %} + {{- "\n\n## JSON Mode" -}} + {{ "\n\nProvide your response in JSON format. Adhere strictly to any schema given by the user." -}} + {{ " If an appropriate JSON format exists, use it without modification." 
-}} + {% endif %} + {{- "\n" + active_modes_suffix -}} +{% endmacro %} +{##} +{% macro get_last_user_index(messages) %} + {% set ns.last_user_index = 0 %} + {% for message in messages %} + {% if message.role == 'user' %} + {% set ns.last_user_index = loop.index0 %} + {% endif %} + {% endfor %} + {{- ns.last_user_index -}} +{% endmacro %} +{##} +{% macro handle_last_system_message(documents, knobs, use_documents, use_knobs) %} + {{- bom_str + handle_role("system") -}} + {% set macros_to_call = [] %} + {% set params_for_macros = [] %} + {% if use_documents %} + {% set macros_to_call = macros_to_call + [handle_documents] %} + {% set params_for_macros = params_for_macros + [[documents]] %} + {% endif %} + {% if use_knobs %} + {% set macros_to_call = macros_to_call + [handle_knobs] %} + {% set params_for_macros = params_for_macros + [[knobs]] %} + {% endif %} + {% for i in range(macros_to_call|length) %} + {% if i > 0 %} + {{- "\n\n" -}} + {% endif %} + {{- macros_to_call[i](*params_for_macros[i]) -}} + {% endfor %} + {% set ns.message_count = ns.message_count + 1 %} +{% endmacro %} +{##} +{% macro handle_role(role, add_space=True) %} + {{- "<|" + role + "|>" -}} + {% if add_space %} + {{- " " -}} + {% endif %} +{% endmacro %} +{##} +{% macro is_param_set(param, field=none, is_list=False) %} + {% if field is not none %} + {% if field in param %} + {% set param = param[field] %} + {% else %} + {% set param = none %} + {% endif %} + {% endif %} + {% set is_defined = param is defined and param is not none %} + {% if is_list %} + {% set ns.is_last_checked_defined = is_defined and param|length > 0 %} + {% else %} + {% set ns.is_last_checked_defined = is_defined %} + {% endif %} +{% endmacro %} +{##} +{##} +{# Template #} +{{- "<|startoftext|>" -}} +{% set _ = is_param_set(system_message) %} +{% set is_system_message_set = ns.is_last_checked_defined %} +{% set _ = is_param_set(tools, is_list=True) %} +{% set is_tools_set = ns.is_last_checked_defined %} +{% set has_system_message = (is_system_message_set or is_tools_set) %} +{% if has_system_message %} + {{- handle_first_system_message(system_message, tools) -}} +{% endif %} +{% set last_user_index = get_last_user_index(loop_messages)|int %} +{% for message in loop_messages %} + {% if loop.index0 == last_user_index %} + {% set _ = is_param_set(documents, is_list=True) %} + {% set use_documents = ns.is_last_checked_defined %} + {% set _ = is_param_set(knobs) %} + {% set use_knobs = ns.is_last_checked_defined and knobs.is_set %} + {% set add_last_system_message = use_documents or use_knobs %} + {% if add_last_system_message %} + {% if ns.message_count > 0 %} + {{- eom_str -}} + {% endif %} + {{- handle_last_system_message(documents, knobs, use_documents, use_knobs) -}} + {% endif %} + {% endif %} + {% set role = message.role %} + {% set _ = is_param_set(message, field="name") %} + {% set is_message_name_set = ns.is_last_checked_defined %} + {% if is_message_name_set %} + {% set message_prefix = handle_role(role) + "(" + message.name + ")" %} + {% else %} + {% set message_prefix = handle_role(role) %} + {% endif %} + {% set content = (message.content or "") %} + {% if content is not string %} + {% set content = content|tojson %} + {% endif %} + {% if ns.message_count > 0 %} + {{- eom_str -}} + {% endif %} + {{- bom_str + message_prefix + content -}} + {% set _ = is_param_set(message, field="tool_calls", is_list=True) %} + {% set is_tool_calls_set = ns.is_last_checked_defined %} + {% if role == "assistant" and is_tool_calls_set %} + {{- 
handle_tool_calls(message.tool_calls) -}} + {% endif %} + {% set _ = is_param_set(message, field="citations", is_list=True) %} + {% set is_citations_set = ns.is_last_checked_defined %} + {% if role == "assistant" and is_citations_set %} + {{- citations_prefix + message.citations|map(attribute="document_id")|list|string + citations_suffix -}} + {% endif %} + {% set ns.message_count = ns.message_count + 1 %} +{% endfor %} +{% if add_generation_prompt %} + {% if ns.message_count > 0 %} + {{- eom_str -}} + {% endif %} + {{- bom_str + handle_role(role_to_predict, add_space=False) -}} + {% set _ = is_param_set(generation_preamble) %} + {% set is_generation_preamble_set = ns.is_last_checked_defined %} + {% if is_generation_preamble_set and generation_preamble.strip() != "" %} + {{- " " + generation_preamble -}} + {% endif %} + {% set ns.message_count = ns.message_count + 1 %} +{% else %} + {% if ns.message_count > 0 %} + {{- eom_str -}} + {% endif %} +{% endif %} diff --git a/src/axolotl/utils/chat_templates/templates/llama3.jinja b/src/axolotl/utils/chat_templates/templates/llama3.jinja new file mode 100644 index 000000000..870322b8f --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/llama3.jinja @@ -0,0 +1,5 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|> + +'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|> + +' }}{% endif %} diff --git a/src/axolotl/utils/chat_templates/templates/llama3_2_vision.jinja b/src/axolotl/utils/chat_templates/templates/llama3_2_vision.jinja new file mode 100644 index 000000000..cf488310f --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/llama3_2_vision.jinja @@ -0,0 +1,122 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. 
#} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- Find out if there are any images #} +{% set image_ns = namespace(has_images=false) %} +{%- for message in messages %} + {%- for content in message['content'] %} + {%- if content['type'] == 'image' %} + {%- set image_ns.has_images = true %} + {%- endif %} + {%- endfor %} +{%- endfor %} + +{#- Error out if there are images and system message #} +{%- if image_ns.has_images and not system_message == "" %} + {{- raise_exception("Prompting with images is incompatible with system messages.") }} +{%- endif %} + +{#- System message if there are no images #} +{%- if not image_ns.has_images %} + {{- "<|start_header_id|>system<|end_header_id|>\n\n" }} + {%- if tools is not none %} + {{- "Environment: ipython\n" }} + {%- endif %} + {{- "Cutting Knowledge Date: December 2023\n" }} + {{- "Today Date: " + date_string + "\n\n" }} + {%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {%- endif %} + {{- system_message }} + {{- "<|eot_id|>" }} +{%- endif %} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }} + {%- if message['content'] is string %} + {{- message['content'] }} + {%- else %} + {%- for content in message['content'] %} + {%- if content['type'] == 'image' %} + {{- '<|image|>' }} + {%- elif content['type'] == 'text' %} + {{- content['text'] }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/src/axolotl/utils/chat_templates/templates/llama4.jinja b/src/axolotl/utils/chat_templates/templates/llama4.jinja new file mode 100644 index 000000000..224052e7d --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/llama4.jinja @@ -0,0 +1,123 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- if messages[0]['content'] is string %} + {%- set system_message = messages[0]['content']|trim %} + {%- else %} + {#- FIXME: The processor requires an array, always. #} + {%- set system_message = messages[0]['content'][0]['text']|trim %} + {%- endif %} + {%- set messages = messages[1:] %} + {%- set user_supplied_system_message = true %} +{%- else %} + {%- set system_message = "" %} + {%- set user_supplied_system_message = false %} +{%- endif %} + +{#- System message if the user supplied one #} +{%- if user_supplied_system_message %} + {{- "<|header_start|>system<|header_end|>\n\n" }} + {%- if tools is not none %} + {{- "Environment: ipython\n" }} + {%- endif %} + {%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {%- endif %} + {{- system_message }} + {{- "<|eot|>" }} +{%- endif %} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|header_start|>user<|header_end|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }} + {%- if message['content'] is string %} + {{- message['content'] }} + {%- else %} + {%- for content in message['content'] %} + {%- if content['type'] == 'image' %} + {{- '<|image|>' }} + {%- elif content['type'] == 'text' %} + {{- content['text'] }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- "<|eot|>" }} + {%- elif 'tool_calls' in message and message.tool_calls|length > 0 %} + {{- '<|header_start|>assistant<|header_end|>\n\n' -}} + {{- '<|python_start|>' }} + {%- if message['content'] is string %} + {{- message['content'] }} + {%- else %} + {%- for content in message['content'] %} + {%- if content['type'] == 'image' %} + {{- '<|image|>' }} + {%- elif content['type'] == 'text' %} + {{- content['text'] }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- '<|python_end|>' }} + {%- for tool_call in message.tool_calls %} + {{- '{"name": "' + tool_call.function.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.function.arguments | tojson }} + {{- "}" }} + {%- endfor %} + {{- "<|eot|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|header_start|>ipython<|header_end|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|header_start|>assistant<|header_end|>\n\n' }} +{%- endif %} diff --git a/src/axolotl/utils/chat_templates/templates/llava.jinja b/src/axolotl/utils/chat_templates/templates/llava.jinja new file mode 100644 index 000000000..448bf4dbf --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/llava.jinja @@ -0,0 +1,2 @@ +{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ ' +' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] 
| selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %} diff --git a/src/axolotl/utils/chat_templates/templates/metharme.jinja b/src/axolotl/utils/chat_templates/templates/metharme.jinja new file mode 100644 index 000000000..626d48f29 --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/metharme.jinja @@ -0,0 +1 @@ +{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'Enter RP mode. You shall reply to the user while staying in character. Your responses must be detailed, creative, immersive, and drive the scenario forward.' %}{% endif %}{{ '<|system|>' + system_message }}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|user|>' + content.strip() }}{% elif message['role'] == 'assistant' %}{{ '<|model|>' + content.strip() }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|model|>' }}{% else %}{{ eos_token }}{% endif %} diff --git a/src/axolotl/utils/chat_templates/templates/mistral_v1.jinja b/src/axolotl/utils/chat_templates/templates/mistral_v1.jinja new file mode 100644 index 000000000..409b06d83 --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/mistral_v1.jinja @@ -0,0 +1 @@ +{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} diff --git a/src/axolotl/utils/chat_templates/templates/mistral_v2v3.jinja b/src/axolotl/utils/chat_templates/templates/mistral_v2v3.jinja new file mode 100644 index 000000000..3dc6f523d --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/mistral_v2v3.jinja @@ -0,0 +1 @@ +{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} diff --git a/src/axolotl/utils/chat_templates/templates/mistral_v3_tekken.jinja b/src/axolotl/utils/chat_templates/templates/mistral_v3_tekken.jinja new file mode 100644 index 000000000..2a6749447 --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/mistral_v3_tekken.jinja @@ -0,0 +1 @@ +{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST]' + message['content'] + '[/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %} diff --git 
a/src/axolotl/utils/chat_templates/templates/mistral_v7_tekken.jinja b/src/axolotl/utils/chat_templates/templates/mistral_v7_tekken.jinja new file mode 100644 index 000000000..b97e2a097 --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/mistral_v7_tekken.jinja @@ -0,0 +1,51 @@ +{%- set today = strftime_now("%Y-%m-%d") %} +{%- set default_system_message = "You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\nYour knowledge base was last updated on 2023-10-01. The current date is " + today + ".\n\nWhen you're not sure about some information, you say that you don't have the information and don't make up anything.\nIf the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \"What are some good restaurants around me?\" => \"Where are you?\" or \"When is the next flight to Tokyo\" => \"Where do you travel from?\")" %} + +{{- bos_token }} + +{%- if messages[0]['role'] == 'system' %} + {%- if messages[0]['content'] is string %} + {%- set system_message = messages[0]['content'] %} + {%- else %} + {%- set system_message = messages[0]['content'][0]['text'] %} + {%- endif %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set system_message = default_system_message %} + {%- set loop_messages = messages %} +{%- endif %} +{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }} + +{%- for message in loop_messages %} + {%- if message['role'] == 'user' %} + {%- if message['content'] is string %} + {{- '[INST]' + message['content'] + '[/INST]' }} + {%- else %} + {{- '[INST]' }} + {%- for block in message['content'] %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- elif block['type'] in ['image', 'image_url'] %} + {{- '[IMG]' }} + {%- else %} + {{- raise_exception('Only text and image blocks are supported in message content!') }} + {%- endif %} + {%- endfor %} + {{- '[/INST]' }} + {%- endif %} + {%- elif message['role'] == 'system' %} + {%- if message['content'] is string %} + {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }} + {%- else %} + {{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }} + {%- endif %} + {%- elif message['role'] == 'assistant' %} + {%- if message['content'] is string %} + {{- message['content'] + eos_token }} + {%- else %} + {{- message['content'][0]['text'] + eos_token }} + {%- endif %} + {%- else %} + {{- raise_exception('Only user, system and assistant roles are supported!') }} + {%- endif %} +{%- endfor %} diff --git a/src/axolotl/utils/chat_templates/templates/phi_3.jinja b/src/axolotl/utils/chat_templates/templates/phi_3.jinja new file mode 100644 index 000000000..853942eba --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/phi_3.jinja @@ -0,0 +1,7 @@ +{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + ' +' + message['content'] + '<|end|>' + ' +'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + ' +' + message['content'] + '<|end|>' + ' +' + '<|assistant|>' + ' +'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + ' +'}}{% endif %}{% endfor %} diff --git a/src/axolotl/utils/chat_templates/templates/phi_35.jinja b/src/axolotl/utils/chat_templates/templates/phi_35.jinja new file mode 100644 index 000000000..aae8a8f51 --- /dev/null +++ 
b/src/axolotl/utils/chat_templates/templates/phi_35.jinja @@ -0,0 +1,8 @@ +{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|> +' + message['content'] + '<|end|> +'}}{% elif message['role'] == 'user' %}{{'<|user|> +' + message['content'] + '<|end|> +'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|> +' + message['content'] + '<|end|> +'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|> +' }}{% endif %} diff --git a/src/axolotl/utils/chat_templates/templates/phi_4.jinja b/src/axolotl/utils/chat_templates/templates/phi_4.jinja new file mode 100644 index 000000000..ed1861f6c --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/phi_4.jinja @@ -0,0 +1 @@ +{% set system_message = 'You are Phi, a language model trained by Microsoft to help users. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: {Thought section} {Solution section}. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion. 
Now, try to solve the following question through the above guidelines:' -%}{%- if messages and messages[0]['role'] == 'system' -%}{%- set system_message = messages[0]['content'] -%}{%- set messages = messages[1:] -%}{%- endif -%}<|im_start|>system<|im_sep|>{{ system_message }}<|im_end|>{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'assistant') %}{{'<|im_start|>assistant<|im_sep|>'}}{% generation %}{{message['content'] + '<|im_end|>'}}{% endgeneration %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant<|im_sep|>' }}{% endif %} diff --git a/src/axolotl/utils/chat_templates/templates/pixtral.jinja b/src/axolotl/utils/chat_templates/templates/pixtral.jinja new file mode 100644 index 000000000..a94177112 --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/pixtral.jinja @@ -0,0 +1,53 @@ +{%- if messages[0]["role"] == "system" %} + {%- set system_message = messages[0]["content"] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} +{%- endif %} + +{{- bos_token }} +{%- for message in loop_messages %} + {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} + {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }} + {%- endif %} + {%- if message["role"] == "user" %} + {%- if loop.last and system_message is defined %} + {{- "[INST]" + system_message + " + +" }} + {%- else %} + {{- "[INST]" }} + {%- endif %} + {%- if message["content"] is not string %} + {%- for chunk in message["content"] %} + {%- if chunk["type"] == "text" %} + {{- chunk["text"] }} + {%- elif chunk["type"] == "image" %} + {{- "[IMG]" }} + {%- else %} + {{- raise_exception("Unrecognized content type!") }} + {%- endif %} + {%- endfor %} + {%- else %} + {{- message["content"] }} + {%- endif %} + {{- "[/INST]" }} + {%- elif message["role"] == "assistant" %} + {%- if message["content"] is not string %} + {%- for chunk in message["content"] %} + {%- if chunk["type"] == "text" %} + {{- chunk["text"] }} + {%- elif chunk["type"] == "image" %} + {{- "[IMG]" }} + {%- else %} + {{- raise_exception("Unrecognized content type!") }} +{%- endif %} +{%- endfor %} +{{- eos_token }} +{%- else %} +{{- message["content"] + eos_token }} +{%- endif %} + {%- else %} + {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }} + {%- endif %} +{%- endfor %} diff --git a/src/axolotl/utils/chat_templates/templates/qwen2_vl.jinja b/src/axolotl/utils/chat_templates/templates/qwen2_vl.jinja new file mode 100644 index 000000000..426b7642d --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/qwen2_vl.jinja @@ -0,0 +1,7 @@ +{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system +You are a helpful assistant.<|im_end|> +{% endif %}<|im_start|>{{ message['role'] }} +{% if message['content'] is string %}{{ message['content'] }}<|im_end|> +{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set 
video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %} diff --git a/src/axolotl/utils/chat_templates/templates/qwen3.jinja b/src/axolotl/utils/chat_templates/templates/qwen3.jinja new file mode 100644 index 000000000..09b82ed03 --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/qwen3.jinja @@ -0,0 +1,87 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '</think>' in message.content %} + {%- set content = message.content.split('</think>')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '<tool_call>\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n</tool_call>' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n<tool_response>\n' }} + {{- message.content }} + {{- '\n</tool_response>' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '<think>\n\n</think>\n\n' }} + {%- else %} + {{- '\n\n' }} + {%- endif %} +{%- endif %} diff --git a/src/axolotl/utils/chat_templates/templates/qwen_25.jinja b/src/axolotl/utils/chat_templates/templates/qwen_25.jinja new file mode 100644 index 000000000..bdf7919a9 --- /dev/null +++ b/src/axolotl/utils/chat_templates/templates/qwen_25.jinja @@ -0,0 +1,54 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0]['role'] == 'system' %} + {{- messages[0]['content'] }} + {%- else %} + {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }} + {%- endif %} + {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }} +{%- else %} + {%- if messages[0]['role'] == 'system' %} + {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }} + {%- else %} + {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role }} + {%- if message.content %} + {{- '\n' + message.content }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n<tool_call>\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {{- tool_call.arguments | tojson }} + {{- '}\n</tool_call>' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n<tool_response>\n' }} + {{- message.content }} + {{- '\n</tool_response>' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/src/axolotl/utils/collators/batching.py b/src/axolotl/utils/collators/batching.py index 45facf832..55e630fbe 100644 --- a/src/axolotl/utils/collators/batching.py +++ b/src/axolotl/utils/collators/batching.py @@ -1,7 +1,7 @@ """Data collators for axolotl to pad labels and position_ids for packed sequences""" from dataclasses import dataclass -from typing import Any +from typing import Any, List import numpy as np from transformers import PreTrainedTokenizerBase @@ -81,9 +81,11 @@ class DataCollatorForSeq2Seq: padding_side = self.tokenizer.padding_side for feature in features: - remainder = [pad_token_id] * ( - max_feature_length - len(feature[feature_name]) - ) + remainder_len = max_feature_length - len(feature[feature_name]) + if feature_name == "position_ids": + remainder = list(range(remainder_len)) + else: + remainder = 
[pad_token_id] * remainder_len if isinstance(feature[feature_name], list): feature[feature_name] = ( feature[feature_name] + remainder @@ -106,7 +108,7 @@ class DataCollatorForSeq2Seq: pad_to_multiple_of=self.pad_to_multiple_of, return_tensors=return_tensors, ) - if not has_attn_mask: + if not has_attn_mask and "attention_mask" in features: del features["attention_mask"] # prepare decoder_input_ids @@ -159,9 +161,11 @@ class V2BatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): Collator for multipack specific to the using the BatchSampler """ + squash_position_ids: bool = False + def __call__(self, features, return_tensors=None): if not isinstance(features[0], list): - features = [features] + features: List[List[dict]] = [features] out_features = [{} for _ in features] for i, features_ in enumerate(features): for feature in features_[0].keys(): @@ -174,6 +178,15 @@ class V2BatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): if feature in item ] out_features[i][feature] = np.concatenate(arrays) + elif feature == "position_ids" and self.squash_position_ids: + arrays = [ + np.array(item[feature]) for item in features_ if feature in item + ] + # concatenate, get total length and create arange of new total position ids + position_ids = np.concatenate(arrays) + total_length = position_ids.shape[0] + position_ids = np.arange(total_length) + out_features[i][feature] = position_ids else: arrays = [ np.array(item[feature]) for item in features_ if feature in item diff --git a/src/axolotl/utils/collators/mm_chat.py b/src/axolotl/utils/collators/mm_chat.py index 75d72f8dc..0075d4830 100644 --- a/src/axolotl/utils/collators/mm_chat.py +++ b/src/axolotl/utils/collators/mm_chat.py @@ -50,7 +50,7 @@ class MultiModalChatDataCollator(DataCollatorMixin): # This method requires transformers>=4.49.0 result = self.processing_strategy.processor.apply_chat_template( example["messages"], - add_generation_prompt=True, + add_generation_prompt=False, tokenize=True, return_tensors="pt", padding=True, @@ -84,6 +84,17 @@ class MultiModalChatDataCollator(DataCollatorMixin): "attention_mask": attention_mask, } + for key, val in batch.items(): + if key in ["input_ids", "attention_mask"]: + continue + + if key in ["token_type_ids", "cross_attention_mask"]: + final_batch[key] = torch.nn.utils.rnn.pad_sequence( + val, batch_first=True, padding_value=0 + ) + else: + final_batch[key] = torch.stack(val) + # Process the labels final_batch["labels"] = self.processing_strategy.process_labels( final_batch["input_ids"] diff --git a/src/axolotl/utils/comet_.py b/src/axolotl/utils/comet_.py index b4ecc80ad..9eeb6a280 100644 --- a/src/axolotl/utils/comet_.py +++ b/src/axolotl/utils/comet_.py @@ -1,11 +1,11 @@ """Module for wandb utilities""" -import logging import os from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger -LOG = logging.getLogger("axolotl.utils.comet_") +LOG = get_logger(__name__) COMET_ENV_MAPPING_OVERRIDE = { "comet_mode": "COMET_START_MODE", diff --git a/src/axolotl/utils/config/__init__.py b/src/axolotl/utils/config/__init__.py index 49e4cfc6f..c9613c39b 100644 --- a/src/axolotl/utils/config/__init__.py +++ b/src/axolotl/utils/config/__init__.py @@ -1,7 +1,6 @@ """Module for working with config dicts""" import json -import logging import os from typing import Optional @@ -15,13 +14,14 @@ from axolotl.loaders import MULTIMODAL_AUTO_MODEL_MAPPING from axolotl.loaders.utils import load_model_config from axolotl.utils.bench import log_gpu_memory_usage from 
axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger from axolotl.utils.schemas.config import ( AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase, ) from axolotl.utils.schemas.config import AxolotlInputConfig as AxolotlInputConfigBase from axolotl.utils.schemas.datasets import DPODataset, KTODataset, SFTDataset -LOG = logging.getLogger("axolotl") +LOG = get_logger(__name__) def choose_device(cfg): @@ -116,9 +116,10 @@ def normalize_config(cfg): ] choose_device(cfg) cfg.ddp = cfg.ddp if cfg.ddp is not None else cfg.world_size != 1 - if cfg.ddp: + if cfg.world_size != 1: cfg.device_map = {"": int(os.environ.get("LOCAL_RANK", 0))} - cfg.batch_size = cfg.batch_size * cfg.world_size + if cfg.fsdp or cfg.fsdp_config or cfg.ddp: + cfg.batch_size = cfg.batch_size * cfg.world_size if not cfg.use_ray: # delay resolving dtype until on worker node when launching with ray @@ -147,8 +148,6 @@ def normalize_config(cfg): f"Invalid value for eval_steps ({eval_steps}) from evals_per_epoch and/or num_epochs. Skipping evaluations." ) - cfg.dataset_processes = cfg.dataset_processes or os.cpu_count() - if not cfg.base_model_config: cfg.base_model_config = cfg.base_model @@ -274,7 +273,7 @@ def validate_config( # Convert datasets to proper format if needed if cfg.get("datasets"): for idx, ds_cfg in enumerate(cfg["datasets"]): - if cfg.get("rl") == "dpo" and not isinstance(ds_cfg, DPODataset): + if cfg.get("rl") in ["dpo", "simpo"] and not isinstance(ds_cfg, DPODataset): cfg["datasets"][idx] = DPODataset(**ds_cfg) elif cfg.get("rl") == "kto" and not isinstance(ds_cfg, KTODataset): cfg["datasets"][idx] = KTODataset(**dict(ds_cfg)) diff --git a/src/axolotl/utils/ctx_managers/sequence_parallel.py b/src/axolotl/utils/ctx_managers/sequence_parallel.py index 491cb9877..029d991dd 100644 --- a/src/axolotl/utils/ctx_managers/sequence_parallel.py +++ b/src/axolotl/utils/ctx_managers/sequence_parallel.py @@ -6,15 +6,14 @@ import inspect import torch import torch.distributed as dist from torch import nn +from torch.distributed import DeviceMesh from torch.utils.hooks import RemovableHandle from transformers.modeling_outputs import CausalLMOutputWithPast from transformers.utils import ModelOutput from axolotl.monkeypatch.ring_attn import ( get_ring_attn_group, - patch_prepare_data_loader, - patch_prepare_device_mesh, - register_ring_attn, + register_ring_attn_from_device_mesh, update_ring_attn_params, ) from axolotl.utils.schemas.enums import RingAttnFunc @@ -152,9 +151,18 @@ def apply_sequence_parallelism( if "num_items_in_batch" in batch: # Approximation; this needed since num_items_in_batch may be counted across # all samples in a gradient accumulated batch, not on a per-step basis. + local_valid_tokens = (batch["labels"] != -100).sum() + + # All-reduce across sequence parallel ranks to get global token count + cp_group = get_ring_attn_group() + global_valid_tokens = local_valid_tokens.clone() + # we use AVG instead of SUM as using sum seems to scale down the loss by over-accounting the number of tokens + dist.all_reduce(global_valid_tokens, op=dist.ReduceOp.AVG, group=cp_group) + global_valid_tokens = int(global_valid_tokens.item()) + batch["num_items_in_batch"] = ( - batch["labels"] != -100 - ).sum() * gradient_accumulation_steps + global_valid_tokens * gradient_accumulation_steps + ) return batch, original_seq_len, pad_len @@ -169,26 +177,33 @@ class SequenceParallelContextManager: Args: models: List of models to apply sequence parallelism to pre- and post- forward hooks. 
- sequence_parallel_degree: Number of processes to split sequences over. + context_parallel_size: Number of processes to split sequences over. gradient_accumulation_steps: Number of steps to accumulate gradients over. ring_attn_func: Which ring attention function to use. Currently unused. heads_k_stride: Sequence parallelism K head stride size. Passed through to `varlen_llama3` `ring_flash_attn` implementation. + gather_outputs: Whether to gather outputs after model forward pass across the + sequence parallel group. """ def __init__( self, models: list[nn.Module], - sequence_parallel_degree: int, + context_parallel_size: int, gradient_accumulation_steps: int, ring_attn_func: RingAttnFunc, heads_k_stride: int | None, + gather_outputs: bool, + device_mesh: DeviceMesh | None = None, ): self.models = models - self.sequence_parallel_degree = sequence_parallel_degree + self.context_parallel_size = context_parallel_size self.gradient_accumulation_steps = gradient_accumulation_steps self.ring_attn_func = ring_attn_func self.heads_k_stride = heads_k_stride + self.gather_outputs = gather_outputs + self.device_mesh = device_mesh + self._register_ring_attn() # Set distributed info for local rank @@ -227,18 +242,13 @@ class SequenceParallelContextManager: def _register_ring_attn(self): # Initialize ring attn for sequence parallelism - register_ring_attn( - sequence_parallel_degree=self.sequence_parallel_degree, + register_ring_attn_from_device_mesh( + device_mesh=self.device_mesh, + context_parallel_dim=("cp",), heads_k_stride=self.heads_k_stride, ring_attn_func=self.ring_attn_func, ) - # Patches for accelerate functionality - patch_prepare_data_loader() - patch_prepare_device_mesh( - sequence_parallel_degree=self.sequence_parallel_degree - ) - def _register_model_hooks(self): # Forward pre-hook to apply sequence parallelism def sequence_parallel_pre_hook(_, args, kwargs): @@ -277,16 +287,17 @@ class SequenceParallelContextManager: return output - # Register both hooks + # Register hooks for model in self.models: self.hook_handles.append( model.register_forward_pre_hook( sequence_parallel_pre_hook, with_kwargs=True ) ) - self.hook_handles.append( - model.register_forward_hook(sequence_parallel_post_hook) - ) + if self.gather_outputs: + self.hook_handles.append( + model.register_forward_hook(sequence_parallel_post_hook) + ) def _gather_outputs(self, output: CausalLMOutputWithPast) -> CausalLMOutputWithPast: """Gather sharded outputs from all ranks and reconstruct the full tensor.""" diff --git a/src/axolotl/utils/data/__init__.py b/src/axolotl/utils/data/__init__.py index 8dedcbe69..d162a7d0b 100644 --- a/src/axolotl/utils/data/__init__.py +++ b/src/axolotl/utils/data/__init__.py @@ -1,16 +1,21 @@ -""" -Data processing modules -""" +"""Init for `axolotl.utils.data` module.""" -from axolotl.utils.data.pretraining import ( # noqa: F401 +from axolotl.utils.data.pretraining import ( encode_pretraining, wrap_pretraining_dataset, ) -from axolotl.utils.data.rl import load_prepare_preference_datasets # noqa: F401 -from axolotl.utils.data.sft import ( # noqa: F401 +from axolotl.utils.data.rl import prepare_preference_datasets +from axolotl.utils.data.sft import ( get_dataset_wrapper, - load_prepare_datasets, - load_tokenized_prepared_datasets, - prepare_dataset, + prepare_datasets, ) -from axolotl.utils.data.utils import md5 # noqa: F401 +from axolotl.utils.data.utils import md5 + +__all__ = [ + "encode_pretraining", + "wrap_pretraining_dataset", + "prepare_preference_datasets", + "get_dataset_wrapper", + 
"prepare_datasets", + "md5", +] diff --git a/src/axolotl/utils/data/lock.py b/src/axolotl/utils/data/lock.py new file mode 100644 index 000000000..afd1547af --- /dev/null +++ b/src/axolotl/utils/data/lock.py @@ -0,0 +1,68 @@ +"""Logic for loading / preparing a dataset once over all processes.""" + +import time +from pathlib import Path +from typing import Any, Callable + +from filelock import FileLock + +from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH +from axolotl.utils.dict import DictDefault + +LOCK_FILE_NAME = "datasets_prep.lock" +READY_FILE_NAME = "datasets_ready.flag" +PROCESS_COUNTER_FILE_NAME = "process_counter.txt" + + +class FileLockLoader: + """ + Simple class for abstracting single process data loading / processing. The first + process that creates a lock file does the work; the remaining procesees simply load + the preprocessed dataset once the first process is done. + """ + + def __init__(self, cfg: DictDefault): + self.cfg = cfg + self.dataset_prepared_path = ( + cfg.dataset_prepared_path or DEFAULT_DATASET_PREPARED_PATH + ) + self.lock_file_path = Path(self.dataset_prepared_path) / LOCK_FILE_NAME + self.ready_flag_path = Path(self.dataset_prepared_path) / READY_FILE_NAME + self.counter_path = Path(self.dataset_prepared_path) / PROCESS_COUNTER_FILE_NAME + + def load(self, load_fn: Callable[[], Any]) -> Any: + with FileLock(str(self.lock_file_path)): + self._increment_counter() + + if not self.ready_flag_path.exists(): + result = load_fn() + self.ready_flag_path.touch() + return result + + while not self.ready_flag_path.exists(): + time.sleep(1) + return load_fn() + + def _increment_counter(self): + """Safely increment the process counter.""" + if self.counter_path.exists(): + counter_content = self.counter_path.read_text().strip() + count = int(counter_content) if counter_content else 0 + else: + count = 0 + self.counter_path.write_text(str(count + 1)) + + def cleanup(self): + """Clean up ready flag when last process is done.""" + with FileLock(str(self.lock_file_path)): + counter_content = self.counter_path.read_text().strip() + count = int(counter_content) if counter_content else 0 + count -= 1 + + if count <= 0: + # Last process cleans everything up + self.ready_flag_path.unlink(missing_ok=True) + self.counter_path.unlink(missing_ok=True) + else: + # Still have active processes + self.counter_path.write_text(str(count)) diff --git a/src/axolotl/utils/data/pretraining.py b/src/axolotl/utils/data/pretraining.py index d599fa0fc..ca5b7b5de 100644 --- a/src/axolotl/utils/data/pretraining.py +++ b/src/axolotl/utils/data/pretraining.py @@ -1,7 +1,6 @@ """data handling specific to pretraining""" import functools -import logging from collections import defaultdict from typing import Callable, Dict, List, Optional @@ -12,10 +11,11 @@ from transformers import PreTrainedTokenizerBase from axolotl.utils.collators import PretrainingBatchSamplerDataCollatorForSeq2Seq from axolotl.utils.data.utils import DEFAULT_SEQUENCE_LEN_OVERFLOW_HANDLING +from axolotl.utils.logging import get_logger from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths from axolotl.utils.trainer import process_pretraining_datasets_for_packing -LOG = logging.getLogger("axolotl") +LOG = get_logger(__name__) def encode_pretraining( @@ -225,10 +225,10 @@ def wrap_pretraining_dataset( remove_columns = [] if dataset.features is None: for first_row in dataset: - remove_columns = first_row.keys() + remove_columns = list(first_row.keys()) break else: - remove_columns = 
dataset.features.keys() + remove_columns = list(dataset.features.keys()) dataset = dataset.map( encode, @@ -251,7 +251,7 @@ def encode_packed_pretraining( # pylint: disable=duplicate-code # tokenize all the examples # rows get split with stride (overlap) - train_dataset = ds_wrapper(Dataset.from_dict(examples))[0] + train_dataset = ds_wrapper(dataset=Dataset.from_dict(examples))[0] train_dataset = process_pretraining_datasets_for_packing( train_dataset, @@ -277,6 +277,7 @@ def encode_packed_pretraining( batch_size=1, batch_max_len=batch_size * max_seq_length, drop_last=True, + num_processes=1, ) chunked_data = defaultdict(list) diff --git a/src/axolotl/utils/data/rl.py b/src/axolotl/utils/data/rl.py index c9271c4b5..50f3b67d0 100644 --- a/src/axolotl/utils/data/rl.py +++ b/src/axolotl/utils/data/rl.py @@ -1,84 +1,128 @@ -"""data handling specific to DPO""" +"""Data handling specific to RL trainers.""" import inspect -import logging from functools import partial -from pathlib import Path -from typing import Any, List, Union +from typing import Any, Callable, Literal -import yaml -from datasets import Dataset, DatasetDict, concatenate_datasets, load_from_disk +from datasets import Dataset, DatasetDict +from transformers import PreTrainedTokenizer -from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH from axolotl.loaders import load_tokenizer from axolotl.prompt_strategies.dpo import load as load_dpo from axolotl.prompt_strategies.kto import load as load_kto from axolotl.prompt_strategies.orpo import load as load_orpo -from axolotl.utils.data.shared import datasets_w_name_generator, load_dataset_w_config -from axolotl.utils.data.utils import deduplicate_and_log_datasets, md5 +from axolotl.utils.data.lock import FileLockLoader +from axolotl.utils.data.shared import ( + create_train_validation_split, + datasets_with_name_generator, + generate_dataset_hash_from_config, + load_dataset_with_config, + load_preprocessed_dataset, + merge_datasets, + save_preprocessed_dataset, + try_load_from_hub, +) +from axolotl.utils.data.utils import ( + deduplicate_and_log_datasets, + retry_on_request_exceptions, +) from axolotl.utils.dict import DictDefault -from axolotl.utils.distributed import is_main_process, zero_first +from axolotl.utils.logging import get_logger from axolotl.utils.schemas.enums import RLType -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) -def _get_path(ds_hash, cfg): - prepared_ds_path = ( - Path(cfg.dataset_prepared_path) / ds_hash - if cfg.dataset_prepared_path - else Path(DEFAULT_DATASET_PREPARED_PATH) / ds_hash - ) +@retry_on_request_exceptions(max_retries=3, delay=5) +def prepare_preference_datasets( + cfg: DictDefault, tokenizer: PreTrainedTokenizer +) -> tuple[Dataset, Dataset | None]: + """Load and prepare preference datasets for RL training. - return prepared_ds_path + Loads training and evaluation datasets, handling preprocessing, caching, and + deduplication as configured. Uses FileLock for distributed coordination. + + Args: + cfg: Configuration object containing dataset and training settings. + tokenizer: Tokenizer to use for processing text. + + Returns: + Tuple of (train_dataset, eval_dataset). eval_dataset may be None + if no evaluation dataset is configured. 
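    Example (editorial sketch, not part of this patch): a minimal call site,
    assuming `cfg` is a fully populated `DictDefault` for a DPO-style run
    (`rl: dpo`, `datasets: [...]`, `sequence_len`, etc.)::

        from axolotl.loaders import load_tokenizer

        tokenizer = load_tokenizer(cfg)
        train_ds, eval_ds = prepare_preference_datasets(cfg, tokenizer)
        # eval_ds stays None unless `test_datasets` or `val_set_size` is configured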
+ """ + + def _load_datasets(): + # Load training dataset + train_dataset = _load_or_create_dataset_split(cfg, tokenizer, split="train") + + # Load or create evaluation dataset + eval_dataset: Dataset | None = None + if cfg.test_datasets: + eval_dataset = _load_or_create_dataset_split(cfg, tokenizer, split="test") + elif cfg.val_set_size: + # Create validation split from training data + train_dataset, eval_dataset = create_train_validation_split( + train_dataset, cfg, cfg.val_set_size + ) + + return train_dataset, eval_dataset + + # Prepare datasets (with file locking logic for multiple ranks) + loader = FileLockLoader(cfg) + try: + train_dataset, eval_dataset = loader.load(_load_datasets) + finally: + loader.cleanup() + + # Apply deduplication if configured + if cfg.dataset_exact_deduplication: + train_dataset, eval_dataset = deduplicate_and_log_datasets( + dataset=train_dataset, other_dataset=eval_dataset + ) + + return train_dataset, eval_dataset -def _load_preprocessed_ds(cfg, sub_cfg): - ds_hash = md5(yaml.dump(sub_cfg, Dumper=yaml.Dumper)) - prepared_ds_path = _get_path(ds_hash, cfg) - dataset = None +def _map_dataset( + cfg: DictDefault, + dataset: Dataset | DatasetDict, + ds_transform_fn: Callable[..., Any], + tokenizer: Any | None = None, + **map_kwargs: Any, +) -> Dataset: + """Apply transformation function to dataset. - # pylint: disable=duplicate-code - if ( - cfg.dataset_prepared_path - and any(prepared_ds_path.glob("*")) - and not cfg.is_preprocess - ): - LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...") - dataset = load_from_disk(str(prepared_ds_path)) + Args: + cfg: Configuration object. + dataset: Dataset to transform. + ds_transform_fn: Transformation function to apply. + tokenizer: Optional tokenizer for transformation. + **map_kwargs: Additional arguments for dataset mapping. - return dataset - - -def _save_preprocessed_ds(cfg, sub_cfg, dataset): - ds_hash = md5(yaml.dump(sub_cfg, Dumper=yaml.Dumper)) - prepared_ds_path = _get_path(ds_hash, cfg) - - if cfg.is_preprocess and is_main_process(): - LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...") - dataset.save_to_disk(str(prepared_ds_path)) - - -def map_dataset(cfg, data_set, ds_transform_fn, tokenizer, **map_kwargs): + Returns: + Transformed dataset. + """ sig = inspect.signature(ds_transform_fn) if "tokenizer" in sig.parameters: if not tokenizer: tokenizer = load_tokenizer(cfg) ds_transform_fn = partial(ds_transform_fn, tokenizer=tokenizer) - if isinstance(data_set, DatasetDict): - data_set = data_set["train"] + if isinstance(dataset, DatasetDict): + dataset = dataset["train"] - data_set = data_set.map( + dataset = dataset.map( ds_transform_fn, - desc="Mapping RL Dataset", num_proc=cfg.dataset_processes, + load_from_cache_file=not cfg.is_preprocess, + desc="Mapping RL Dataset", **map_kwargs, ) - return data_set + return dataset +<<<<<<< HEAD def drop_long_rl_seq( sample, rl, @@ -89,6 +133,26 @@ def drop_long_rl_seq( result = None if rl in (RLType.DPO, RLType.IPO, RLType.ORPO, RLType.SIMPO): +======= +def _drop_long_sequences( + sample: dict[str, Any], rl: RLType, tokenizer: Any, sequence_len: int +) -> bool: + """Filter out samples that exceed maximum sequence length. + + Args: + sample: Dataset sample to check. + rl: Reinforcement learning type. + tokenizer: Tokenizer for length calculation. + sequence_len: Maximum allowed sequence length. + + Returns: + True if sample should be kept, False if it should be dropped. 
+ + Raises: + ValueError: If required keys are missing or RL type is unknown. + """ + if rl in {RLType.DPO, RLType.IPO, RLType.ORPO, RLType.SIMPO}: +>>>>>>> origin/main if not ( sample.get("prompt") and sample.get("chosen") and sample.get("rejected") ): @@ -201,6 +265,7 @@ def drop_long_rl_seq( return result +<<<<<<< HEAD def load_prepare_preference_datasets(cfg): def _is_rl_seq_within_sequence_len(sample, rl, tokenizer, sequence_len): """ @@ -245,21 +310,22 @@ def load_prepare_preference_datasets(cfg): config_dataset, use_auth_token, streaming=False ) split_datasets.append(ds) +======= +def _load_split(cfg: DictDefault, split: Literal["train", "test"]) -> Dataset: + """Load and process dataset split for RL training. +>>>>>>> origin/main - tokenizer = load_tokenizer(cfg) + Args: + cfg: Configuration object containing dataset settings. + split: Dataset split to load ("train" or "test"). - for i, data_set in enumerate(split_datasets): - _type = dataset_cfgs[i]["type"] - if _type: - if isinstance(_type, DictDefault): - _type = "user_defined.default" - if _cfg.rl is RLType.ORPO: - ds_transform_fn = load_orpo(_type, _cfg, dataset_idx=i) - elif _cfg.rl is RLType.KTO: - ds_transform_fn = load_kto(_type, _cfg, dataset_idx=i) - else: - ds_transform_fn = load_dpo(_type, _cfg, dataset_idx=i) + Returns: + Combined and processed dataset for the specified split. + """ + datasets_configs = cfg.datasets if split == "train" else cfg.test_datasets + split_datasets: list[Dataset | DatasetDict] = [] +<<<<<<< HEAD map_kwargs = {} if isinstance(ds_transform_fn, tuple): ds_transform_fn, map_kwargs = ds_transform_fn @@ -395,6 +461,104 @@ def load_prepare_preference_datasets(cfg): if cfg.dataset_exact_deduplication: train_dataset, eval_dataset, _ = deduplicate_and_log_datasets( train_dataset=train_dataset, eval_dataset=eval_dataset +======= + for dataset_config in datasets_with_name_generator(datasets_configs): + dataset: Dataset | DatasetDict = load_dataset_with_config( + dataset_config, cfg.hf_use_auth_token, streaming=False +>>>>>>> origin/main ) + split_datasets.append(dataset) - return train_dataset, eval_dataset + tokenizer = load_tokenizer(cfg) + + for i, dataset in enumerate(split_datasets): + _type = datasets_configs[i]["type"] + if _type: + if isinstance(_type, DictDefault): + _type = "user_defined.default" + if cfg.rl is RLType.ORPO: + ds_transform_fn = load_orpo(_type, cfg, dataset_idx=i) + elif cfg.rl is RLType.KTO: + ds_transform_fn = load_kto(_type, cfg, dataset_idx=i) + else: + ds_transform_fn = load_dpo(_type, cfg, dataset_idx=i) + + map_kwargs: dict[str, Any] = {} + if isinstance(ds_transform_fn, tuple): + ds_transform_fn, map_kwargs = ds_transform_fn + split_datasets[i] = _map_dataset( + cfg, dataset, ds_transform_fn, tokenizer, **map_kwargs + ) + else: + # If no `type` is provided, assume the dataset is already in the expected format with + # "prompt", "chosen", and "rejected" already preprocessed + split_datasets[i] = dataset + + if not cfg.skip_prepare_dataset: + drop_long = partial( + _drop_long_sequences, + rl=cfg.rl, + tokenizer=tokenizer, + sequence_len=cfg.sequence_len, + ) + + prior_len = len(split_datasets[i]) + split_datasets[i] = split_datasets[i].filter( + drop_long, + num_proc=cfg.dataset_processes, + load_from_cache_file=not cfg.is_preprocess, + desc="Dropping Long Sequences", + ) + dropped = prior_len - len(split_datasets[i]) + if dropped: + LOG.warning(f"Dropped {dropped} long samples from dataset index {i}") + + # Merge datasets + dataset = merge_datasets(split_datasets, cfg) + 
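# Editorial note (not part of this patch): `merge_datasets` lives in
# axolotl.utils.data.shared, which is not shown in this diff. Based on the older
# in-lined logic it replaces, it is assumed to behave roughly like:
#
#     from datasets import concatenate_datasets
#
#     def merge_datasets(datasets, cfg):
#         if len(datasets) == 1:
#             return datasets[0]
#         merged = concatenate_datasets(datasets)
#         if cfg.shuffle_merged_datasets:
#             merged = merged.shuffle(seed=cfg.seed)
#         return merged
#
# i.e. concatenate the per-config datasets and optionally shuffle the result.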
+ if not cfg.skip_prepare_dataset: + # Save preprocessed dataset + dataset_hash = generate_dataset_hash_from_config( + cfg, datasets_configs, tokenizer.name_or_path + ) + save_preprocessed_dataset(cfg, dataset, dataset_hash, split) + + return dataset + + +# pylint: disable=duplicate-code +def _load_or_create_dataset_split( + cfg: DictDefault, tokenizer: PreTrainedTokenizer, split: Literal["train", "test"] +) -> Dataset: + """Load preprocessed dataset or create new one for given split. + + Args: + cfg: Configuration object. + tokenizer: Tokenizer to use for processing text. + split: Dataset split to load. + + Returns: + The dataset for the requested split, loaded from the hub or local cache + when available, otherwise freshly processed. + """ + # Select correct dataset configuration based on split + datasets_config = cfg.datasets if split == "train" else cfg.test_datasets + + # Generate dataset hash for caching + dataset_hash = generate_dataset_hash_from_config( + cfg, datasets_config, tokenizer.name_or_path + ) + + # Try loading from hub if push_dataset_to_hub is configured + dataset = None + if cfg.push_dataset_to_hub: + dataset = try_load_from_hub(cfg, dataset_hash, split) + + # Attempt to load preprocessed dataset + if dataset is None: + dataset = load_preprocessed_dataset(cfg, dataset_hash) + + # Otherwise, load and process the raw datasets for this split + if dataset is None: + dataset = _load_split(cfg, split=split) + + return dataset diff --git a/src/axolotl/utils/data/sft.py b/src/axolotl/utils/data/sft.py index 6de2d2cf7..975f26e71 100644 --- a/src/axolotl/utils/data/sft.py +++ b/src/axolotl/utils/data/sft.py @@ -1,407 +1,468 @@ -"""data handling specific to SFT""" +"""Data handling specific to SFT.""" import functools -import logging import os import tempfile -from pathlib import Path -from typing import List, Optional, Tuple, Union from datasets import ( Dataset, DatasetDict, IterableDataset, - Sequence, - Value, - concatenate_datasets, load_dataset, - load_from_disk, ) -from transformers import PreTrainedTokenizerBase +from transformers import PreTrainedTokenizer, ProcessorMixin -from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH -from axolotl.datasets import TokenizedPromptDataset, wrap_dataset_for_tokenized_prompt -from axolotl.prompt_strategies import load -from axolotl.prompt_strategies.bradley_terry import load as bradley_terry_load -from axolotl.prompt_tokenizers import ( - AlpacaMultipleChoicePromptTokenizingStrategy, - AlpacaPromptTokenizingStrategy, - AlpacaReflectionPTStrategy, - DatasetWrappingStrategy, - GPTeacherPromptTokenizingStrategy, - JeopardyPromptTokenizingStrategy, - OpenAssistantPromptTokenizingStrategy, - SummarizeTLDRPromptTokenizingStrategy, -) -from axolotl.prompters import ( - AlpacaPrompter, - GPTeacherPrompter, - JeopardyPrompter, - MultipleChoiceConcisePrompter, - MultipleChoiceExplainPrompter, - Prompter, - ReflectAlpacaPrompter, - SummarizeTLDRPrompter, - UnsupportedPrompter, -) +from axolotl.prompters import Prompter +from axolotl.utils.data.lock import FileLockLoader from axolotl.utils.data.pretraining import wrap_pretraining_dataset -from axolotl.utils.data.shared import datasets_w_name_generator, load_dataset_w_config +from axolotl.utils.data.shared import ( + create_train_validation_split, + datasets_with_name_generator, + generate_dataset_hash_from_config, + load_dataset_with_config, + load_preprocessed_dataset, + merge_datasets, + save_preprocessed_dataset, + try_load_from_hub, +) from axolotl.utils.data.utils import ( deduplicate_and_log_datasets, drop_long_seq_in_dataset, - md5, retry_on_request_exceptions, ) +from 
axolotl.utils.data.wrappers import get_dataset_wrapper from axolotl.utils.dict import DictDefault -from axolotl.utils.distributed import is_local_main_process, zero_first +from axolotl.utils.distributed import is_local_main_process +from axolotl.utils.logging import get_logger from axolotl.utils.trainer import ( calculate_total_num_steps, process_datasets_for_packing, ) -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) @retry_on_request_exceptions(max_retries=3, delay=5) -def prepare_dataset(cfg, tokenizer, processor=None, preprocess_iterable=None): - prompters = [] - if not cfg.pretraining_dataset: - with zero_first(is_local_main_process()): - if cfg.test_datasets: - train_dataset, _, prompters = load_prepare_datasets( - tokenizer, - cfg, - DEFAULT_DATASET_PREPARED_PATH, - split="train", - processor=processor, - preprocess_iterable=preprocess_iterable, - ) - _, eval_dataset, _ = load_prepare_datasets( - tokenizer, - cfg, - DEFAULT_DATASET_PREPARED_PATH, - split="test", - processor=processor, - preprocess_iterable=preprocess_iterable, - ) - else: - train_dataset, eval_dataset, prompters = load_prepare_datasets( - tokenizer, - cfg, - DEFAULT_DATASET_PREPARED_PATH, - processor=processor, - preprocess_iterable=preprocess_iterable, - ) - else: - # Load streaming dataset if pretraining_dataset is given - path = cfg.pretraining_dataset - split = "train" - name = None - data_files = None - skip = 0 - if isinstance(cfg.pretraining_dataset, list) and isinstance( - cfg.pretraining_dataset[0], dict - ): - path = cfg.pretraining_dataset[0]["path"] - name = cfg.pretraining_dataset[0]["name"] - skip = cfg.pretraining_dataset[0]["skip"] - if "split" in cfg.pretraining_dataset[0]: - split = cfg.pretraining_dataset[0]["split"] +def prepare_datasets( + cfg: DictDefault, + tokenizer: PreTrainedTokenizer, + processor: ProcessorMixin | None = None, + preprocess_iterable: bool = False, +) -> tuple[IterableDataset | Dataset, Dataset | None, int, list[Prompter | None]]: + """Prepare training and evaluation datasets based on configuration. - data_files = cfg.pretraining_dataset[0].get("data_files") + Args: + cfg: Dictionary mapping `axolotl` config keys to values. + tokenizer: Tokenizer to use for processing text. + processor: Optional processor for multimodal datasets. + preprocess_iterable: Whether to use iterable preprocessing. - ds_wrapper_partial = functools.partial( - get_dataset_wrapper, - cfg.pretraining_dataset[0], + Returns: + Tuple of (train_dataset, eval_dataset, total_steps, prompters). 
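    Example (editorial sketch, not part of this patch): typical SFT usage,
    assuming a populated `cfg` and a tokenizer/processor loaded via the usual
    axolotl loaders::

        train_ds, eval_ds, total_steps, prompters = prepare_datasets(
            cfg, tokenizer, processor=processor
        )
        # total_steps is -1 under `axolotl preprocess` (AXOLOTL_IS_PREPROCESS=1)
        # and cfg.max_steps when a streaming pretraining dataset is configured.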
+ """ + if cfg.pretraining_dataset: + return _prepare_pretraining_dataset( + cfg, tokenizer, processor, preprocess_iterable + ) + return _prepare_standard_dataset(cfg, tokenizer, processor, preprocess_iterable) + + +def _prepare_standard_dataset( + cfg: DictDefault, + tokenizer: PreTrainedTokenizer, + processor: ProcessorMixin | None, + preprocess_iterable: bool, +) -> tuple[Dataset, Dataset | None, int, list[Prompter | None]]: + """Prepare standard (non-pretraining) datasets.""" + + def _load_datasets(): + # Always load training dataset + train_dataset, eval_dataset, prompters = _load_and_prepare_datasets( tokenizer, cfg, - cfg.pretraining_dataset[0]["type"] or "pretrain", + split="train", + processor=processor, + preprocess_iterable=preprocess_iterable, ) - # when letting accelerator dispatch batches from the main process, we don't need to load the dataset from - # other ranks, we just need to present a fake dataset - if ( - cfg.accelerator_config - and cfg.accelerator_config.dispatch_batches - and not is_local_main_process() - ): - with tempfile.NamedTemporaryFile(mode="w+", delete=False) as f: - f.write("text\n") - f.write("lorem ipsum dolor sit amet\n") - # rewind the file pointer to the beginning so we can read it again - f.seek(0) - iter_ds = load_dataset( - "csv", data_files=f.name, split="train", streaming=True - ) - else: - iter_ds = load_dataset( - path, streaming=True, split=split, name=name, data_files=data_files - ) - - if skip: - LOG.info(f"Skipping {skip} samples from the dataset") - iter_ds = iter_ds.skip(skip) - train_dataset = wrap_pretraining_dataset( - iter_ds, - tokenizer, - cfg, - ds_wrapper_partial, - max_tokens=cfg.sequence_len, - batch_size=cfg.micro_batch_size, - seed=cfg.seed if cfg.seed is not None else 42, - buffer_size=cfg.pretrain_multipack_buffer_size or 10_000, - ) - # https://discuss.huggingface.co/t/how-to-use-huggingface-trainer-streaming-datasets-without-wrapping-it-with-torchdatas-iterablewrapper/25230 - train_dataset = train_dataset.with_format("torch") - - # Load eval dataset (non-streaming) if specified - eval_dataset = None + # Overwrite eval_dataset if test data exists if cfg.test_datasets: - _, eval_dataset, _ = load_prepare_datasets( + _, eval_dataset, _ = _load_and_prepare_datasets( tokenizer, cfg, - DEFAULT_DATASET_PREPARED_PATH, split="test", processor=processor, preprocess_iterable=preprocess_iterable, ) - if cfg.dataset_exact_deduplication: - LOG.info("Deduplication not available for pretrained datasets") + return train_dataset, eval_dataset, prompters - return train_dataset, eval_dataset, cfg.max_steps, prompters + # Prepare datasets (with file locking logic for multiple ranks) + loader = FileLockLoader(cfg) + try: + train_dataset, eval_dataset, prompters = loader.load(_load_datasets) + finally: + loader.cleanup() + if os.environ.get("AXOLOTL_IS_PREPROCESS") == "1": + return train_dataset, eval_dataset, -1, prompters + + # Validate sample packing configuration for evaluation if eval_dataset and cfg.sample_packing and cfg.eval_sample_packing is not False: total_eval_steps = calculate_total_num_steps(cfg, eval_dataset, update=False) if total_eval_steps == 0: raise ValueError( - "eval dataset split is too small for sample_packing. You should set `eval_sample_packing: False`. " + "eval dataset split is too small for sample_packing. " + "You should set `eval_sample_packing: False` in your config." 
) + # Calculate total number of training steps if cfg.max_steps: total_num_steps = min( calculate_total_num_steps(cfg, train_dataset), cfg.max_steps ) - LOG.info(f"Maximum number of steps set at {total_num_steps}") else: total_num_steps = calculate_total_num_steps(cfg, train_dataset) - + LOG.info(f"Maximum number of steps set at {total_num_steps}") return train_dataset, eval_dataset, total_num_steps, prompters -def load_tokenized_prepared_datasets( - tokenizer, - cfg, - default_dataset_prepared_path, - split="train", - processor=None, - preprocess_iterable: Optional[bool] = None, -) -> Tuple[DatasetDict, List[Prompter]]: - cfg_datasets = cfg.test_datasets if split == "test" else cfg.datasets - tokenizer_name = cfg.tokenizer_config +def _prepare_pretraining_dataset( + cfg: DictDefault, + tokenizer: PreTrainedTokenizer, + processor: ProcessorMixin | None, + preprocess_iterable: bool, +) -> tuple[IterableDataset, Dataset | None, int, list[Prompter | None]]: + """ + Prepare dataset for pretraining mode. - ds_hash = str( - md5( - ( - str(cfg.sequence_len) - + "@" - + str(cfg.sample_packing) - + "@" - + str(cfg.eval_sample_packing) - + "@" - + str(cfg.group_by_length) - + "@" - + str(cfg.kd_temperature or 1.0) - + "|".join( - sorted( - [ - f"{d.path}:{d.type}:{d.shards}:{d.conversation}:{d.split}:{d.temperature or 1.0}" - for d in cfg_datasets - ] - ) - ) - + "|" - + tokenizer_name - ) + Note: Pre-training datasets are streamed from the HuggingFace Hub. + """ + # Extract pretraining dataset configuration + pretraining_config = _extract_pretraining_config(cfg) + + # Load streaming dataset for training + train_dataset = _load_pretraining_dataset(pretraining_config, cfg, tokenizer) + + # Load evaluation dataset if specified + eval_dataset = None + if cfg.test_datasets: + _, eval_dataset, _ = _load_and_prepare_datasets( + tokenizer, + cfg, + split="test", + processor=processor, + preprocess_iterable=preprocess_iterable, ) - ) - prepared_ds_path = ( - Path(cfg.dataset_prepared_path) / ds_hash - if cfg.dataset_prepared_path - else Path(default_dataset_prepared_path) / ds_hash - ) - dataset = None - prompters = [] - use_auth_token = cfg.hf_use_auth_token - try: - if cfg.push_dataset_to_hub: - LOG.info( - f"Attempting to load prepared dataset from Huggingface hub at {cfg.push_dataset_to_hub} (version {ds_hash})..." - ) - dataset = load_dataset( - cfg.push_dataset_to_hub, - ds_hash, - token=use_auth_token, - ) - dataset = dataset[split] - except Exception: # pylint: disable=broad-except # nosec - pass - # pylint: disable=duplicate-code - if dataset: - # This is for the case where we already loaded a pretokenized dataset from the hub - ... 
- elif ( - cfg.dataset_prepared_path - and any(prepared_ds_path.glob("*")) - and not cfg.is_preprocess - and not cfg.skip_prepare_dataset + if cfg.dataset_exact_deduplication: + LOG.info("Deduplication not available for pretrained datasets") + + # For pretraining, we return max_steps directly from config + return train_dataset, eval_dataset, cfg.max_steps, [] + + +def _extract_pretraining_config(cfg: DictDefault) -> DictDefault: + """Extract pretraining configuration from the main config.""" + if isinstance(cfg.pretraining_dataset, list) and isinstance( + cfg.pretraining_dataset[0], dict ): - LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...") - dataset = load_from_disk(str(prepared_ds_path)) - LOG.info("Prepared dataset loaded from disk...") + config = cfg.pretraining_dataset[0] + return DictDefault( + { + "path": config["path"], + "name": config["name"], + "skip": config["skip"], + "split": config.get("split", "train"), + "data_files": config.get("data_files"), + "type": config.get("type", "pretrain"), + } + ) + # Simple string path case + return DictDefault( + { + "path": cfg.pretraining_dataset, + "name": None, + "skip": 0, + "split": "train", + "data_files": None, + "type": "pretrain", + } + ) + + +def _load_pretraining_dataset( + pretraining_config: DictDefault, cfg: DictDefault, tokenizer: PreTrainedTokenizer +) -> IterableDataset: + """Load and prepare a streaming dataset for pretraining.""" + # Create dataset wrapper partial function + dataset_wrapper_partial = functools.partial( + get_dataset_wrapper, + dataset_config=pretraining_config, + tokenizer=tokenizer, + cfg=cfg, + dataset_base_type=pretraining_config["type"], + ) + + # Load the actual dataset + if ( + cfg.accelerator_config + and cfg.accelerator_config.dispatch_batches + and not is_local_main_process() + ): + iter_dataset = _create_placeholder_dataset() else: - if cfg.push_dataset_to_hub: - LOG.info("Unable to find prepared dataset in Huggingface hub") - if cfg.is_preprocess: - LOG.info( - f"Skipping prepared dataset in {prepared_ds_path} for pre-processing..." - ) - else: - LOG.info(f"Unable to find prepared dataset in {prepared_ds_path}") - LOG.info("Loading raw datasets...") - if not cfg.is_preprocess: - LOG.warning( - "Processing datasets during training can lead to VRAM instability. Please pre-process your dataset." 
- ) + iter_dataset = load_dataset( + pretraining_config["path"], + streaming=True, + split=pretraining_config["split"], + name=pretraining_config["name"], + data_files=pretraining_config["data_files"], + ) - if cfg.seed: - seed = cfg.seed - else: - LOG.info("No seed provided, using default seed of 42") - seed = 42 + # Apply skip if specified + if pretraining_config["skip"]: + LOG.info(f"Skipping {pretraining_config['skip']} samples from the dataset") + iter_dataset = iter_dataset.skip(pretraining_config["skip"]) - datasets = [] + # Wrap the dataset for pretraining + train_dataset = wrap_pretraining_dataset( + iter_dataset, + tokenizer, + cfg, + dataset_wrapper_partial, + max_tokens=cfg.sequence_len, + batch_size=cfg.micro_batch_size, + seed=cfg.seed, + buffer_size=cfg.pretrain_multipack_buffer_size or 10_000, + ) - streaming_ds = False - if preprocess_iterable: - streaming_ds = True - # pylint: disable=invalid-name - for config_dataset in datasets_w_name_generator(cfg_datasets): - ds: Union[Dataset, DatasetDict] = load_dataset_w_config( - config_dataset, use_auth_token, streaming=streaming_ds - ) + # Format for PyTorch + return train_dataset.with_format("torch") - d_base_type = d_prompt_style = None - d_type = config_dataset.type - if isinstance(d_type, str): - d_type_split = d_type.split(":") - d_base_type = d_type_split[0] - d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None - if isinstance(ds, DatasetDict): - if config_dataset.split and config_dataset.split in ds: - ds = ds[config_dataset.split] - elif split in ds: - ds = ds[split] - else: - raise ValueError( - f"no {split} split found for dataset {config_dataset.path}, you may specify a split with 'split: `" - ) +def _create_placeholder_dataset() -> IterableDataset: + """Create a minimal placeholder dataset for non-main processes.""" + with tempfile.NamedTemporaryFile(mode="w+", delete=False) as f: + f.write("text\n") + f.write("lorem ipsum dolor sit amet\n") + f.seek(0) + return load_dataset("csv", data_files=f.name, split="train", streaming=True) - # support for using a subset of the data - if config_dataset.shards: - shards_idx = config_dataset.get("shards_idx", 0) - ds = ds.shuffle(seed=seed).shard( - num_shards=config_dataset.shards, index=shards_idx - ) - dataset_wrapper, dataset_prompter = get_dataset_wrapper( - config_dataset=config_dataset, - tokenizer=tokenizer, - cfg=cfg, - d_base_type=d_base_type, - dataset=ds, - d_prompt_style=d_prompt_style, - processor=processor, - ) - datasets.append(dataset_wrapper) - prompters.append(dataset_prompter) +def _load_tokenized_prepared_datasets( + tokenizer: PreTrainedTokenizer, + cfg: DictDefault, + split: Literal["train", "test"] = "train", + processor: ProcessorMixin | None = None, + preprocess_iterable: bool = False, +) -> tuple[Dataset | DatasetDict, list[Prompter | None]]: + """Load or create tokenized and prepared datasets for training or testing. - if len(datasets) == 1: - dataset = datasets[0] - else: - LOG.info("merging datasets") - dataset = concatenate_datasets(datasets) + Args: + tokenizer: Tokenizer for processing text. + cfg: Configuration object. + split: Dataset split to load ('train' or 'test'). + processor: Optional processor for multimodal datasets. + preprocess_iterable: Whether to use iterable preprocessing. - if len(datasets) > 1: - if cfg.shuffle_merged_datasets: - LOG.debug("shuffle merged datasets") - dataset = dataset.shuffle(seed=seed) - else: - LOG.debug("NOT shuffling merged datasets") + Returns: + Tuple of (dataset, prompters list). 
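    Note (editorial, not part of this patch): the lookup order implemented below is
    (1) a previously pushed prepared dataset on the HF Hub when
    `push_dataset_to_hub` is set, (2) a locally saved prepared dataset keyed by the
    config/tokenizer hash, and (3) loading and tokenizing the raw datasets.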
+ """ + # Select correct dataset configuration based on split + datasets_configs = cfg.datasets if split == "train" else cfg.test_datasets - if not cfg.skip_prepare_dataset: - dataset = drop_long_seq_in_dataset(dataset, cfg) + # Generate dataset hash for caching + dataset_hash = generate_dataset_hash_from_config( + cfg, datasets_configs, tokenizer.name_or_path + ) - if cfg.sample_packing: - dataset, _ = process_datasets_for_packing(cfg, dataset, None) + # Try loading from hub if push_dataset_to_hub is configured + dataset = None + if cfg.push_dataset_to_hub: + dataset = try_load_from_hub(cfg, dataset_hash, split) - if cfg.local_rank == 0 and not cfg.skip_prepare_dataset: - LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}") - if isinstance(dataset, IterableDataset): - num_workers = cfg.dataset_processes + # If not found on hub, try loading from disk + if dataset is None: + dataset = load_preprocessed_dataset(cfg, dataset_hash) - def gen_from_iter_ds(_ds, worker_id: List[int], num_workers: List[int]): - """Generator function to correctly splice the dataset for each worker""" - for i, item in enumerate(_ds): - if i % num_workers[0] == worker_id[0]: - yield item - - ds_from_iter = Dataset.from_generator( - functools.partial(gen_from_iter_ds, dataset), - features=dataset.features, - num_proc=num_workers, - split=split, - gen_kwargs={ - "worker_id": list(range(num_workers)), - "num_workers": [num_workers] * num_workers, - }, - ) - ds_from_iter.save_to_disk(str(prepared_ds_path)) - else: - os.makedirs(prepared_ds_path, exist_ok=True) - dataset.save_to_disk(str(prepared_ds_path)) - if cfg.push_dataset_to_hub: - LOG.info( - f"Pushing merged prepared dataset to Huggingface hub at {cfg.push_dataset_to_hub} (version {ds_hash})..." - ) - dataset.push_to_hub( - cfg.push_dataset_to_hub, - ds_hash, - private=True, - ) + # If not found on disk or skipping prepared dataset, load and process raw datasets + prompters: list[Prompter | None] = [] + if dataset is None: + dataset, prompters = _load_raw_datasets( + cfg, + datasets_configs, + tokenizer, + split, + processor, + preprocess_iterable, + ) return dataset, prompters -def load_prepare_datasets( - tokenizer: PreTrainedTokenizerBase, - cfg, - default_dataset_prepared_path, - split="train", - processor=None, - preprocess_iterable: Optional[bool] = False, -) -> Tuple[Dataset, Dataset, List[Prompter]]: - dataset, prompters = load_tokenized_prepared_datasets( - tokenizer, - cfg, - default_dataset_prepared_path, - split=split, - processor=processor, - preprocess_iterable=preprocess_iterable, +def _load_raw_datasets( + cfg: DictDefault, + datasets_configs: list, + tokenizer: PreTrainedTokenizer, + split: str, + processor: ProcessorMixin | None = None, + preprocess_iterable: bool = False, +) -> tuple[Dataset, list[Prompter | None]]: + """Load, process, merge, and save raw datasets.""" + LOG.info("Loading raw datasets...", main_process_only=False) + if not cfg.is_preprocess and not cfg.skip_prepare_dataset: + LOG.warning( + "Processing datasets during training can lead to VRAM instability. Please " + "pre-process your dataset using `axolotl preprocess path/to/config.yml`." 
+ ) + + # Load and process individual datasets + datasets = [] + prompters = [] + for dataset_config in datasets_with_name_generator(datasets_configs): + dataset_wrapper, dataset_prompter = _load_and_process_single_dataset( + dataset_config=dataset_config, + cfg=cfg, + tokenizer=tokenizer, + split=split, + seed=cfg.seed, + processor=processor, + preprocess_iterable=preprocess_iterable, + ) + datasets.append(dataset_wrapper) + prompters.append(dataset_prompter) + + # Merge datasets + dataset = merge_datasets(datasets, cfg) + + if not cfg.skip_prepare_dataset: + if split == "test" and cfg.eval_sequence_len: + dataset = drop_long_seq_in_dataset(dataset, cfg.eval_sequence_len, cfg) + else: + dataset = drop_long_seq_in_dataset(dataset, cfg.sequence_len, cfg) + if cfg.sample_packing: + dataset, _ = process_datasets_for_packing(cfg, dataset, None) + + # Save the prepared dataset + dataset_hash = generate_dataset_hash_from_config( + cfg, datasets_configs, tokenizer.name_or_path + ) + save_preprocessed_dataset(cfg, dataset, dataset_hash, split) + + return dataset, prompters + + +def _load_and_process_single_dataset( + dataset_config: DictDefault, + cfg: DictDefault, + tokenizer: PreTrainedTokenizer, + split: str, + seed: int, + processor: ProcessorMixin | None = None, + preprocess_iterable: bool = False, +) -> tuple[Dataset | IterableDataset, Prompter | None]: + """Load and process a single dataset based on the passed config.""" + # Load the dataset + dataset = load_dataset_with_config( + dataset_config, cfg.hf_use_auth_token, streaming=preprocess_iterable ) + # Parse dataset type + d_base_type, d_prompt_style = _parse_dataset_type(dataset_config.type) + + # Select the appropriate split + if isinstance(dataset, DatasetDict): + if dataset_config.split and dataset_config.split in dataset: + dataset = dataset[dataset_config.split] + elif split in dataset: + dataset = dataset[split] + else: + raise ValueError( + f"no {split} split found for dataset {dataset_config.path}, you may " + "specify a split with 'split: ...'" + ) + + # Apply sharding if configured + if dataset_config.shards: + shards_idx = dataset_config.get("shards_idx", 0) + dataset = dataset.shuffle(seed=seed).shard( + num_shards=dataset_config.shards, index=shards_idx + ) + + # Apply dataset wrapper + dataset_wrapper, dataset_prompter = get_dataset_wrapper( + dataset_config=dataset_config, + tokenizer=tokenizer, + cfg=cfg, + dataset_base_type=d_base_type, + dataset=dataset, + dataset_prompt_style=d_prompt_style, + processor=processor, + ) + + return dataset_wrapper, dataset_prompter + + +def _parse_dataset_type(d_type: str) -> tuple[str | None, str | None]: + """Parse the dataset type string into base type and prompt style.""" + if not isinstance(d_type, str): + return None, None + + d_type_split = d_type.split(":") + d_base_type = d_type_split[0] + d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None + + return d_base_type, d_prompt_style + + +def _handle_train_dataset_split( + dataset: Dataset, cfg: DictDefault +) -> tuple[Dataset, Dataset | None]: + """Handle processing for train split, including validation set creation.""" + val_set_size = ( + int(cfg.val_set_size) if cfg.val_set_size > 1 else float(cfg.val_set_size) + ) + + if val_set_size: + # Create train/validation split + train_dataset, eval_dataset = create_train_validation_split( + dataset, cfg, val_set_size + ) + return train_dataset, eval_dataset + + # No validation split - apply deduplication if needed and return as train dataset + if 
cfg.dataset_exact_deduplication: + train_dataset, _ = deduplicate_and_log_datasets(dataset=dataset) + else: + train_dataset = dataset + + return train_dataset, None + + +def _handle_test_dataset_split( + dataset: Dataset, cfg: DictDefault +) -> tuple[None, Dataset | None]: + """Handle processing for test split.""" + if cfg.dataset_exact_deduplication: + eval_dataset, _ = deduplicate_and_log_datasets(dataset=dataset) + else: + eval_dataset = dataset + + return None, eval_dataset + + +def _apply_dataset_sharding(dataset: Dataset, cfg: DictDefault) -> Dataset: + """Apply dataset sharding if configured. + + Args: + dataset: Dataset to shard. + cfg: Configuration object containing shard settings. + + Returns: + Sharded dataset or original dataset if no sharding configured. + """ if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None: LOG.info( f"Using index #{cfg.dataset_shard_idx} of {cfg.dataset_shard_num} shards" @@ -410,259 +471,44 @@ def load_prepare_datasets( num_shards=cfg.dataset_shard_num, index=cfg.dataset_shard_idx, ) + return dataset - val_set_size = ( - int(cfg.val_set_size) if cfg.val_set_size > 1 else float(cfg.val_set_size) + +def _load_and_prepare_datasets( + tokenizer: PreTrainedTokenizer, + cfg: DictDefault, + split: Literal["train", "test"] = "train", + processor: ProcessorMixin | None = None, + preprocess_iterable: bool = False, +) -> tuple[Dataset | None, Dataset | None, list[Prompter | None]]: + """Load and prepare datasets with optional validation split and sharding. + + Args: + tokenizer: Tokenizer for processing text. + cfg: Configuration object. + split: Dataset split to load ('train' or 'test'). + processor: Optional processor for multimodal datasets. + preprocess_iterable: Whether to use iterable preprocessing. + + Returns: + Tuple of (train_dataset, eval_dataset, prompters). 
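    Note (editorial, not part of this patch): for the "train" split,
    `_handle_train_dataset_split` interprets `cfg.val_set_size` the same way
    `datasets.Dataset.train_test_split` does — a value greater than 1
    (e.g. `val_set_size: 1000`) is an absolute number of held-out samples, while a
    value in (0, 1] (e.g. `val_set_size: 0.05`) is a fraction of the training data.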
+ """ + # Load the base dataset + dataset, prompters = _load_tokenized_prepared_datasets( + tokenizer, + cfg, + split=split, + processor=processor, + preprocess_iterable=preprocess_iterable, ) - if split == "train" and val_set_size: - seed = cfg.seed if cfg.seed is not None else 42 + # Apply dataset sharding if configured using shared function + dataset = _apply_dataset_sharding(dataset, cfg) - # ensure we end up with the same fingerprint by doing rank0 first and being able to cache - to_hash_train = ( - dataset._fingerprint # pylint: disable=protected-access - + "|" - + str(val_set_size) - + "|" - + "train" - + "|" - + str(seed) - ) - to_hash_test = ( - dataset._fingerprint # pylint: disable=protected-access - + "|" - + str(val_set_size) - + "|" - + "test" - + "|" - + str(seed) - ) - train_fingerprint = md5(to_hash_train) - test_fingerprint = md5(to_hash_test) - if cfg.dataset_exact_deduplication: - _, _, dataset = deduplicate_and_log_datasets(dataset=dataset) - dataset = dataset.train_test_split( - test_size=val_set_size, - shuffle=False, - seed=seed, - train_new_fingerprint=train_fingerprint, - test_new_fingerprint=test_fingerprint, - ) - - train_dataset = dataset["train"] - eval_dataset = dataset["test"] - elif split == "test": - if cfg.dataset_exact_deduplication: - _, eval_dataset, _ = deduplicate_and_log_datasets(eval_dataset=dataset) - else: - eval_dataset = dataset - train_dataset = None + # Apply deduplication and create train / validation splits based on the split type + if split == "train": + train_dataset, eval_dataset = _handle_train_dataset_split(dataset, cfg) else: - if cfg.dataset_exact_deduplication: - train_dataset, _, _ = deduplicate_and_log_datasets(train_dataset=dataset) - else: - train_dataset = dataset - eval_dataset = None + train_dataset, eval_dataset = _handle_test_dataset_split(dataset, cfg) + return train_dataset, eval_dataset, prompters - - -def get_dataset_wrapper( - config_dataset, - tokenizer, - cfg, - d_base_type, - dataset, - d_prompt_style=None, - processor=None, # pylint: disable=unused-argument -): - dataset_wrapper = None - dataset_prompter = None - - ds_kwargs = { - "process_count": cfg.dataset_processes, - "keep_in_memory": cfg.dataset_keep_in_memory is True, - } - - LOG.info( - f"Loading dataset: {config_dataset['path']} with base_type: {d_base_type} and prompt_style: {d_prompt_style}" - ) - - if ( - isinstance(dataset, Dataset) - and "input_ids" in dataset.features - and "attention_mask" in dataset.features - and "labels" in dataset.features - ): - # dataset is already tokenized, just drop it straight in - dataset_prompter = UnsupportedPrompter() - dataset_wrapper = dataset - elif isinstance(config_dataset.type, DictDefault): - ds_strategy = load( - "user_defined", tokenizer, cfg, config_dataset.type.to_dict() - ) - dataset_prompter = UnsupportedPrompter() - dataset_wrapper = wrap_dataset_for_tokenized_prompt( - ds_strategy, - dataset, - **ds_kwargs, - ) - elif cfg.skip_prepare_dataset: - dataset_wrapper = dataset - elif ds_strategy := config_dataset.type.startswith( - "bradley_terry" - ) and bradley_terry_load( - config_dataset.type.split(".", 1)[1], tokenizer, cfg, config_dataset - ): - dataset_prompter = UnsupportedPrompter() - dataset_wrapper = wrap_dataset_for_tokenized_prompt( - ds_strategy, - dataset, - **ds_kwargs, - ) - elif config_dataset.type.startswith("stepwise_supervised"): - dataset_prompter = UnsupportedPrompter() - ds_strategy = load(config_dataset.type, tokenizer, cfg, config_dataset) - # we need to explicitly cast boolean labels 
to int - # for compatibility with how trl's PRMTrainer works - dataset = dataset.cast_column("labels", Sequence(Value("int64"))) - dataset_wrapper = TokenizedPromptDataset( - ds_strategy, - dataset, - **ds_kwargs, - ) - elif ds_strategy := load( - config_dataset.type, tokenizer, cfg, config_dataset, processor=processor - ): - if isinstance(ds_strategy, DatasetWrappingStrategy): - dataset_wrapper = ds_strategy.wrap_dataset(dataset, **ds_kwargs) - else: - dataset_prompter = UnsupportedPrompter() - dataset_wrapper = wrap_dataset_for_tokenized_prompt( - ds_strategy, - dataset, - **ds_kwargs, - ) - elif d_base_type == "alpaca": - dataset_prompter = AlpacaPrompter(d_prompt_style) - ds_strategy = AlpacaPromptTokenizingStrategy( - dataset_prompter, - tokenizer, - cfg.train_on_inputs, - cfg.sequence_len, - ) - ds_wrapper = wrap_dataset_for_tokenized_prompt( - ds_strategy, - dataset, - **ds_kwargs, - ) - dataset_wrapper = ds_wrapper - elif d_base_type == "explainchoice": - dataset_prompter = MultipleChoiceExplainPrompter(d_prompt_style) - ds_strategy = AlpacaMultipleChoicePromptTokenizingStrategy( - dataset_prompter, - tokenizer, - cfg.train_on_inputs, - cfg.sequence_len, - ) - ds_wrapper = wrap_dataset_for_tokenized_prompt( - ds_strategy, - dataset, - **ds_kwargs, - ) - dataset_wrapper = ds_wrapper - elif d_base_type == "concisechoice": - dataset_prompter = MultipleChoiceConcisePrompter(d_prompt_style) - ds_strategy = AlpacaMultipleChoicePromptTokenizingStrategy( - dataset_prompter, - tokenizer, - cfg.train_on_inputs, - cfg.sequence_len, - ) - ds_wrapper = wrap_dataset_for_tokenized_prompt( - ds_strategy, - dataset, - **ds_kwargs, - ) - dataset_wrapper = ds_wrapper - elif d_base_type == "summarizetldr": - dataset_prompter = SummarizeTLDRPrompter(d_prompt_style) - ds_strategy = SummarizeTLDRPromptTokenizingStrategy( - dataset_prompter, - tokenizer, - cfg.train_on_inputs, - cfg.sequence_len, - ) - ds_wrapper = wrap_dataset_for_tokenized_prompt( - ds_strategy, - dataset, - **ds_kwargs, - ) - dataset_wrapper = ds_wrapper - elif d_base_type == "jeopardy": - dataset_prompter = JeopardyPrompter(d_prompt_style) - ds_strategy = JeopardyPromptTokenizingStrategy( - dataset_prompter, - tokenizer, - cfg.train_on_inputs, - cfg.sequence_len, - ) - ds_wrapper = wrap_dataset_for_tokenized_prompt( - ds_strategy, - dataset, - **ds_kwargs, - ) - dataset_wrapper = ds_wrapper - elif d_base_type == "oasst": - dataset_prompter = AlpacaPrompter(d_prompt_style) - ds_strategy = OpenAssistantPromptTokenizingStrategy( - dataset_prompter, - tokenizer, - cfg.train_on_inputs, - cfg.sequence_len, - ) - ds_wrapper = wrap_dataset_for_tokenized_prompt( - ds_strategy, - dataset, - **ds_kwargs, - ) - dataset_wrapper = ds_wrapper - elif d_base_type == "gpteacher": - dataset_prompter = GPTeacherPrompter(d_prompt_style) - ds_strategy = GPTeacherPromptTokenizingStrategy( - dataset_prompter, - tokenizer, - cfg.train_on_inputs, - cfg.sequence_len, - ) - ds_wrapper = wrap_dataset_for_tokenized_prompt( - ds_strategy, - dataset, - **ds_kwargs, - ) - dataset_wrapper = ds_wrapper - elif d_base_type == "reflection": - dataset_prompter = ReflectAlpacaPrompter(d_prompt_style) - ds_strategy = AlpacaReflectionPTStrategy( - dataset_prompter, - tokenizer, - cfg.train_on_inputs, - cfg.sequence_len, - ) - ds_wrapper = wrap_dataset_for_tokenized_prompt( - ds_strategy, - dataset, - **ds_kwargs, - ) - dataset_wrapper = ds_wrapper - else: - suffix = "" - if ":load_" in config_dataset.type: - suffix = f" Did you mean {config_dataset.type.replace(':load_', 
'.load_')}?" - LOG.error( - f"unhandled prompt tokenization strategy: {config_dataset.type}. {suffix}" - ) - raise ValueError( - f"unhandled prompt tokenization strategy: {config_dataset.type} {suffix}" - ) - - return dataset_wrapper, dataset_prompter diff --git a/src/axolotl/utils/data/shared.py b/src/axolotl/utils/data/shared.py index d2e119f77..21c8e472b 100644 --- a/src/axolotl/utils/data/shared.py +++ b/src/axolotl/utils/data/shared.py @@ -1,11 +1,21 @@ -""" -dataset loading shared utils -""" +"""Dataset loading shared utils.""" +from __future__ import annotations + +import functools +import os from pathlib import Path -from typing import Optional, Union +from typing import TYPE_CHECKING, Any, Generator -from datasets import Dataset, DatasetDict, load_dataset, load_from_disk +from datasets import ( + Dataset, + DatasetDict, + IterableDataset, + IterableDatasetDict, + concatenate_datasets, + load_dataset, + load_from_disk, +) from huggingface_hub import hf_hub_download, snapshot_download from huggingface_hub.errors import ( HFValidationError, @@ -13,78 +23,142 @@ from huggingface_hub.errors import ( RevisionNotFoundError, ) +from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH +from axolotl.utils.data.utils import deduplicate_and_log_datasets, md5 +from axolotl.utils.datasets import get_default_process_count from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger + +if TYPE_CHECKING: + from adlfs import AzureBlobFileSystem + from gcsfs import GCSFileSystem + from ocifs import OCIFileSystem + from s3fs import S3FileSystem + +LOG = get_logger(__name__) + +EXTENSIONS_TO_DATASET_TYPES = { + ".parquet": "parquet", + ".arrow": "arrow", + ".csv": "csv", + ".txt": "text", +} -def get_ds_type(config_dataset: DictDefault): - """ - Get the dataset type from the path if it's not specified - """ - ds_type = "json" - if config_dataset.ds_type: - ds_type = config_dataset.ds_type - elif ".parquet" in config_dataset.path: - ds_type = "parquet" - elif ".arrow" in config_dataset.path: - ds_type = "arrow" - elif ".csv" in config_dataset.path: - ds_type = "csv" - elif ".txt" in config_dataset.path: - ds_type = "text" - return ds_type +def get_dataset_type(dataset_config: DictDefault) -> str: + """Get the dataset type from the path if it's not specified.""" + if dataset_config.ds_type: + return dataset_config.ds_type + + for extension, dataset_type in EXTENSIONS_TO_DATASET_TYPES.items(): + if extension in dataset_config.path: + return dataset_type + + return "json" -def datasets_w_name_generator(dataset_configs: list[DictDefault]): - """ - Yields dataset configs handling multiple names or preprocess_shards +def datasets_with_name_generator( + dataset_configs: list[DictDefault], +) -> Generator[DictDefault, None, None]: + """Yields expanded dataset configurations based on multiple names or preprocessing + shards. + + When a dataset config has a list of names, it yields separate configs for each + name. When a dataset config specifies preprocessing shards, it yields configs for + each shard. Args: - dataset_configs: list of dataset configs (equivalent to cfg.datasets) + dataset_configs: List of dataset configuration objects. + + Yields: + Individual dataset configurations, expanded as needed for names or shards. 
""" - for dataset in dataset_configs: - if dataset.name and isinstance(dataset.name, list): - # load_dataset doesn't properly handle multiple named configurations - # at the same time for a given dataset - for name in dataset.name: - yield DictDefault({**dataset, "name": name}) - elif dataset.preprocess_shards and not dataset.shards: - for shard in range(dataset.preprocess_shards): + for config in dataset_configs: + if config.name and isinstance(config.name, list): + for name in config.name: + yield DictDefault({**config, "name": name}) + elif config.preprocess_shards and not config.shards: + for shard_idx in range(config.preprocess_shards): yield DictDefault( { - **dataset, - "shards": dataset.preprocess_shards, - "shards_idx": shard, + **config, + "shards": config.preprocess_shards, + "shards_idx": shard_idx, } ) else: - yield dataset + yield config -def load_dataset_w_config( - config_dataset: DictDefault, use_auth_token: bool, streaming=False -) -> Union[Dataset, DatasetDict]: - """ - Load a dataset from a config +def load_dataset_with_config( + dataset_config: DictDefault, use_auth_token: bool, streaming=False +) -> Dataset | IterableDataset: + """Load a dataset from a config. Handles datasets that are stored locally, in the + HuggingFace Hub, in a remote filesystem (S3, GCS, Azure, OCI), a URL, or + `data_files`. Args: - config_dataset: single dataset config - use_auth_token: whether to use HF auth token - streaming: whether to stream the dataset + dataset_config: Single dataset config. + use_auth_token: Whether to use HF auth token. + streaming: Whether to stream the dataset. + + Returns: + Loaded dataset. """ - # pylint: disable=invalid-name - ds: Optional[Union[Dataset, DatasetDict]] = None # pylint: disable=invalid-name - ds_from_hub = False + # Set up common kwargs for dataset loading + load_dataset_kwargs = { + "split": dataset_config.split if dataset_config.split else None, + "name": dataset_config.name, + "streaming": streaming, + "trust_remote_code": dataset_config.trust_remote_code, + } + + # First check if it's a local path + if Path(dataset_config.path).exists(): + return _load_from_local_path(dataset_config, load_dataset_kwargs) + + # Check if it's a HuggingFace dataset + is_hub_dataset = _check_if_hub_dataset(dataset_config, use_auth_token) + + # Check if it's a cloud storage path and get appropriate filesystem + remote_fs, storage_options = _get_remote_filesystem(dataset_config.path) + is_cloud_dataset = False + if remote_fs: + try: + is_cloud_dataset = remote_fs.exists(dataset_config.path) + except (FileNotFoundError, ConnectionError): + pass + + # Load from appropriate source + if is_hub_dataset: + return _load_from_hub(dataset_config, use_auth_token, load_dataset_kwargs) + if is_cloud_dataset: + return _load_from_cloud( + dataset_config, remote_fs, storage_options, load_dataset_kwargs + ) + if dataset_config.path.startswith("https://"): + return _load_from_url(dataset_config, load_dataset_kwargs) + if dataset_config.data_files: + return _load_from_data_files(dataset_config, load_dataset_kwargs) + + raise ValueError( + f"The dataset could not be loaded. This could be due to a misconfigured dataset path " + f"({dataset_config.path}). Try double-check your path / name / data_files. " + f"This is not caused by the dataset type." 
+ ) + + +def _check_if_hub_dataset(dataset_config: DictDefault, use_auth_token: bool) -> bool: + """Check if a dataset exists on the HuggingFace Hub.""" try: - # this is just a basic check to see if the path is a - # valid HF dataset that's loadable snapshot_download( - repo_id=config_dataset.path, + repo_id=dataset_config.path, repo_type="dataset", token=use_auth_token, - revision=config_dataset.revision, + revision=dataset_config.revision, ignore_patterns=["*"], ) - ds_from_hub = True + return True except ( RepositoryNotFoundError, RevisionNotFoundError, @@ -93,198 +167,401 @@ def load_dataset_w_config( HFValidationError, ValueError, ): - pass + return False - ds_from_cloud = False - storage_options: dict = {} - remote_file_system = None - if config_dataset.path.startswith("s3://"): + +def _get_remote_filesystem( + path: str, +) -> tuple[ + S3FileSystem | GCSFileSystem | AzureBlobFileSystem | OCIFileSystem | None, dict +]: + """Get the appropriate filesystem for a remote path.""" + if path.startswith("s3://"): try: - import s3fs # type: ignore + import s3fs + + storage_options = {"anon": False} + return s3fs.S3FileSystem(**storage_options), storage_options except ImportError as exc: raise ImportError("s3:// paths require s3fs to be installed") from exc - # Reads env, credentials from ~/.aws/credentials, or IAM metadata provider - # https://s3fs.readthedocs.io/en/latest/index.html?highlight=storage_options#credentials - storage_options = {"anon": False} - remote_file_system = s3fs.S3FileSystem(**storage_options) - elif config_dataset.path.startswith("gs://") or config_dataset.path.startswith( - "gcs://" - ): + elif path.startswith(("gs://", "gcs://")): try: - import gcsfs # type: ignore + import gcsfs + + storage_options = {"token": None} # type: ignore + return gcsfs.GCSFileSystem(**storage_options), storage_options except ImportError as exc: raise ImportError( "gs:// or gcs:// paths require gcsfs to be installed" ) from exc - # gcsfs will use default credentials from the environment else anon - # https://gcsfs.readthedocs.io/en/latest/#credentials - storage_options = {"token": None} - remote_file_system = gcsfs.GCSFileSystem(**storage_options) - elif ( - config_dataset.path.startswith("adl://") - or config_dataset.path.startswith("abfs://") - or config_dataset.path.startswith("az://") - ): + elif path.startswith(("adl://", "abfs://", "az://")): try: import adlfs + + storage_options = {"anon": False} + return adlfs.AzureBlobFileSystem(**storage_options), storage_options except ImportError as exc: raise ImportError( "adl:// or abfs:// paths require adlfs to be installed" ) from exc - # # Ensure you have the following environment variables set: - # # Gen 1 - # storage_options = { - # "tenant_id": AZURE_STORAGE_TENANT_ID, - # "client_id": AZURE_STORAGE_CLIENT_ID, - # "client_secret": AZURE_STORAGE_CLIENT_SECRET, - # } - # # Gen 2 - # storage_options = { - # "account_name": AZURE_STORAGE_ACCOUNT_NAME, - # "account_key": AZURE_STORAGE_ACCOUNT_KEY, - # } - - # Reads env - # https://github.com/fsspec/adlfs?tab=readme-ov-file#setting-credentials - storage_options = {"anon": False} - remote_file_system = adlfs.AzureBlobFileSystem(**storage_options) - elif config_dataset.path.startswith("oci://"): + elif path.startswith("oci://"): try: import ocifs + + storage_options = {} + return ocifs.OCIFileSystem(**storage_options), storage_options except ImportError as exc: raise ImportError("oci:// paths require ocifs to be installed") from exc - # 
https://ocifs.readthedocs.io/en/latest/getting-connected.html#Using-Environment-Variables - remote_file_system = ocifs.OCIFileSystem(**storage_options) + return None, {} - try: - if remote_file_system and remote_file_system.exists(config_dataset.path): - ds_from_cloud = True - except (FileNotFoundError, ConnectionError): - pass - # gather extra args from the config - load_ds_kwargs = {} - if config_dataset.split: - load_ds_kwargs["split"] = config_dataset.split +def _load_from_local_path( + dataset_config: DictDefault, load_dataset_kwargs: dict +) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict: + """Load a dataset from a local path.""" + local_path = Path(dataset_config.path) + + if local_path.is_dir(): + if dataset_config.data_files: + dataset_type = get_dataset_type(dataset_config) + return load_dataset( + dataset_type, + data_files=dataset_config.data_files, + **load_dataset_kwargs, + ) + try: + return load_from_disk(dataset_config.path) + except FileNotFoundError: + load_dataset_kwargs["streaming"] = False + return load_dataset(dataset_config.path, **load_dataset_kwargs) + elif local_path.is_file(): + dataset_type = get_dataset_type(dataset_config) + load_dataset_kwargs["streaming"] = False + return load_dataset( + dataset_type, + data_files=dataset_config.path, + **load_dataset_kwargs, + ) else: - load_ds_kwargs["split"] = None - - # prefer local dataset, even if hub exists - local_path = Path(config_dataset.path) - if local_path.exists(): - if local_path.is_dir(): - if config_dataset.data_files: - ds_type = get_ds_type(config_dataset) - ds = load_dataset( # pylint: disable=invalid-name - ds_type, - name=config_dataset.name, - data_files=config_dataset.data_files, - streaming=streaming, - **load_ds_kwargs, - ) - else: - try: - ds = load_from_disk( - config_dataset.path - ) # pylint: disable=invalid-name - except FileNotFoundError: - ds = load_dataset( - config_dataset.path, - name=config_dataset.name, - streaming=False, - **load_ds_kwargs, - ) - elif local_path.is_file(): - ds_type = get_ds_type(config_dataset) - - ds = load_dataset( # pylint: disable=invalid-name - ds_type, - name=config_dataset.name, - data_files=config_dataset.path, - streaming=False, - **load_ds_kwargs, - ) - else: - raise ValueError( - "unhandled dataset load: local path exists, but is neither a directory or a file" - ) - elif ds_from_hub: - ds = load_dataset( - config_dataset.path, - name=config_dataset.name, - streaming=streaming, - data_files=config_dataset.data_files, - token=use_auth_token, - revision=config_dataset.revision, - trust_remote_code=config_dataset.trust_remote_code, - **load_ds_kwargs, - ) - elif ds_from_cloud and remote_file_system: - if remote_file_system.isdir(config_dataset.path): - ds = load_from_disk( - config_dataset.path, - storage_options=storage_options, - ) - elif remote_file_system.isfile(config_dataset.path): - ds_type = get_ds_type(config_dataset) - ds = load_dataset( - ds_type, - name=config_dataset.name, - data_files=config_dataset.path, - streaming=streaming, - storage_options=storage_options, - trust_remote_code=config_dataset.trust_remote_code, - **load_ds_kwargs, - ) - elif config_dataset.path.startswith("https://"): - ds_type = get_ds_type(config_dataset) - ds = load_dataset( - ds_type, - name=config_dataset.name, - data_files=config_dataset.path, - streaming=streaming, - storage_options=storage_options, - trust_remote_code=config_dataset.trust_remote_code, - **load_ds_kwargs, - ) - elif config_dataset.data_files: - fp: str | list[str] | None = None - if 
isinstance(config_dataset.data_files, str): - fp = hf_hub_download( - repo_id=config_dataset.path, - repo_type="dataset", - filename=config_dataset.data_files, - revision=config_dataset.revision, - ) - elif isinstance(config_dataset.data_files, list): - fp = [] - for file in config_dataset.data_files: - fp.append( - hf_hub_download( - repo_id=config_dataset.path, - repo_type="dataset", - filename=file, - revision=config_dataset.revision, - ) - ) - else: - raise ValueError("data_files must be either a string or list of strings") - ds = load_dataset( - "json", - name=config_dataset.name, - data_files=fp, - streaming=streaming, - **load_ds_kwargs, - ) - if not ds: raise ValueError( - "The dataset could not be loaded. This could be due to a misconfigured dataset path " - f"({config_dataset.path}). Try double-check your path / name / data_files. " - "This is not caused by the dataset type." + "Unhandled dataset load: local path exists, but is neither a directory or a file" ) - return ds + +def _load_from_hub( + dataset_config: DictDefault, use_auth_token: bool, load_dataset_kwargs: dict +) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict: + """Load a dataset from the HuggingFace Hub.""" + return load_dataset( + dataset_config.path, + data_files=dataset_config.data_files, + token=use_auth_token, + revision=dataset_config.revision, + **load_dataset_kwargs, + ) + + +def _load_from_cloud( + dataset_config: DictDefault, + remote_fs: S3FileSystem | GCSFileSystem | AzureBlobFileSystem | OCIFileSystem, + storage_options: dict, + load_dataset_kwargs: dict, +) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict: + """Load a dataset from cloud storage.""" + if remote_fs.isdir(dataset_config.path): + return load_from_disk( + dataset_config.path, + storage_options=storage_options, + ) + + if remote_fs.isfile(dataset_config.path): + dataset_type = get_dataset_type(dataset_config) + return load_dataset( + dataset_type, + data_files=dataset_config.path, + storage_options=storage_options, + **load_dataset_kwargs, + ) + + raise ValueError( + f"Cloud path {dataset_config.path} is neither a directory nor a file" + ) + + +def _load_from_url( + dataset_config: DictDefault, load_dataset_kwargs: dict +) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict: + """Load a dataset from a URL.""" + dataset_type = get_dataset_type(dataset_config) + return load_dataset( + dataset_type, + data_files=dataset_config.path, + **load_dataset_kwargs, + ) + + +def _load_from_data_files( + dataset_config: DictDefault, load_dataset_kwargs: dict +) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict: + """Load a dataset from data files.""" + file_path = None + + if isinstance(dataset_config.data_files, str): + file_path = hf_hub_download( + repo_id=dataset_config.path, + repo_type="dataset", + filename=dataset_config.data_files, + revision=dataset_config.revision, + ) + elif isinstance(dataset_config.data_files, list): + file_path = [ + hf_hub_download( + repo_id=dataset_config.path, + repo_type="dataset", + filename=file, + revision=dataset_config.revision, + ) + for file in dataset_config.data_files + ] + else: + raise ValueError("data_files must be either a string or list of strings") + + return load_dataset("json", data_files=file_path, **load_dataset_kwargs) + + +def generate_split_fingerprints( + dataset: Dataset, val_set_size: int | float, seed: int +) -> tuple[str, str]: + """Generate consistent fingerprints for train/test splits.""" + fingerprint = dataset._fingerprint # 
pylint: disable=protected-access + + train_hash_input = f"{fingerprint}|{val_set_size}|train|{seed}" + test_hash_input = f"{fingerprint}|{val_set_size}|test|{seed}" + + train_fingerprint = md5(train_hash_input) + test_fingerprint = md5(test_hash_input) + + return train_fingerprint, test_fingerprint + + +def get_prepared_dataset_path(cfg: DictDefault, dataset_hash: str) -> Path: + """Get standardized path for prepared datasets. + + Args: + cfg: Configuration object. + dataset_hash: Hash identifying the specific dataset configuration. + + Returns: + Path where the prepared dataset should be stored. + """ + base_path = cfg.dataset_prepared_path or DEFAULT_DATASET_PREPARED_PATH + return Path(base_path) / dataset_hash + + +def create_train_validation_split( + dataset: Dataset, cfg: DictDefault, val_set_size: int | float +) -> tuple[Dataset, Dataset]: + """Create train/validation split with consistent fingerprinting. + + Args: + dataset: Dataset to split. + cfg: Configuration object containing seed and other settings. + val_set_size: Size of validation set (absolute number or fraction). + + Returns: + Tuple of (train_dataset, eval_dataset). + """ + train_fingerprint, test_fingerprint = generate_split_fingerprints( + dataset, val_set_size, cfg.seed + ) + + # Apply deduplication before splitting if configured + if cfg.dataset_exact_deduplication: + dataset, _ = deduplicate_and_log_datasets(dataset=dataset) + + split_dataset = dataset.train_test_split( + test_size=val_set_size, + shuffle=False, + seed=cfg.seed, + train_new_fingerprint=train_fingerprint, + test_new_fingerprint=test_fingerprint, + ) + + return split_dataset["train"], split_dataset["test"] + + +def _generate_from_iterable_dataset( + dataset: IterableDataset, worker_id: list[int], num_workers: list[int] +) -> Generator[Any, None, None]: + """Generator function to correctly split the dataset for each worker""" + for i, item in enumerate(dataset): + if i % num_workers[0] == worker_id[0]: + yield item + + +def save_preprocessed_dataset( + cfg: DictDefault, + dataset: Dataset, + dataset_hash: str, + split: str, +) -> None: + """Save preprocessed dataset to disk and optionally push to the HF Hub.""" + prepared_ds_path = get_prepared_dataset_path(cfg, dataset_hash) + num_workers = cfg.dataset_processes or get_default_process_count() + if isinstance(dataset, IterableDataset): + ds_from_iter = Dataset.from_generator( + functools.partial(_generate_from_iterable_dataset, dataset), + features=dataset.features, + num_proc=num_workers, + split=split, + gen_kwargs={ + "worker_id": list(range(num_workers)), + "num_workers": [num_workers] * num_workers, + }, + ) + ds_from_iter.save_to_disk( + str(prepared_ds_path), + num_proc=num_workers, + max_shard_size=None, + num_shards=cfg.num_dataset_shards_to_save, + ) + else: + min_rows_per_proc = 256 + os.makedirs(prepared_ds_path, exist_ok=True) + dataset.save_to_disk( + str(prepared_ds_path), + num_proc=min(max(1, len(dataset) // min_rows_per_proc), num_workers), + max_shard_size=None, + num_shards=cfg.num_dataset_shards_to_save, + ) + if cfg.push_dataset_to_hub: + LOG.info( + "Pushing merged prepared dataset to Huggingface hub at " + f"{cfg.push_dataset_to_hub} (version {dataset_hash})...", + main_process_only=False, + ) + dataset.push_to_hub( + cfg.push_dataset_to_hub, + dataset_hash, + private=True, + ) + + +def load_preprocessed_dataset(cfg: DictDefault, dataset_hash: str) -> Dataset | None: + """Load preprocessed dataset from disk if available. + + Args: + cfg: Configuration object. 
+ dataset_hash: Hash identifying the dataset configuration. + + Returns: + Loaded dataset if found and conditions are met, None otherwise. + """ + prepared_ds_path = get_prepared_dataset_path(cfg, dataset_hash) + + if ( + cfg.dataset_prepared_path + and any(prepared_ds_path.glob("*")) + and not cfg.skip_prepare_dataset + and not cfg.is_preprocess + ): + LOG.info( + f"Loading prepared dataset from disk at {prepared_ds_path}...", + main_process_only=True, + ) + return load_from_disk(str(prepared_ds_path)) + + LOG.info( + f"Unable to find prepared dataset in {prepared_ds_path}", + main_process_only=True, + ) + return None + + +def try_load_from_hub( + cfg: DictDefault, dataset_hash: str, split: str +) -> Dataset | None: + """Try to load the prepared dataset from HuggingFace Hub.""" + try: + LOG.info( + "Attempting to load prepared dataset from HuggingFace Hub at " + f"{cfg.push_dataset_to_hub} (version {dataset_hash})..." + ) + dataset = load_dataset( + cfg.push_dataset_to_hub, + dataset_hash, + token=cfg.hf_use_auth_token, + ) + return dataset[split] + except Exception: # pylint: disable=broad-except # nosec + LOG.info("Unable to find prepared dataset in HuggingFace Hub") + return None + + +def generate_dataset_hash_from_config( + cfg: DictDefault, cfg_datasets: list, tokenizer_name: str +) -> str: + """Generate a hash to uniquely identify a dataset configuration for SFT. + + Args: + cfg: Main configuration object. + cfg_datasets: List of dataset configurations. + tokenizer_name: Name of the tokenizer being used. + + Returns: + MD5 hash string representing the configuration. + """ + config_str = ( + f"{cfg.sequence_len}@{cfg.sample_packing}@{cfg.eval_sample_packing}@" + f"{cfg.group_by_length}@{cfg.kd_temperature or 1.0}|" + f"{'|'.join(sorted([f'{d.path}:{d.type}:{d.shards}:{d.conversation}:{d.split}:{d.temperature or 1.0}' for d in cfg_datasets]))}" + f"|{tokenizer_name}" + ) + return str(md5(config_str)) + + +def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset: + """Merge multiple datasets into one with optional shuffling. + + Args: + datasets: List of datasets to merge. + cfg: Configuration object containing shuffle settings. + + Returns: + Merged dataset. + """ + if len(datasets) == 1: + ds = datasets[0] + + # Do not shuffle if curriculum sampling is enabled or + # shuffle_merged_datasets is disabled + if cfg.curriculum_sampling or not cfg.shuffle_merged_datasets: + return ds + + return ds.shuffle(seed=cfg.seed) + + # If enabled, shuffle each dataset independently before merging. + # This allows curriculum learning strategies to be applied at the dataset level. + if cfg.shuffle_before_merging_datasets: + LOG.info("Shuffling each dataset individually before merging...") + datasets = [ds.shuffle(seed=cfg.seed) for ds in datasets] + + LOG.info("Merging datasets...") + merged_dataset = concatenate_datasets(datasets) + + if cfg.shuffle_merged_datasets: + LOG.debug("Shuffling merged datasets...") + if cfg.curriculum_sampling: + LOG.warning( + "Shuffling merged datasets with curriculum sampling is not recommended. " + "This will randomize the order of samples." 
+ ) + merged_dataset = merged_dataset.shuffle(seed=cfg.seed) + else: + LOG.debug("Not shuffling merged datasets.") + + return merged_dataset diff --git a/src/axolotl/utils/data/utils.py b/src/axolotl/utils/data/utils.py index 4e5d27f33..33126ea34 100644 --- a/src/axolotl/utils/data/utils.py +++ b/src/axolotl/utils/data/utils.py @@ -1,10 +1,11 @@ -"""data handling helpers""" +"""Data handling helpers""" +import contextlib import functools import hashlib -import logging import time from enum import Enum +from typing import Callable import huggingface_hub import numpy as np @@ -12,18 +13,17 @@ import requests from datasets import Dataset, IterableDataset from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger from axolotl.utils.samplers.utils import get_dataset_lengths from axolotl.utils.trainer import truncate_or_drop_long_seq -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) DEFAULT_SEQUENCE_LEN_OVERFLOW_HANDLING = "drop" class RetryStrategy(Enum): - """ - Enum for retry strategies. - """ + """Enum for retry strategies.""" CONSTANT = 1 LINEAR = 2 @@ -32,7 +32,18 @@ class RetryStrategy(Enum): def retry_on_request_exceptions( max_retries=3, delay=1, retry_strategy: RetryStrategy = RetryStrategy.LINEAR -): +) -> Callable: + """Decorator that retries function calls on specific request exceptions. + + Args: + max_retries: Maximum number of retry attempts. + delay: Base delay between retries in seconds. + retry_strategy: Strategy for calculating retry delays. + + Returns: + Decorated function with retry logic. + """ + def decorator(func): @functools.wraps(func) def wrapper(*args, **kwargs): # pylint: disable=inconsistent-return-statements @@ -42,6 +53,7 @@ def retry_on_request_exceptions( except ( requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, + requests.exceptions.HTTPError, huggingface_hub.errors.HfHubHTTPError, ) as exc: if attempt < max_retries - 1: @@ -61,6 +73,7 @@ def retry_on_request_exceptions( def md5(to_hash: str, encoding: str = "utf-8") -> str: + """Generate MD5 hash of a string.""" try: return hashlib.md5(to_hash.encode(encoding), usedforsecurity=False).hexdigest() except TypeError: @@ -68,134 +81,118 @@ def md5(to_hash: str, encoding: str = "utf-8") -> str: def sha256(to_hash: str, encoding: str = "utf-8") -> str: + """Generate SHA256 hash of a string.""" return hashlib.sha256(to_hash.encode(encoding)).hexdigest() -def deduplicate_dataset( - dataset: Dataset, seen_hashes: dict[str, list[int]], other_dataset: Dataset = None -) -> Dataset: - unique_indices = [] +def _deduplicate_dataset( + dataset: Dataset, + seen_hashes: set[str] | None = None, +) -> tuple[Dataset, set[str]]: + """Remove duplicate rows from a dataset using SHA256 hashes. + Args: + dataset: Dataset to deduplicate. + seen_hashes: Set of previously seen row hashes (for cross-deduplication). + + Returns: + Tuple of deduplicated dataset and the set of seen hashes. + """ + if seen_hashes is None: + seen_hashes = set() + + unique_indices = [] for idx, row in enumerate(dataset): - row_hash = sha256(str(row)) # Using SHA256 for collision resistance. 
+ row_hash = sha256(str(row)) # Using SHA256 for collision resistance if row_hash not in seen_hashes: - seen_hashes[row_hash] = [idx] + seen_hashes.add(row_hash) unique_indices.append(idx) - else: - # Check for collision by looking up the original dataset indices - original_indices = seen_hashes[row_hash] - is_duplicate = False - for original_idx in original_indices: - if ( - not idx == original_idx - and original_idx < len(dataset) - and str(dataset[original_idx]) == str(row) - ): - is_duplicate = True - break - # Check in the other dataset if provided - if other_dataset is not None: - if original_idx < len(other_dataset) and str( - other_dataset[original_idx] - ) == str(row): - is_duplicate = True - break - if not is_duplicate: - seen_hashes[row_hash].append(idx) - unique_indices.append(idx) - continue - return dataset.select(unique_indices) + + return dataset.select(unique_indices), seen_hashes def deduplicate_and_log_datasets( - *, - train_dataset: Dataset = None, - eval_dataset: Dataset = None, - dataset: Dataset = None, -) -> tuple[Dataset, Dataset, Dataset]: - """ - Deduplicates train, eval, and an optional dataset if provided, logging original and new sizes. + dataset: Dataset, + other_dataset: Dataset | None = None, + dataset_name: str | None = "train", + other_name: str | None = "eval", +) -> tuple[Dataset, Dataset | None]: + """Deduplicate datasets, with optional cross-dataset deduplication. + + Args: + dataset: Primary dataset to deduplicate. + other_dataset: Optional second dataset to deduplicate against the first. + dataset_name: Name for the primary dataset (for logging). + other_name: Name for the second dataset (for logging). Returns: - tuple: Deduplicated train, eval, and additional datasets. + Tuple of (deduplicated_dataset, deduplicated_other_dataset). """ - seen_hashes: dict[str, list[int]] = {} + # Deduplicate primary dataset + LOG.info( + f"Starting deduplication for {dataset_name} dataset. Original size: {len(dataset)}" + ) + dataset, seen_rows = _deduplicate_dataset(dataset) + LOG.info( + f"Deduplication complete for {dataset_name} dataset. New size: {len(dataset)}" + ) - # Handle cases where datasets are None - if train_dataset is not None: + # Deduplicate second dataset if provided + if other_dataset is not None: LOG.info( - f"Starting deduplication for train dataset. Original size: {len(train_dataset)}" - ) - train_dataset = deduplicate_dataset( - dataset=train_dataset, seen_hashes=seen_hashes + f"Starting deduplication for {other_name} dataset. Original size: {len(other_dataset)}" ) + other_dataset, _ = _deduplicate_dataset(other_dataset, seen_rows) LOG.info( - f"Deduplication complete for train dataset. New size: {len(train_dataset)}" - ) - else: - LOG.info("Train dataset is None. Skipping deduplication.") - - if eval_dataset is not None: - LOG.info( - f"Starting deduplication for eval dataset. Original size: {len(eval_dataset)}" - ) - eval_dataset = deduplicate_dataset( - dataset=eval_dataset, seen_hashes=seen_hashes, other_dataset=train_dataset - ) - LOG.info( - f"Deduplication complete for eval dataset. New size: {len(eval_dataset)}" - ) - else: - LOG.info("Eval dataset is None. Skipping deduplication.") - - if dataset is not None and (eval_dataset is None and train_dataset is None): - LOG.info( - f"Starting deduplication for combined dataset. Original size: {len(dataset)}" - ) - dataset = deduplicate_dataset(dataset=dataset, seen_hashes=seen_hashes) - LOG.info( - f"Deduplication complete for combined dataset. 
New size: {len(dataset)}" + f"Deduplication complete for {other_name} dataset. New size: {len(other_dataset)}" ) - return train_dataset, eval_dataset, dataset + return dataset, other_dataset -def drop_long_seq_in_dataset(dataset: Dataset, cfg: DictDefault): +def drop_long_seq_in_dataset( + dataset: Dataset, sequence_len: int, cfg: DictDefault +) -> Dataset: + """Remove sequences longer than configured maximum from dataset. + + Args: + dataset: Dataset to filter. + sequence_len: Maximum length for sequences to keep + cfg: Dictionary mapping `axolotl` config keys to values. + + Returns: + Filtered dataset with long sequences removed. + """ if "input_ids" not in dataset.column_names: LOG.warning( - "Dataset does not contain 'input_ids' column. Skip drop long seq. This is expected for RewardModeling." + "Dataset does not contain 'input_ids' column. Skip drop long seq. This is " + "expected for reward modeling." ) return dataset - # Get the handling method from config, default to "drop" for backward compatibility - # Support legacy alias "excess_token_handling" as well + # Get the handling method from config, default to "drop" for backward compatibility. + # Support legacy alias "excess_token_handling" as well. handling = cfg.get( "sequence_len_overflow_handling", - cfg.get("excess_token_handling", "drop"), + cfg.get("excess_token_handling", DEFAULT_SEQUENCE_LEN_OVERFLOW_HANDLING), ) - # Use the new function with the specified handling mode + # Use the function with the specified handling mode seq_handler = functools.partial( truncate_or_drop_long_seq, - sequence_len=cfg.sequence_len, + sequence_len=sequence_len, min_sequence_len=cfg.min_sample_len, handling=handling, ) - try: + with contextlib.suppress(AttributeError): ds_lengths = get_dataset_lengths(dataset, from_arrow=True) min_input_len = np.min(ds_lengths) LOG.info(f"min_input_len: {min_input_len}") max_input_len = np.max(ds_lengths) LOG.info(f"max_input_len: {max_input_len}") - except AttributeError: - pass - try: - prior_len = len(dataset) - except TypeError: - # handle iterable datasets case - prior_len = None + prior_len = len(dataset) if hasattr(dataset, "__len__") else None filter_map_kwargs = {} if not isinstance(dataset, IterableDataset): @@ -207,7 +204,7 @@ def drop_long_seq_in_dataset(dataset: Dataset, cfg: DictDefault): if handling == "truncate": drop_long_kwargs["desc"] = "Truncating Long Sequences" else: # handling == "drop" - drop_long_kwargs["desc"] = "Dropping Long Sequences" + drop_long_kwargs["desc"] = f"Dropping Long Sequences (>{sequence_len})" if handling == "truncate": # Use map for truncate mode @@ -217,7 +214,7 @@ def drop_long_seq_in_dataset(dataset: Dataset, cfg: DictDefault): **filter_map_kwargs, **drop_long_kwargs, ) - LOG.info(f"Truncated long samples in dataset to {cfg.sequence_len} tokens") + LOG.info(f"Truncated long samples in dataset to {sequence_len} tokens") else: # handling == "drop" # Use filter for drop mode dataset = dataset.filter( diff --git a/src/axolotl/utils/data/wrappers.py b/src/axolotl/utils/data/wrappers.py new file mode 100644 index 000000000..b6dc42c71 --- /dev/null +++ b/src/axolotl/utils/data/wrappers.py @@ -0,0 +1,425 @@ +"""Data handling specific to SFT.""" + +import logging +from typing import Any, NoReturn, cast + +from datasets import ( + Dataset, + IterableDataset, + Sequence, + Value, +) +from transformers import PreTrainedTokenizer +from transformers.processing_utils import ProcessorMixin + +from axolotl.datasets import TokenizedPromptDataset, wrap_dataset_for_tokenized_prompt 
+from axolotl.prompt_strategies import load +from axolotl.prompt_strategies.bradley_terry import load as bradley_terry_load +from axolotl.prompt_tokenizers import ( + AlpacaMultipleChoicePromptTokenizingStrategy, + AlpacaPromptTokenizingStrategy, + AlpacaReflectionPTStrategy, + DatasetWrappingStrategy, + GPTeacherPromptTokenizingStrategy, + JeopardyPromptTokenizingStrategy, + OpenAssistantPromptTokenizingStrategy, + PromptTokenizingStrategy, + SummarizeTLDRPromptTokenizingStrategy, +) +from axolotl.prompters import ( + AlpacaPrompter, + GPTeacherPrompter, + JeopardyPrompter, + MultipleChoiceConcisePrompter, + MultipleChoiceExplainPrompter, + Prompter, + ReflectAlpacaPrompter, + SummarizeTLDRPrompter, + UnsupportedPrompter, +) +from axolotl.utils.dict import DictDefault + +LOG = logging.getLogger(__name__) + + +def handle_unknown_dataset_strategy(dataset_config: DictDefault) -> NoReturn: + """Raise error for unknown dataset strategy.""" + ds_type = dataset_config.type + suffix = "" + if ":load_" in ds_type: + suffix = f"Did you mean {ds_type.replace(':load_', '.load_')}?" + + error_message = f"unhandled prompt tokenization strategy: {ds_type}. {suffix}" + LOG.error(error_message) + raise ValueError(error_message) + + +# pylint: disable=too-many-return-statements +def get_dataset_wrapper( + dataset_config: DictDefault, + tokenizer: PreTrainedTokenizer, + cfg: DictDefault, + dataset_base_type: str | None, + dataset: Dataset | IterableDataset, + dataset_prompt_style: str | None = None, + processor: ProcessorMixin | None = None, # pylint: disable=unused-argument +) -> tuple[Dataset | IterableDataset, Prompter | None]: + """Create an appropriate dataset wrapper and prompter based on dataset + configuration. + + Args: + dataset_config: Configuration for the dataset. + tokenizer: Tokenizer to use for processing text. + cfg: Global configuration object. + dataset_base_type: The base type of the dataset. + dataset: The actual dataset object. + dataset_prompt_style: Optional prompt style specification. + processor: Optional processor for multimodal datasets. + + Returns: + tuple of (dataset_wrapper, dataset_prompter). 
+ """ + # Common parameters for dataset wrapping + dataset_kwargs: dict[str, Any] = { + "process_count": cfg.dataset_processes, + "keep_in_memory": cfg.dataset_keep_in_memory is True, + } + + LOG.info( + f"Loading dataset: {dataset_config['path']} with base_type: " + f"{dataset_base_type} and prompt_style: {dataset_prompt_style}" + ) + + # Dataset is already tokenized + if _is_dataset_already_tokenized(dataset): + return dataset, UnsupportedPrompter() + + # Custom dataset type definition + if isinstance(dataset_config.type, DictDefault): + return _handle_custom_dataset_type( + dataset_config, tokenizer, cfg, dataset, dataset_kwargs + ) + + # Skip preparation if configured + if cfg.skip_prepare_dataset: + return dataset, None + + # Bradley-Terry dataset + if dataset_config.type.startswith("bradley_terry"): + return _handle_bradley_terry_dataset( + dataset_config, tokenizer, cfg, dataset, dataset_kwargs + ) + + # Stepwise supervised dataset + if dataset_config.type.startswith("stepwise_supervised"): + return _handle_stepwise_supervised_dataset( + dataset_config, tokenizer, cfg, dataset, dataset_kwargs + ) + + # Try to load prompt tokenizer / dataset wrapper strategy from registry + dataset_strategy = load( + dataset_config.type, tokenizer, cfg, dataset_config, processor=processor + ) + if dataset_strategy: + return _handle_loaded_strategy(dataset_strategy, dataset, dataset_kwargs) + + # Known dataset types with specific handling + if dataset_base_type in DATASET_HANDLERS: + handler = DATASET_HANDLERS[dataset_base_type] + return handler(dataset_prompt_style, tokenizer, cfg, dataset, dataset_kwargs) + + # Unhandled dataset type + handle_unknown_dataset_strategy(dataset_config) + + +def _is_dataset_already_tokenized(dataset: Dataset | IterableDataset) -> bool: + """Check if the dataset is already tokenized.""" + return ( + isinstance(dataset, Dataset) + and "input_ids" in dataset.features + and "attention_mask" in dataset.features + and "labels" in dataset.features + ) + + +def _handle_custom_dataset_type( + dataset_config: DictDefault, + tokenizer: PreTrainedTokenizer, + cfg: DictDefault, + dataset: Dataset | IterableDataset, + dataset_kwargs: dict[str, Any], +) -> tuple[Dataset | IterableDataset, Prompter]: + """Handle a custom dataset type defined in the configuration.""" + dataset_strategy = cast( + PromptTokenizingStrategy, + load("user_defined", tokenizer, cfg, dataset_config.type.to_dict()), + ) + dataset_prompter = UnsupportedPrompter() + dataset_wrapper = wrap_dataset_for_tokenized_prompt( + dataset_strategy, + dataset, + **dataset_kwargs, + ) + return dataset_wrapper, dataset_prompter + + +def _handle_bradley_terry_dataset( + dataset_config: DictDefault, + tokenizer: PreTrainedTokenizer, + cfg: DictDefault, + dataset: Dataset | IterableDataset, + dataset_kwargs: dict[str, Any], +) -> tuple[Dataset | IterableDataset, Prompter | None]: + """Handle a Bradley-Terry dataset.""" + bt_type = dataset_config.type.split(".", 1)[1] + dataset_strategy = bradley_terry_load(bt_type, tokenizer, cfg, dataset_config) + + if not dataset_strategy: + handle_unknown_dataset_strategy(dataset_config) + + dataset_prompter = UnsupportedPrompter() + dataset_wrapper = wrap_dataset_for_tokenized_prompt( + dataset_strategy, + dataset, + **dataset_kwargs, + ) + + return dataset_wrapper, dataset_prompter + + +def _handle_stepwise_supervised_dataset( + dataset_config: DictDefault, + tokenizer: PreTrainedTokenizer, + cfg: DictDefault, + dataset: Dataset | IterableDataset, + dataset_kwargs: dict[str, Any], +) -> 
tuple[Dataset | IterableDataset, Prompter]: + """Handle a stepwise supervised dataset.""" + dataset_prompter = UnsupportedPrompter() + dataset_strategy = load(dataset_config.type, tokenizer, cfg, dataset_config) + + # We need to explicitly cast boolean labels to int + # for compatibility with how trl's PRMTrainer works + if isinstance(dataset, Dataset): + dataset = dataset.cast_column("labels", Sequence(Value("int64"))) + + dataset_wrapper = TokenizedPromptDataset( + dataset_strategy, + dataset, + **dataset_kwargs, + ) + return dataset_wrapper, dataset_prompter + + +def _handle_loaded_strategy( + dataset_strategy: PromptTokenizingStrategy | DatasetWrappingStrategy, + dataset: Dataset | IterableDataset, + dataset_kwargs: dict[str, Any], +) -> tuple[Dataset | IterableDataset, Prompter | None]: + """Handle a dataset with a strategy loaded from the registry.""" + if isinstance(dataset_strategy, DatasetWrappingStrategy): + return dataset_strategy.wrap_dataset(dataset, **dataset_kwargs), None + + dataset_prompter = UnsupportedPrompter() + dataset_wrapper = wrap_dataset_for_tokenized_prompt( + dataset_strategy, + dataset, + **dataset_kwargs, + ) + return dataset_wrapper, dataset_prompter + + +def _handle_alpaca_dataset( + dataset_prompt_style: str | None, + tokenizer: PreTrainedTokenizer, + cfg: DictDefault, + dataset: Dataset | IterableDataset, + dataset_kwargs: dict[str, Any], +) -> tuple[Dataset | IterableDataset, Prompter]: + """Handle an Alpaca dataset.""" + dataset_prompter = AlpacaPrompter(dataset_prompt_style) + dataset_strategy = AlpacaPromptTokenizingStrategy( + dataset_prompter, + tokenizer, + cfg.train_on_inputs, + cfg.sequence_len, + ) + dataset_wrapper = wrap_dataset_for_tokenized_prompt( + dataset_strategy, + dataset, + **dataset_kwargs, + ) + return dataset_wrapper, dataset_prompter + + +def _handle_explainchoice_dataset( + dataset_prompt_style: str | None, + tokenizer: PreTrainedTokenizer, + cfg: DictDefault, + dataset: Dataset | IterableDataset, + dataset_kwargs: dict[str, Any], +) -> tuple[Dataset | IterableDataset, Prompter]: + """Handle an ExplainChoice dataset.""" + dataset_prompter = MultipleChoiceExplainPrompter(dataset_prompt_style) + dataset_strategy = AlpacaMultipleChoicePromptTokenizingStrategy( + dataset_prompter, + tokenizer, + cfg.train_on_inputs, + cfg.sequence_len, + ) + dataset_wrapper = wrap_dataset_for_tokenized_prompt( + dataset_strategy, + dataset, + **dataset_kwargs, + ) + return dataset_wrapper, dataset_prompter + + +def _handle_concisechoice_dataset( + dataset_prompt_style: str | None, + tokenizer: PreTrainedTokenizer, + cfg: DictDefault, + dataset: Dataset | IterableDataset, + dataset_kwargs: dict[str, Any], +) -> tuple[Dataset | IterableDataset, Prompter]: + """Handle a ConciseChoice dataset.""" + dataset_prompter = MultipleChoiceConcisePrompter(dataset_prompt_style) + dataset_strategy = AlpacaMultipleChoicePromptTokenizingStrategy( + dataset_prompter, + tokenizer, + cfg.train_on_inputs, + cfg.sequence_len, + ) + dataset_wrapper = wrap_dataset_for_tokenized_prompt( + dataset_strategy, + dataset, + **dataset_kwargs, + ) + return dataset_wrapper, dataset_prompter + + +def _handle_summarizetldr_dataset( + dataset_prompt_style: str | None, + tokenizer: PreTrainedTokenizer, + cfg: DictDefault, + dataset: Dataset | IterableDataset, + dataset_kwargs: dict[str, Any], +) -> tuple[Dataset | IterableDataset, Prompter]: + """Handle a SummarizeTLDR dataset.""" + dataset_prompter = SummarizeTLDRPrompter(dataset_prompt_style) + dataset_strategy = 
SummarizeTLDRPromptTokenizingStrategy( + dataset_prompter, + tokenizer, + cfg.train_on_inputs, + cfg.sequence_len, + ) + dataset_wrapper = wrap_dataset_for_tokenized_prompt( + dataset_strategy, + dataset, + **dataset_kwargs, + ) + return dataset_wrapper, dataset_prompter + + +def _handle_jeopardy_dataset( + dataset_prompt_style: str | None, + tokenizer: PreTrainedTokenizer, + cfg: DictDefault, + dataset: Dataset | IterableDataset, + dataset_kwargs: dict[str, Any], +) -> tuple[Dataset | IterableDataset, Prompter]: + """Handle a Jeopardy dataset.""" + dataset_prompter = JeopardyPrompter(dataset_prompt_style) + dataset_strategy = JeopardyPromptTokenizingStrategy( + dataset_prompter, + tokenizer, + cfg.train_on_inputs, + cfg.sequence_len, + ) + dataset_wrapper = wrap_dataset_for_tokenized_prompt( + dataset_strategy, + dataset, + **dataset_kwargs, + ) + return dataset_wrapper, dataset_prompter + + +def _handle_oasst_dataset( + dataset_prompt_style: str | None, + tokenizer: PreTrainedTokenizer, + cfg: DictDefault, + dataset: Dataset | IterableDataset, + dataset_kwargs: dict[str, Any], +) -> tuple[Dataset | IterableDataset, Prompter]: + """Handle an OpenAssistant dataset.""" + dataset_prompter = AlpacaPrompter(dataset_prompt_style) + dataset_strategy = OpenAssistantPromptTokenizingStrategy( + dataset_prompter, + tokenizer, + cfg.train_on_inputs, + cfg.sequence_len, + ) + dataset_wrapper = wrap_dataset_for_tokenized_prompt( + dataset_strategy, + dataset, + **dataset_kwargs, + ) + return dataset_wrapper, dataset_prompter + + +def _handle_gpteacher_dataset( + dataset_prompt_style: str | None, + tokenizer: PreTrainedTokenizer, + cfg: DictDefault, + dataset: Dataset | IterableDataset, + dataset_kwargs: dict[str, Any], +) -> tuple[Dataset | IterableDataset, Prompter]: + """Handle a GPTeacher dataset.""" + dataset_prompter = GPTeacherPrompter(dataset_prompt_style) + dataset_strategy = GPTeacherPromptTokenizingStrategy( + dataset_prompter, + tokenizer, + cfg.train_on_inputs, + cfg.sequence_len, + ) + dataset_wrapper = wrap_dataset_for_tokenized_prompt( + dataset_strategy, + dataset, + **dataset_kwargs, + ) + return dataset_wrapper, dataset_prompter + + +def _handle_reflection_dataset( + dataset_prompt_style: str | None, + tokenizer: PreTrainedTokenizer, + cfg: DictDefault, + dataset: Dataset | IterableDataset, + dataset_kwargs: dict[str, Any], +) -> tuple[Dataset | IterableDataset, Prompter]: + """Handle a Reflection dataset.""" + dataset_prompter = ReflectAlpacaPrompter(dataset_prompt_style) + dataset_strategy = AlpacaReflectionPTStrategy( + dataset_prompter, + tokenizer, + cfg.train_on_inputs, + cfg.sequence_len, + ) + dataset_wrapper = wrap_dataset_for_tokenized_prompt( + dataset_strategy, + dataset, + **dataset_kwargs, + ) + return dataset_wrapper, dataset_prompter + + +DATASET_HANDLERS = { + "alpaca": _handle_alpaca_dataset, + "explainchoice": _handle_explainchoice_dataset, + "concisechoice": _handle_concisechoice_dataset, + "summarizetldr": _handle_summarizetldr_dataset, + "jeopardy": _handle_jeopardy_dataset, + "oasst": _handle_oasst_dataset, + "gpteacher": _handle_gpteacher_dataset, + "reflection": _handle_reflection_dataset, +} diff --git a/src/axolotl/utils/datasets.py b/src/axolotl/utils/datasets.py new file mode 100644 index 000000000..93e1a2416 --- /dev/null +++ b/src/axolotl/utils/datasets.py @@ -0,0 +1,11 @@ +"""helper functions for datasets""" + +import os + + +def get_default_process_count(): + if axolotl_dataset_processes := os.environ.get("AXOLOTL_DATASET_PROCESSES"): + return 
int(axolotl_dataset_processes) + if runpod_cpu_count := os.environ.get("RUNPOD_CPU_COUNT"): + return int(runpod_cpu_count) + return os.cpu_count() diff --git a/src/axolotl/utils/dict.py b/src/axolotl/utils/dict.py index f24f7c4a9..c2670dfeb 100644 --- a/src/axolotl/utils/dict.py +++ b/src/axolotl/utils/dict.py @@ -36,3 +36,16 @@ class DictDefault(Dict): p[key] = self object.__delattr__(self, "__parent") object.__delattr__(self, "__key") + + +def remove_none_values(obj): + """ + Remove null from a dictionary-like obj or list. + These can appear due to Dataset loading causing schema merge. + See https://github.com/axolotl-ai-cloud/axolotl/pull/2909 + """ + if hasattr(obj, "items"): + return {k: remove_none_values(v) for k, v in obj.items() if v is not None} + if isinstance(obj, list): + return [remove_none_values(elem) for elem in obj] + return obj diff --git a/src/axolotl/utils/distributed.py b/src/axolotl/utils/distributed.py index 8c52102c8..48771fd97 100644 --- a/src/axolotl/utils/distributed.py +++ b/src/axolotl/utils/distributed.py @@ -1,6 +1,4 @@ -""" -utility helpers for distributed checks -""" +"""Utilities for distributed functionality.""" import os import pickle # nosec @@ -10,6 +8,7 @@ from datetime import timedelta import torch import torch.distributed as dist from accelerate import PartialState +from accelerate.utils import ParallelismConfig from transformers.utils.import_utils import ( is_torch_cuda_available, is_torch_mps_available, @@ -19,7 +18,7 @@ from transformers.utils.import_utils import ( distributed_state = None # pylint: disable=invalid-name -def get_device_type(): +def get_device_type() -> torch.device: device = torch.device("cpu") if is_torch_cuda_available(): device = torch.device("cuda") @@ -30,7 +29,7 @@ def get_device_type(): return device -def get_device_count(): +def get_device_count() -> int: cur_device = get_device_type() if "cuda" in str(cur_device): return torch.cuda.device_count() @@ -39,7 +38,7 @@ def get_device_count(): return 1 -def get_current_device(): +def get_current_device() -> int: cur_device = get_device_type() if "cuda" in str(cur_device): return torch.cuda.current_device() @@ -48,14 +47,26 @@ def get_current_device(): return 0 -def is_distributed(): - """ - Check if distributed training is initialized. - """ +def init_distributed_state(): global distributed_state # pylint: disable=global-statement - if not distributed_state: + if distributed_state is None: timeout = int(os.environ.get("AXOLOTL_NCCL_TIMEOUT", 1800)) - distributed_state = PartialState(timeout=timedelta(seconds=timeout)) + try: + distributed_state = PartialState(timeout=timedelta(seconds=timeout)) + except ValueError: + pass + + +def get_distributed_state() -> PartialState | None: + return distributed_state + + +def is_distributed() -> bool: + """Check if distributed training is initialized.""" + init_distributed_state() + + if distributed_state is None: + return False return distributed_state.use_distributed and distributed_state.initialized @@ -69,31 +80,31 @@ def barrier(): dist.barrier() -def is_main_process(use_environ=False): +def is_main_process() -> bool: """ Check if the current process is the main process. If not in distributed mode, always return `True`. - Args: - - use_environ (bool, optional): Use environment variable to determine main process. + We use a simpler logic when the distributed state is not initialized: we just log + on the 0-th local rank. Returns: - - bool: `True` if the current process is the main process, `False` otherwise. 
+ `True` if the current process is the main process, `False` otherwise. """ - if use_environ: + if get_distributed_state() is None: return os.environ.get("LOCAL_RANK", "0") == "0" if not is_distributed(): return True return dist.get_rank() == 0 -def is_local_main_process(use_environ=False): - if use_environ: +def is_local_main_process() -> bool: + if get_distributed_state() is None: return os.environ.get("LOCAL_RANK", "0") == "0" return PartialState().is_local_main_process -def get_world_size(): +def get_world_size() -> int: return int(os.getenv("WORLD_SIZE", "1")) @@ -103,14 +114,19 @@ def cleanup_distributed(): termination or when training successfully completes. """ # Ensure that all operations are completed before destroying the process group - torch.cuda.synchronize() + if torch.cuda.is_available(): + torch.cuda.synchronize() + + if torch.xpu.is_available(): + torch.xpu.synchronize() + # Destroy the process group if torch.distributed.is_initialized(): torch.distributed.destroy_process_group() @contextmanager -def zero_first(is_main): +def zero_first(is_main: bool): """ runs the wrapped context so that rank 0 runs first before other ranks """ @@ -278,3 +294,77 @@ def reduce_and_broadcast(fn1, fn2): # Use compute_and_broadcast to compute the reduced value on the main process # and then broadcast it to all ranks return compute_and_broadcast(lambda: fn2(gathered_values)) + + +def build_parallelism_config(cfg): + pc_kwargs = _get_parallel_config_kwargs( + get_world_size(), + cfg.tensor_parallel_size, + cfg.context_parallel_size, + cfg.dp_shard_size, + cfg.dp_replicate_size, + bool(cfg.fsdp or cfg.fsdp_config), + ) + + if pc_kwargs: + parallelism_config = ParallelismConfig( + **pc_kwargs, + ) + device_mesh = parallelism_config.build_device_mesh("cuda") + + return parallelism_config, device_mesh + return None, None + + +def _get_parallel_config_kwargs( + world_size: int, + tensor_parallel_size: int = 1, + context_parallel_size: int = 1, + dp_shard_size: int | None = None, + dp_replicate_size: int | None = None, + is_fsdp: bool = False, +): + pc_kwargs = {} + remaining_world_size = world_size + + if tensor_parallel_size and tensor_parallel_size > 1: + pc_kwargs["tp_size"] = tensor_parallel_size + remaining_world_size = remaining_world_size // tensor_parallel_size + + if context_parallel_size and context_parallel_size > 1: + pc_kwargs["cp_size"] = context_parallel_size + remaining_world_size = remaining_world_size // context_parallel_size + + if dp_shard_size is None and dp_replicate_size in (None, 1): + if remaining_world_size > 1: + pc_kwargs["dp_shard_size"] = remaining_world_size + remaining_world_size = 1 + + if dp_replicate_size and dp_replicate_size > 1: + pc_kwargs["dp_replicate_size"] = dp_replicate_size + remaining_world_size = remaining_world_size // dp_replicate_size + + if remaining_world_size > 1 and dp_shard_size and dp_shard_size > 1: + if not is_fsdp: + raise ValueError( + "dp_shard_size was configured without a corresponding fsdp_config! " + "Please ensure you have configured FSDP using fsdp_config." 
+ ) + pc_kwargs["dp_shard_size"] = dp_shard_size + remaining_world_size = remaining_world_size // dp_shard_size + if remaining_world_size > 1 and "dp_replicate_size" not in pc_kwargs: + pc_kwargs["dp_replicate_size"] = remaining_world_size + remaining_world_size = 1 + + if remaining_world_size > 1: + if "dp_shard_size" not in pc_kwargs and is_fsdp: + pc_kwargs["dp_shard_size"] = remaining_world_size + remaining_world_size = 1 + + if remaining_world_size > 1: + raise ValueError( + f"The configured parallelisms are incompatible with the current world size ({get_world_size()})!\n" + f"{pc_kwargs}" + ) + + return pc_kwargs diff --git a/src/axolotl/utils/environment.py b/src/axolotl/utils/environment.py index 1cc609a68..3c83c87cb 100644 --- a/src/axolotl/utils/environment.py +++ b/src/axolotl/utils/environment.py @@ -2,12 +2,15 @@ utils to get GPU info for the current environment """ +from importlib.metadata import version + from accelerate.utils.environment import ( check_cuda_p2p_ib_support as accelerate_check_cuda_p2p_ib_support, ) from accelerate.utils.environment import ( get_gpu_info, ) +from packaging.version import Version, parse def check_cuda_p2p_ib_support(): @@ -26,3 +29,13 @@ def check_cuda_p2p_ib_support(): except Exception: # pylint: disable=broad-except # nosec pass return True + + +def get_package_version(package: str) -> Version: + version_str = version(package) + return parse(version_str) + + +def is_package_version_ge(package: str, version_: str) -> bool: + package_version = get_package_version(package) + return package_version >= parse(version_) diff --git a/src/axolotl/utils/freeze.py b/src/axolotl/utils/freeze.py index 65ca62137..936708f04 100644 --- a/src/axolotl/utils/freeze.py +++ b/src/axolotl/utils/freeze.py @@ -5,9 +5,8 @@ module to freeze/unfreeze parameters by name import re from typing import Callable, List, Tuple, Union -from accelerate.logging import get_logger - from axolotl.utils.distributed import is_main_process +from axolotl.utils.logging import get_logger LOG = get_logger(__name__) diff --git a/src/axolotl/utils/import_helper.py b/src/axolotl/utils/import_helper.py new file mode 100644 index 000000000..f7d20099c --- /dev/null +++ b/src/axolotl/utils/import_helper.py @@ -0,0 +1,28 @@ +""" +Helper for importing modules from strings +""" + +import importlib + + +def get_cls_from_module_str(module_str: str): + # use importlib to dynamically load the reward function from the module + if not isinstance(module_str, str) or not module_str.strip(): + raise ValueError("module_str must be a non-empty string") + + parts = module_str.split(".") + if len(parts) < 2: + raise ValueError(f"Invalid module string format: {module_str}") + + try: + cls_name = parts[-1] + module_path = ".".join(parts[:-1]) + mod = importlib.import_module(module_path) + mod_cls = getattr(mod, cls_name) + return mod_cls + except ImportError as e: + raise ImportError(f"Failed to import module '{module_path}': {e}") from e + except AttributeError as e: + raise AttributeError( + f"Class '{cls_name}' not found in module '{module_path}': {e}" + ) from e diff --git a/src/axolotl/utils/logging.py b/src/axolotl/utils/logging.py new file mode 100644 index 000000000..7cc3530ae --- /dev/null +++ b/src/axolotl/utils/logging.py @@ -0,0 +1,49 @@ +"""Logging helpers to only log on main process.""" + +import functools +import logging +import os + +from axolotl.utils.distributed import is_main_process + +# Adapted from Accelerate +# https://github.com/huggingface/accelerate/blob/main/src/accelerate/logging.py + 
+ +class MultiProcessAdapter(logging.LoggerAdapter): + """ + Logger adapter for distributed logging, specifically to only log on main process. + """ + + @staticmethod + def _should_log(main_process_only: bool): + return not main_process_only or is_main_process() + + def log(self, level, msg, *args, **kwargs): + main_process_only = kwargs.pop("main_process_only", True) + kwargs.setdefault("stacklevel", 2) + + if self.isEnabledFor(level) and self._should_log(main_process_only): + msg, kwargs = self.process(msg, kwargs) + self.logger.log(level, msg, *args, **kwargs) + + @functools.lru_cache(maxsize=10) + def warning_once(self, *args, **kwargs): + """ + This method is identical to `logger.warning()`, but will emit the warning with the same message only once + + Note: The cache is for the function arguments, so 2 different callers using the same arguments will hit the + cache. The assumption here is that all warning messages are unique across the code. If they aren't then need to + switch to another type of cache that includes the caller frame information in the hashing function. + """ + self.warning(*args, **kwargs) + + +def get_logger(name: str, log_level: str | None = None) -> MultiProcessAdapter: + if log_level is None: + log_level = os.environ.get("AXOLOTL_LOG_LEVEL", None) + logger = logging.getLogger(name) + if log_level is not None: + logger.setLevel(log_level.upper()) + logger.root.setLevel(log_level.upper()) + return MultiProcessAdapter(logger, extra={}) diff --git a/src/axolotl/utils/mistral/__init__.py b/src/axolotl/utils/mistral/__init__.py new file mode 100644 index 000000000..eb1e2df89 --- /dev/null +++ b/src/axolotl/utils/mistral/__init__.py @@ -0,0 +1,5 @@ +"""Init for `axolotl.utils.mistral` module.""" + +from axolotl.utils.mistral.mistral_tokenizer import HFMistralTokenizer + +__all__ = ["HFMistralTokenizer"] diff --git a/src/axolotl/utils/mistral/mistral_tokenizer.py b/src/axolotl/utils/mistral/mistral_tokenizer.py new file mode 100644 index 000000000..61cbdc5b0 --- /dev/null +++ b/src/axolotl/utils/mistral/mistral_tokenizer.py @@ -0,0 +1,220 @@ +"""Wrapper for MistralTokenizer from mistral-common""" + +import os +from typing import Optional + +import numpy as np +from mistral_common.protocol.instruct.validator import ValidationMode +from mistral_common.tokens.tokenizers.utils import download_tokenizer_from_hf_hub +from torch import Tensor +from transformers.tokenization_mistral_common import MistralCommonTokenizer +from transformers.tokenization_utils_base import VERY_LARGE_INTEGER + + +class HFMistralTokenizer(MistralCommonTokenizer): + """ + Wraps mistral_common.tokens.tokenizers.mistral.MistralTokenizer + and exposes HuggingFace API for special tokens. + """ + + def __init__(self, name_or_path: str, **kwargs): + """ + Args: + name_or_path: The name or path to the tokenizer files or the repo id. + **kwargs: Additional keyword arguments passed to the parent class. + """ + kwargs.pop("mode", None) + + mode = ValidationMode.finetuning + super().__init__(**kwargs, mode=mode) + + self._name_or_path = name_or_path + + # set mode as is not set upstream + self._set_mode(mode) + + @property + def name_or_path(self) -> str: + return self._name_or_path + + @property + def chat_template(self) -> str | None: + """Chat template is not supported. Dummy method to satisfy HuggingFace API.""" + return "[This is a dummy chat template]" + + def _set_mode(self, mode: ValidationMode): + """Set the mode of the MistralRequestValidator. + + Args: + mode: The mode to set. 
+ + Raises: + RuntimeError: If the MistralRequestValidator does not have a _mode attribute. + """ + # Check if MistralRequestValidator has a _mode attribute. + # This is a private API and may change in the future. + # pylint: disable=protected-access + from mistral_common.protocol.instruct.validator import MistralRequestValidator + + if not ( + hasattr(self.tokenizer, "_chat_completion_request_validator") + and isinstance( + self.tokenizer._chat_completion_request_validator, + MistralRequestValidator, + ) + and hasattr(self.tokenizer._chat_completion_request_validator, "_mode") + ): + raise RuntimeError( + f"Unable to switch mistral tokenizer to {mode.value} mode - " + "private API `_chat_completion_request_validator._mode` missing." + ) + + self.tokenizer._chat_completion_request_validator._mode = mode + + def apply_chat_template( # type: ignore + self, + conversation: list[dict] | list[list[dict]], + chat_template: str | None = None, # pylint: disable=unused-argument + add_generation_prompt: bool = False, + **kwargs, + ) -> str | list[int]: + """Patched fn to handle setting serving mode, continue_final_message, remove chat_template and add_generation_prompt kwarg""" + + try: + if add_generation_prompt: + self._set_mode(ValidationMode.serving) + kwargs["continue_final_message"] = True + + out = super().apply_chat_template(conversation, **kwargs) + + return out # type: ignore + + finally: + if add_generation_prompt: + self._set_mode(ValidationMode.finetuning) + + def decode( # type: ignore + self, + token_ids: int | list[int] | np.ndarray | Tensor, + **kwargs, + ) -> str: + """ + Decode token_ids into str. + + This overrides upstream.decode to convert int to list[int] + """ + + if isinstance(token_ids, int): + token_ids = [token_ids] + + return super().decode(token_ids, **kwargs) + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str | os.PathLike, + *init_inputs, + mode: ValidationMode = ValidationMode.test, + cache_dir: Optional[str | os.PathLike] = None, + force_download: bool = False, + local_files_only: bool = False, + token: Optional[str | bool] = None, + revision: str = "main", + model_max_length: int = VERY_LARGE_INTEGER, + padding_side: str = "left", + truncation_side: str = "right", + model_input_names: Optional[list[str]] = None, + clean_up_tokenization_spaces: bool = False, + **kwargs, + ): + r""" + Patched fn to pass `name_or_path` and remove extra kwargs. + + Instantiate a `MistralCommonTokenizer` from a predefined + tokenizer. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + Can be either: + + - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co. + - A path to a *directory* containing the tokenizer config, for instance saved + using the [`MistralCommonTokenizer.tokenization_mistral_common.save_pretrained`] method, e.g., + `./my_model_directory/`. + mode (`ValidationMode`, *optional*, defaults to `ValidationMode.test`): + Validation mode for the `MistralTokenizer` tokenizer. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download the vocabulary files and override the cached versions if they + exist. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. 
If `True`, will use the token generated + when running `huggingface-cli login` (stored in `~/.huggingface`). + local_files_only (`bool`, *optional*, defaults to `False`): + Whether or not to only rely on local files and not to attempt to download any files. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + max_length (`int`, *optional*): + Controls the maximum length to use by one of the truncation/padding parameters. + + If left unset or set to `None`, this will use the predefined model maximum length if a maximum length + is required by one of the truncation/padding parameters. If the model has no specific maximum input + length (like XLNet) truncation/padding to a maximum length will be deactivated. + padding_side (`str`, *optional*, defaults to `"left"`): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. + truncation_side (`str`, *optional*, defaults to `"right"`): + The side on which the model should have truncation applied. Should be selected between ['right', 'left']. + model_input_names (`List[string]`, *optional*): + The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or + `"attention_mask"`). Default value is picked from the class attribute of the same name. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): + Whether or not the model should cleanup the spaces that were added when splitting the input text during the + tokenization process. + kwargs (additional keyword arguments, *optional*): + Not supported by `MistralCommonTokenizer.from_pretrained`. + Will raise an error if used. + """ + if init_inputs: + raise ValueError( + "`init_inputs` are not supported by `MistralCommonTokenizer.from_pretrained`." + ) + + # Delete trust_remote_code as it does nothing + kwargs.pop("trust_remote_code", None) + + # Delete tokenizer as it does nothing + kwargs.pop("tokenizer", None) + + # Handle kwargs and AutoTokenizer case + if kwargs and not kwargs.keys() == {"_from_auto"}: + raise ValueError( + f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.from_pretrained`." + ) + + if not os.path.isfile(pretrained_model_name_or_path): + tokenizer_path = download_tokenizer_from_hf_hub( + repo_id=str(pretrained_model_name_or_path), + cache_dir=str(cache_dir), + token=token, + revision=revision, + force_download=force_download, + local_files_only=local_files_only, + ) + else: + tokenizer_path = str(pretrained_model_name_or_path) + + return cls( + name_or_path=str(pretrained_model_name_or_path), + tokenizer_path=tokenizer_path, + mode=mode, + model_max_length=model_max_length, + padding_side=padding_side, + truncation_side=truncation_side, + model_input_names=model_input_names, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + ) diff --git a/src/axolotl/utils/quantization.py b/src/axolotl/utils/quantization.py new file mode 100644 index 000000000..f9a30b660 --- /dev/null +++ b/src/axolotl/utils/quantization.py @@ -0,0 +1,185 @@ +""" +Utilities for quantization including QAT and PTQ using torchao. 
+""" + +import torch +from torch import nn +from torchao.core.config import AOBaseConfig +from torchao.quantization import quantize_ +from torchao.quantization.qat import ( + FakeQuantizeConfig, + FromIntXQuantizationAwareTrainingConfig, + IntXQuantizationAwareTrainingConfig, +) +from torchao.quantization.quant_api import ( + Int4DynamicActivationInt4WeightConfig, + Int4WeightOnlyConfig, + Int8DynamicActivationInt4WeightConfig, + Int8DynamicActivationInt8WeightConfig, + Int8WeightOnlyConfig, + UIntXWeightOnlyConfig, + _is_linear, +) + +from axolotl.utils.schemas.enums import TorchIntDType + + +def get_ptq_config( + weight_dtype: TorchIntDType, + activation_dtype: TorchIntDType | None = None, + group_size: int | None = None, +) -> AOBaseConfig: + """ + This function is used to build a post-training quantization config. + + Args: + weight_dtype: The dtype to use for weight quantization. + activation_dtype: The dtype to use for activation quantization. + group_size: The group size to use for weight quantization. + + Returns: + The post-training quantization config. + + Raises: + ValueError: If the activation dtype is not specified and the weight dtype is not int8 or int4, + or if the group size is not specified for int8 or int4 weight only quantization. + """ + if activation_dtype is None: + if not weight_dtype.value.is_signed: # type: ignore[attr-defined,union-attr] + return UIntXWeightOnlyConfig( + dtype=weight_dtype.value, + group_size=group_size, + set_inductor_config=False, + ) + if weight_dtype == TorchIntDType.int8: + if group_size is None: + raise ValueError( + "group_size must be specified for int8 weight only quantization" + ) + return Int8WeightOnlyConfig( + group_size=group_size, + ) + if weight_dtype == TorchIntDType.int4: + if group_size is None: + raise ValueError( + "group_size must be specified for int4 weight only quantization" + ) + return Int4WeightOnlyConfig( + group_size=group_size, + ) + if activation_dtype == TorchIntDType.int4 and weight_dtype == TorchIntDType.int4: + return Int4DynamicActivationInt4WeightConfig() + if activation_dtype == TorchIntDType.int8 and weight_dtype == TorchIntDType.int8: + return Int8DynamicActivationInt8WeightConfig() + if activation_dtype == TorchIntDType.int8 and weight_dtype == TorchIntDType.int4: + return Int8DynamicActivationInt4WeightConfig() + raise ValueError( + f"Invalid activation/weight dtype combination: {activation_dtype}/{weight_dtype}" + ) + + +def prepare_model_for_qat( + model, + weight_dtype: TorchIntDType, + group_size: int, + activation_dtype: TorchIntDType | None = None, + quantize_embedding: bool = False, +): + """ + This function is used to prepare a model for QAT by swapping the model's linear + layers with fake quantized linear layers, and optionally the embedding weights with + fake quantized embedding weights. + + Args: + model: The model to quantize. + weight_dtype: The dtype to use for weight quantization. + group_size: The group size to use for weight quantization. + activation_dtype: The dtype to use for activation quantization. + quantize_embedding: Whether to quantize the model's embedding weights. + + Raises: + ValueError: If the activation/weight dtype combination is invalid. 
+ """ + if activation_dtype: + activation_config = FakeQuantizeConfig( + dtype=activation_dtype.value, granularity="per_token", is_symmetric=False + ) + weight_config = FakeQuantizeConfig(dtype=weight_dtype.value, group_size=group_size) + linear_quantize_config = IntXQuantizationAwareTrainingConfig( + activation_config=None if activation_dtype is None else activation_config, + weight_config=weight_config, + ) + quantize_(model, linear_quantize_config) + if quantize_embedding: + # activation fake quantization is not supported for embedding layers + embedding_quantize_config = IntXQuantizationAwareTrainingConfig( + activation_config=None, + weight_config=weight_config, + ) + quantize_( + model, + embedding_quantize_config, + filter_fn=lambda m, _: isinstance(m, torch.nn.Embedding), + ) + + +def quantize_model_for_ptq( + model, + weight_dtype: TorchIntDType, + group_size: int | None = None, + activation_dtype: TorchIntDType | None = None, + quantize_embedding: bool | None = None, +): + """ + This function is used to quantize a model for post-training quantization. + It swaps the model's linear layers with fake quantized linear layers. + If `quantize_embedding` is True, it will also swap the model's embedding weights with fake quantized embedding weights. + + Args: + model: The model to quantize. + weight_dtype: The dtype to use for weight quantization. + group_size: The group size to use for weight quantization. + activation_dtype: The dtype to use for activation quantization. + quantize_embedding: Whether to quantize the model's embedding weights. + + """ + linear_ptq_config = get_ptq_config( + weight_dtype=weight_dtype, + activation_dtype=activation_dtype, + group_size=group_size, + ) + quantize_(model, linear_ptq_config) + if quantize_embedding: + embedding_quantize_config = get_ptq_config( + weight_dtype=weight_dtype, + activation_dtype=None, + group_size=group_size, + ) + quantize_( + model, + embedding_quantize_config, + filter_fn=lambda m, _: isinstance(m, torch.nn.Embedding), + ) + + +def convert_qat_model_for_ptq( + model, + *, + quantize_embedding: bool | None = None, +): + """ + This function is used to convert a swap fake-quantized modules in a model + which has been trained with QAT back to the original modules, ready for PTQ. + + Args: + model: The model to convert. + quantize_embedding: Whether to quantize the model's embedding weights. + """ + if quantize_embedding: + + def filter_fn(m, _): + return isinstance(m, nn.Embedding) or _is_linear(m) + + else: + filter_fn = _is_linear + quantize_(model, FromIntXQuantizationAwareTrainingConfig(), filter_fn=filter_fn) diff --git a/src/axolotl/utils/samplers/multipack.py b/src/axolotl/utils/samplers/multipack.py index 1bfa2ec6e..af62c0a4f 100644 --- a/src/axolotl/utils/samplers/multipack.py +++ b/src/axolotl/utils/samplers/multipack.py @@ -3,8 +3,9 @@ Multipack Batch Sampler - An efficient batch sampler for packing variable-length into fixed-capacity batches to optimize memory usage and training throughput. 
""" -import logging +import gc import math +import time from concurrent.futures import ProcessPoolExecutor from multiprocessing import cpu_count, get_context from typing import Iterable, Iterator, Union @@ -14,9 +15,9 @@ import numpy as np from torch.utils.data import BatchSampler, Sampler, SequentialSampler from axolotl.utils.distributed import reduce_and_broadcast +from axolotl.utils.logging import get_logger -LOG = logging.getLogger(__name__) -LOG.setLevel(logging.INFO) +LOG = get_logger(__name__) @numba.njit @@ -127,7 +128,7 @@ def pack_parallel( bin_size: int, num_processes: int | None = None, safe_mode: bool = True, - mp_start_method: str | None = "spawn", + mp_start_method: str | None = "fork", ) -> list[list[int]]: """Pack sequences into bins using parallel processing. @@ -146,7 +147,7 @@ def pack_parallel( """ num_items = len(sequence_lengths) if num_processes is None: - num_processes = max(1, min(num_items // group_size, cpu_count())) + num_processes = max(1, min(num_items // group_size, cpu_count(), 16)) # Create tasks for parallel processing tasks = [] @@ -259,13 +260,14 @@ class MultipackBatchSampler(BatchSampler): batch_max_len: int, # Maximum sequence length (bin capacity) lengths: np.ndarray, # Sequence lengths packing_efficiency_estimate: float = 1.0, # Initial efficiency estimate - drop_last: bool = False, # Whether to drop final batches (might be incomplete) - num_count_samples: int = 16, # Number of times to estimate batch count + drop_last: bool = True, # Whether to drop final batches (might be incomplete) + num_count_samples: int = 4, # Number of times to estimate batch count sequential: bool = False, # Whether to use sequential packing group_size: int = 100_000, # Size of groups for parallel packing bin_size: int = 200, # The max number of samples that can be packed in a single bin num_processes: int | None = None, # Number of processes for parallel packing safe_mode: bool = True, # Conservative packing to prevent training instability + mp_start_method: str = "fork", **kwargs, # pylint: disable=unused-argument ): super().__init__(sampler, batch_size, drop_last) @@ -278,6 +280,7 @@ class MultipackBatchSampler(BatchSampler): self.bin_size = bin_size self.num_processes = num_processes self.safe_mode = safe_mode + self.mp_start_method = mp_start_method assert isinstance(self.lengths, np.ndarray) @@ -333,13 +336,15 @@ class MultipackBatchSampler(BatchSampler): bins = [[indices[b_idx] for b_idx in bin_indices] for bin_indices in bins] else: # Use parallel packing + num_processes = self.num_processes or 1 all_bins = pack_parallel( lengths, bin_capacity=self.batch_max_len, group_size=self.group_size, bin_size=self.bin_size, - num_processes=self.num_processes, + num_processes=min(4, num_processes) if num_processes else 4, safe_mode=self.safe_mode, + mp_start_method=self.mp_start_method, ) # Map bin indices back to original indices @@ -350,6 +355,7 @@ class MultipackBatchSampler(BatchSampler): # Calculate efficiency statistics total_used = lengths.sum() total_slots = len(all_bins) * self.batch_max_len + del all_bins # Group bins into batches (each batch contains batch_size bins) batches = [ @@ -369,6 +375,7 @@ class MultipackBatchSampler(BatchSampler): self.total_token_slots += total_slots self._batches = batches + gc.collect() return batches def __iter__(self) -> Iterator[list[list[int]]]: @@ -444,10 +451,21 @@ class MultipackBatchSampler(BatchSampler): if self._len_across_ranks is None: # Sample multiple times to get stable estimate - len_batches = min( # pylint: 
disable=consider-using-generator - [len(self._batches) for _ in range(self.num_count_samples)] - ) + _sampled_lens = [] + for _ in range(self.num_count_samples): + self._batches = None # Reset cached batches + # log timer for generating batches + start_time = time.time() + _sampled_lens.append(len(self.generate_batches(set_stats=False))) + LOG.debug(f"generate_batches time: {time.time() - start_time}") + len_batches = min(_sampled_lens) + # Gather minimum across all ranks - self._len_across_ranks = self.gather_len_batches(len_batches) + if self._len_across_ranks is None: + self._len_across_ranks = self.gather_len_batches(len_batches) + else: + self._len_across_ranks = min( + self._len_across_ranks, self.gather_len_batches(len_batches) + ) return self._len_across_ranks diff --git a/src/axolotl/utils/schedulers.py b/src/axolotl/utils/schedulers.py index b550ac02c..cdaf92271 100644 --- a/src/axolotl/utils/schedulers.py +++ b/src/axolotl/utils/schedulers.py @@ -2,7 +2,9 @@ import math from functools import partial +from typing import Sequence +from torch import Tensor from torch.optim import Optimizer from torch.optim.lr_scheduler import LambdaLR, LRScheduler @@ -44,8 +46,10 @@ class RexLR(LRScheduler): # Ensure each parameter group has an "initial_lr" key to avoid issues when resuming. for group in optimizer.param_groups: - group.setdefault("initial_lr", group["lr"]) - + initial_lr = group["lr"] + if isinstance(initial_lr, Tensor): + initial_lr = initial_lr.clone() + group.setdefault("initial_lr", initial_lr) # Pass self.last_step as last_epoch to the parent. super().__init__(optimizer, last_epoch=self.last_step) @@ -292,3 +296,50 @@ def get_cosine_schedule_with_warmup_decay_constant( num_cycles=num_cycles, ) return LambdaLR(optimizer, lr_lambda, last_epoch) + + +class JaggedLRRestartScheduler(LRScheduler): + """Wraps another scheduler to apply per-lora-restart learning rate warmups.""" + + def __init__( + self, + optimizer: Optimizer, + inner_schedule: LRScheduler, + jagged_restart_steps: int, + jagged_restart_warmup_steps: int, + jagged_restart_anneal_steps: int = 1, + min_lr_scale: float = 0.001, + ) -> None: + # pylint: disable=duplicate-code + self.inner_schedule = inner_schedule + self.restarts_steps = jagged_restart_steps + self.warmup_steps = jagged_restart_warmup_steps + self.anneal_steps = jagged_restart_anneal_steps + self.min_lr_scale = min_lr_scale + super().__init__(optimizer, inner_schedule.last_epoch) + + def get_lr(self) -> float | Sequence[float]: + self.inner_schedule.last_epoch = self.last_epoch + + original = self.inner_schedule.get_lr() + step = self.last_epoch + + if step < self.restarts_steps - self.anneal_steps: + scale = 1 + else: + per_restart_progress = step % self.restarts_steps + if per_restart_progress < self.warmup_steps: + cycle_t = min(1.0, (per_restart_progress) / self.warmup_steps) + elif per_restart_progress > (self.restarts_steps - self.anneal_steps): + cycle_t = min( + 1.0, + (self.restarts_steps - per_restart_progress) / self.anneal_steps, + ) + else: + cycle_t = 1 + scale = cycle_t * (1 - self.min_lr_scale) + self.min_lr_scale + + if isinstance(original, Sequence): + return [lr * scale for lr in original] + + return original * scale diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py index a72019872..9756072e8 100644 --- a/src/axolotl/utils/schemas/config.py +++ b/src/axolotl/utils/schemas/config.py @@ -2,8 +2,6 @@ # pylint: disable=too-many-lines -import logging -import os from typing import Annotated, Any, Literal 
from annotated_types import MinLen @@ -13,11 +11,11 @@ from pydantic import ( Field, StringConstraints, field_serializer, - field_validator, model_validator, ) -from transformers.utils.import_utils import is_torch_npu_available +from axolotl.utils.datasets import get_default_process_count +from axolotl.utils.logging import get_logger from axolotl.utils.schemas.datasets import ( DatasetConfig, DPODataset, @@ -44,21 +42,22 @@ from axolotl.utils.schemas.model import ( ) from axolotl.utils.schemas.multimodal import MultiModalConfig from axolotl.utils.schemas.peft import LoraConfig, ReLoRAConfig -from axolotl.utils.schemas.training import HyperparametersConfig +from axolotl.utils.schemas.quantization import PTQConfig, QATConfig +from axolotl.utils.schemas.training import HyperparametersConfig, JaggedLRConfig from axolotl.utils.schemas.trl import TRLConfig +from axolotl.utils.schemas.validation import ValidationMixin from axolotl.utils.schemas.vllm import VllmConfig -LOG = logging.getLogger(__name__) - -SUPPORTED_METRICS = {"sacrebleu", "comet", "ter", "chrf", "perplexity"} +LOG = get_logger(__name__) -# pylint: disable=too-many-public-methods,too-many-ancestors +# pylint: disable=too-many-ancestors class AxolotlInputConfig( ModelInputConfig, ModelOutputConfig, LoraConfig, ReLoRAConfig, + JaggedLRConfig, HyperparametersConfig, WandbConfig, MLFlowConfig, @@ -69,35 +68,93 @@ class AxolotlInputConfig( MultiModalConfig, RemappedParameters, DeprecatedParameters, + ValidationMixin, BaseModel, ): - """Wrapper of all config options""" + """Wrapper of all config options.""" model_config = {"populate_by_name": True} - strict: bool | None = Field(default=False) - resume_from_checkpoint: str | None = None - auto_resume_from_checkpoints: bool | None = None - resize_token_embeddings_to_32x: bool | None = None + strict: bool | None = Field( + default=False, + json_schema_extra={"description": "Allow overwrite yml config using from cli"}, + ) + resume_from_checkpoint: str | None = Field( + default=None, + json_schema_extra={"description": "Resume from a specific checkpoint dir"}, + ) + auto_resume_from_checkpoints: bool | None = Field( + default=None, + json_schema_extra={ + "description": "If resume_from_checkpoint isn't set and you simply want it to start where it left off. Be careful with this being turned on between different models." + }, + ) + resize_token_embeddings_to_32x: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Resize the model embeddings when new tokens are added to multiples of 32. This is reported to improve training speed on some models" + }, + ) mean_resizing_embeddings: bool | None = False # optionally shrink the embeddings when the tokenizer vocab size is smaller - shrink_embeddings: bool | None = None - embeddings_skip_upcast: bool | None = None + shrink_embeddings: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink." + }, + ) + embeddings_skip_upcast: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Don't upcast the embeddings to float32 when using PEFT. 
Useful for low-VRAM GPUs" + }, + ) - rl: RLType | None = None + trainer_cls: str | None = Field( + default=None, + json_schema_extra={ + "description": "module to custom trainer class to use for training" + }, + ) + + rl: RLType | None = Field( + default=None, + json_schema_extra={ + "description": "Use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'" + }, + ) trl: TRLConfig | None = Field( default_factory=lambda: TRLConfig(), # pylint: disable=unnecessary-lambda ) vllm: VllmConfig | None = Field( default_factory=lambda: VllmConfig(), # pylint: disable=unnecessary-lambda ) - reward_model: bool | None = None - process_reward_model: bool | None = None + qat: QATConfig | None = None + quantization: PTQConfig | None = None + reward_model: bool | None = Field( + default=None, + json_schema_extra={"description": "Reward modelling: `True` or `False`"}, + ) + process_reward_model: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Process reward modelling: `True` or `False`" + }, + ) num_labels: int | None = None # Whether to use weighting in DPO trainer. # If `None`, default is `False` in the trainer. - dpo_use_weighting: bool | None = None + dpo_use_weighting: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to perform weighting in DPO trainer" + }, + ) dpo_use_logits_to_keep: bool | None = None + dpo_label_smoothing: float | None = None + dpo_norm_loss: bool | None = None + dpo_padding_free: bool | None = None + dpo_generate_during_eval: bool | None = None datasets: ( Annotated[ @@ -105,7 +162,12 @@ class AxolotlInputConfig( MinLen(1), ] | None - ) = None + ) = Field( + default=None, + json_schema_extra={ + "description": "A list of one or more datasets to finetune the model with" + }, + ) test_datasets: ( Annotated[ @@ -113,22 +175,74 @@ class AxolotlInputConfig( MinLen(1), ] | None - ) = None - shuffle_merged_datasets: bool | None = True - dataset_prepared_path: str | None = None - dataset_shard_num: int | None = None - dataset_shard_idx: int | None = None + ) = Field( + default=None, + json_schema_extra={ + "description": "A list of one or more datasets to eval the model with. You can use either test_datasets, or val_set_size, but not both." + }, + ) + shuffle_merged_datasets: bool | None = Field( + default=True, + json_schema_extra={ + "description": "If false, the datasets will not be shuffled and will keep their original order in `datasets`. The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true." + }, + ) + shuffle_before_merging_datasets: bool | None = Field( + default=False, + json_schema_extra={ + "description": "If true, each dataset in `datasets` will be shuffled before merging. This allows curriculum learning strategies to be applied at the dataset level. Default is false." 
+ }, + ) + dataset_prepared_path: str | None = Field( + default=None, + json_schema_extra={ + "description": "Axolotl attempts to save the dataset as an arrow after packing the data together so subsequent training attempts load faster, relative path" + }, + ) + dataset_shard_num: int | None = Field( + default=None, json_schema_extra={"description": "Num shards for whole dataset"} + ) + dataset_shard_idx: int | None = Field( + default=None, + json_schema_extra={"description": "Index of shard to use for whole dataset"}, + ) skip_prepare_dataset: bool | None = False + num_dataset_shards_to_save: int | None = Field( + default=None, + json_schema_extra={ + "description": "Number of shards to save the prepared dataset" + }, + ) pretraining_dataset: ( Annotated[list[PretrainingDataset | SFTDataset], MinLen(1)] | None ) = Field( default=None, - json_schema_extra={"description": "streaming dataset to use for pretraining"}, + json_schema_extra={ + "description": "Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize" + }, + ) + dataset_processes: int | None = Field( + default=None, + json_schema_extra={ + "description": ( + "The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()` if not set.\n" + "For Runpod VMs, it will default to number of vCPUs via RUNPOD_CPU_COUNT." + ) + }, + ) + dataset_exact_deduplication: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Deduplicates datasets and test_datasets with identical entries" + }, + ) + dataset_keep_in_memory: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Keep dataset in memory while preprocessing. Only needed if cached dataset is taking too much storage" + }, ) - dataset_processes: int | None = Field(default=min(32, os.cpu_count())) # type: ignore[type-var] - dataset_exact_deduplication: bool | None = None - dataset_keep_in_memory: bool | None = None dataloader_pin_memory: bool | None = None dataloader_num_workers: int | None = None dataloader_prefetch_factor: int | None = None @@ -138,58 +252,180 @@ class AxolotlInputConfig( remove_unused_columns: bool | None = None - push_dataset_to_hub: str | None = None - hf_use_auth_token: bool | None = None + push_dataset_to_hub: str | None = Field( + default=None, + json_schema_extra={ + "description": "Push prepared dataset to hub - repo_org/repo_name" + }, + ) + hf_use_auth_token: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets. Required to be true when used in combination with `push_dataset_to_hub`" + }, + ) device: Any | None = None - device_map: Any | None = None + device_map: Any | None = Field( + default=None, + json_schema_extra={ + "description": "Passed through to transformers when loading the model when launched without accelerate. 
Use `sequential` when training w/ model parallelism to limit memory" + }, + ) world_size: int | None = None - local_rank: int | None = None + local_rank: int | None = Field( + default=None, + json_schema_extra={ + "description": "Don't mess with this, it's here for accelerate and torchrun" + }, + ) ddp: bool | None = None - seed: int | None = None - ddp_timeout: int | None = None - ddp_bucket_cap_mb: int | None = None - ddp_broadcast_buffers: bool | None = None + seed: int | None = Field( + default=None, json_schema_extra={"description": "Seed for reproducibility"} + ) + ddp_timeout: int | None = Field( + default=None, + json_schema_extra={"description": "Advanced DDP Arguments - timeout"}, + ) + ddp_bucket_cap_mb: int | None = Field( + default=None, + json_schema_extra={"description": "Advanced DDP Arguments - bucket cap in MB"}, + ) + ddp_broadcast_buffers: bool | None = Field( + default=None, + json_schema_extra={"description": "Advanced DDP Arguments - broadcast buffers"}, + ) ddp_find_unused_parameters: bool | None = None - eval_table_size: int | None = None - eval_max_new_tokens: int | None = None - do_causal_lm_eval: bool | None = None - eval_causal_lm_metrics: list[str] | None = None + eval_table_size: int | None = Field( + default=None, + json_schema_extra={ + "description": "Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0" + }, + ) + eval_max_new_tokens: int | None = Field( + default=None, + json_schema_extra={ + "description": "Total number of tokens generated for predictions sent to wandb. Default is 128" + }, + ) + do_causal_lm_eval: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to run causal language model evaluation for metrics in `eval_causal_lm_metrics`" + }, + ) + eval_causal_lm_metrics: list[str] | None = Field( + default=None, + json_schema_extra={ + "description": "HF evaluate metrics used during evaluation. Default is ['sacrebleu', 'comet', 'ter', 'chrf', 'perplexity']" + }, + ) do_bench_eval: bool | None = None bench_dataset: str | None = None bench_split: str | None = None metric_for_best_model: str | None = None greater_is_better: bool | None = None - loss_watchdog_threshold: float | None = None - loss_watchdog_patience: int | None = None + loss_watchdog_threshold: float | None = Field( + default=None, + json_schema_extra={ + "description": "High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)" + }, + ) + loss_watchdog_patience: int | None = Field( + default=None, + json_schema_extra={ + "description": "Number of high-loss steps in a row before the trainer aborts (default: 3)" + }, + ) - gc_steps: int | None = None + gc_steps: int | None = Field( + default=None, + json_schema_extra={ + "description": "Run garbage collection every `gc_steps` steps. -1 will run on epoch end and before evaluations. Default is 0 (disabled)." + }, + ) - bf16: Literal["auto"] | bool | None = "auto" - fp16: bool | None = None - fp8: bool | None = None - bfloat16: bool | None = None # for non-AMP cases - float16: bool | None = None # for non-AMP cases - tf32: bool | None = None + bf16: Literal["auto"] | bool | None = Field( + default="auto", + json_schema_extra={ + "description": "Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection. 
require >=ampere" + }, + ) + fp16: bool | None = Field( + default=None, json_schema_extra={"description": "Use CUDA fp16"} + ) + fp8: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Enable FP8 mixed precision training using TorchAO. Best " + "used in combination with torch.compile." + }, + ) + fp8_enable_fsdp_float8_all_gather: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Enable FSDP float8 all-gather optimization for FP8 training. Can " + "improve training speed by 10-15% when FSDP is enabled." + }, + ) + bfloat16: bool | None = Field( + default=None, + json_schema_extra={ + "description": "No AMP (automatic mixed precision) - require >=ampere" + }, + ) # for non-AMP cases + float16: bool | None = Field( + default=None, + json_schema_extra={"description": "No AMP (automatic mixed precision)"}, + ) # for non-AMP cases + tf32: bool | None = Field( + default=None, + json_schema_extra={"description": "Use CUDA tf32 - require >=ampere"}, + ) float32: bool | None = None - # torch_dtype: torch.dtype | None - gradient_checkpointing: Literal["offload", "offload_disk"] | bool | None = Field( - default=False + default=False, + json_schema_extra={ + "description": "Whether to use gradient checkpointing. Available options are: true, false, 'offload', 'offload_disk'. https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing" + }, + ) + gradient_checkpointing_kwargs: dict[str, Any] | None = Field( + default=None, + json_schema_extra={ + "description": "Additional kwargs to pass to the trainer for gradient checkpointing" + }, + ) + activation_offloading: Literal["legacy", "disk"] | bool | None = Field( + default=False, + json_schema_extra={ + "description": "Whether to offload activations. Available options are: true, false, 'legacy', 'disk'." + }, ) - gradient_checkpointing_kwargs: dict[str, Any] | None = None unfrozen_parameters: list[str] | None = None +<<<<<<< HEAD sequence_len: int = Field(default=512) sequence_len_overflow_handling: Literal["drop", "truncate"] = Field( default="drop", json_schema_extra={ "description": "How to handle sequences that overflow the sequence_len: 'drop' (remove the sample) or 'truncate' (cut off excess tokens)." +======= + sequence_len: int = Field( + default=512, + json_schema_extra={ + "description": "The maximum length of an input to train with, this should typically be less than 2048 as most models have a token/context limit of 2048" + }, + ) + eval_sequence_len: int | None = Field( + default=None, + json_schema_extra={ + "description": "The maximum length of an input for evaluation. If not specified, defaults to sequence_len" +>>>>>>> origin/main }, ) min_sample_len: int | None = None @@ -197,22 +433,66 @@ class AxolotlInputConfig( default=512, json_schema_extra={"description": "maximum prompt length for RL training"}, ) - sample_packing: bool | None = None - sample_packing_group_size: int | None = 100_000 - sample_packing_bin_size: int | None = 200 - sample_packing_sequentially: bool | None = None - eval_sample_packing: bool | None = None - pad_to_sequence_len: bool | None = None - curriculum_sampling: bool | None = None + sample_packing: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Use efficient multi-packing with block diagonal attention and per sequence position_ids. 
Recommend set to 'true'" + }, + ) + sample_packing_group_size: int | None = Field( + default=100_000, + json_schema_extra={ + "description": "The number of samples packed at a time. Increasing the following values helps with packing, but usually only slightly (<%1.)" + }, + ) + sample_packing_bin_size: int | None = Field( + default=200, + json_schema_extra={ + "description": "The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples." + }, + ) + sample_packing_sequentially: bool | None = Field( + default=None, + json_schema_extra={"description": "Whether to pack samples sequentially"}, + ) + sample_packing_mp_start_method: str | None = Field( + default=None, + json_schema_extra={ + "description": "The multiprocessing start method to use for packing. Should be 'fork', 'spawn' or 'forkserver'" + }, + ) + eval_sample_packing: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Set to 'false' if getting errors during eval with sample_packing on" + }, + ) + pad_to_sequence_len: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Pad inputs so each step uses constant sized buffers. This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently. Defaults to True if `sample_packing` enabled" + }, + ) + curriculum_sampling: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to use sequential sampling for curriculum learning" + }, + ) multipack_real_batches: bool | None = None pretraining_sample_concatenation: bool | None = Field( default=None, json_schema_extra={ - "description": "whether to soft pack/concatenate samples during pretraining", + "description": "whether to concatenate samples during pretraining", }, ) - batch_flattening: Literal["auto"] | bool | None = None + batch_flattening: Literal["auto"] | bool | None = Field( + default=None, + json_schema_extra={ + "description": "Use batch flattening for speedups when not using sample_packing" + }, + ) # for PoSE context length extension use_pose: bool | None = None @@ -228,20 +508,64 @@ class AxolotlInputConfig( }, ) - xformers_attention: bool | None = None - sdp_attention: bool | None = None - s2_attention: bool | None = None + xformers_attention: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to use xformers attention patch https://github.com/facebookresearch/xformers" + }, + ) + sdp_attention: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to use scaled-dot-product attention https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html" + }, + ) + s2_attention: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf" + }, + ) flex_attention: bool | None = None flex_attn_compile_kwargs: dict[str, Any] | None = None - flash_attention: bool | None = None - flash_attn_cross_entropy: bool | None = None - flash_attn_rms_norm: bool | None = None - flash_attn_fuse_qkv: bool | None = None - flash_attn_fuse_mlp: bool | None = None - flash_optimum: bool | None = None + flash_attention: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention" + }, + ) + flash_attn_cross_entropy: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to 
use flash-attention cross entropy implementation - advanced use only" + }, + ) + flash_attn_rms_norm: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to use flash-attention rms norm implementation - advanced use only" + }, + ) + flash_attn_fuse_mlp: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to fuse part of the MLP into a single operation" + }, + ) + flash_optimum: bool | None = Field( + default=None, + json_schema_extra={"description": "Whether to use bettertransformers"}, + ) eager_attention: bool | None = None + attn_implementation: str | None = Field( + default=None, + json_schema_extra={ + "description": "Specify a custom attention implementation, used mostly for kernels." + }, + ) + unsloth_cross_entropy_loss: bool | None = None unsloth_lora_mlp: bool | None = None unsloth_lora_qkv: bool | None = None @@ -249,75 +573,364 @@ class AxolotlInputConfig( unsloth_rms_norm: bool | None = None unsloth_rope: bool | None = None - lora_mlp_kernel: bool | None = None - lora_qkv_kernel: bool | None = None - lora_o_kernel: bool | None = None + lora_mlp_kernel: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Apply custom LoRA autograd functions and activation function Triton kernels for speed and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html" + }, + ) + lora_qkv_kernel: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Apply custom LoRA autograd functions and activation function Triton kernels for speed and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html" + }, + ) + lora_o_kernel: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Apply custom LoRA autograd functions and activation function Triton kernels for speed and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html" + }, + ) + + chunked_cross_entropy: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to use chunked cross entropy loss for memory efficiency" + }, + ) + chunked_cross_entropy_num_chunks: int | None = Field( + default=None, + json_schema_extra={ + "description": "Number of chunks to use for chunked cross entropy loss" + }, + ) + + tiled_mlp: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to use ALST tiled mlp for memory efficient long context" + }, + ) + + tiled_mlp_num_shards: int | None = Field( + default=None, + json_schema_extra={ + "description": "Number of shards to use for ALST tiled mlp. If unset, it will be set based on seqlen/hidden_size" + }, + ) + + tiled_mlp_use_original_mlp: bool | None = Field( + default=True, + json_schema_extra={ + "description": "Whether to use original mlp for ALST tiled mlp. Otherwise uses a generic MLP based on llama." + }, + ) llama4_linearized_experts: bool | None = None - deepspeed: str | dict[str, Any] | None = None - fsdp: list[str] | None = None - fsdp_config: dict[str, Any] | None = None + deepspeed: str | dict[str, Any] | None = Field( + default=None, + json_schema_extra={ + "description": "Deepspeed config path. e.g., deepspeed_configs/zero3.json" + }, + ) + deepcompile: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to use deepcompile for faster training with deepspeed" + }, + ) + fsdp: list[str] | None = Field( + default=None, + json_schema_extra={"description": "FSDP configuration"}, + deprecated="Configuring FSDP using `fsdp` is deprecated. 
Please use `fsdp_config` instead. ", + ) + # TODO @SalmanMohammadi strongly type this as its own schema + fsdp_config: dict[str, Any] | None = Field( + default=None, json_schema_extra={"description": "FSDP configuration options"} + ) + fsdp_version: int | None = Field( + default=None, + json_schema_extra={"description": "FSDP version"}, + ) fsdp_final_state_dict_type: ( Literal["FULL_STATE_DICT", "LOCAL_STATE_DICT", "SHARDED_STATE_DICT"] | None - ) = None + ) = Field( + default=None, + deprecated="Configuring FSDP final state dict type using `fsdp_final_state_dict_type` is deprecated. Please use `fsdp_config.final_state_dict_type` instead.", + ) - val_set_size: float | None = Field(default=0.0) + val_set_size: float | None = Field( + default=0.0, + json_schema_extra={ + "description": "How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval." + }, + ) - sequence_parallel_degree: int | None = None - heads_k_stride: int | None = None - ring_attn_func: RingAttnFunc | None = None + dp_shard_size: int | None = Field( + default=None, + json_schema_extra={ + "description": "Number of devices to shard across. If not set, will use all available devices." + }, + ) + dp_replicate_size: int | None = Field( + default=None, + json_schema_extra={"description": "Number of devices to replicate across."}, + ) + sequence_parallel_degree: int | None = Field( + default=None, + json_schema_extra={ + "description": "Deprecated: use `context_parallel_size` instead" + }, + ) + context_parallel_size: int | None = Field( + default=None, + json_schema_extra={ + "description": "Set to a divisor of the number of GPUs available to split sequences into chunks of equal size. Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more details." + }, + ) + heads_k_stride: int | None = Field( + default=None, + json_schema_extra={ + "description": "Optional; strides across the key dimension. Larger values use more memory but should make training faster. Must evenly divide the number of KV heads in your model." + }, + ) + ring_attn_func: RingAttnFunc | None = Field( + default=None, + json_schema_extra={ + "description": "One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing case." + }, + ) + tensor_parallel_size: int | None = Field( + default=None, + json_schema_extra={ + "description": "Number of tensor parallel processes in TP group. Only supported with DeepSpeed AutoTP." + }, + ) + special_tokens: SpecialTokensConfig | None = Field( + default=None, + json_schema_extra={ + "description": "Add or change special tokens. If you add tokens here, you don't need to add them to the `tokens` list." + }, + ) + tokens: list[str] | None = Field( + default=None, + json_schema_extra={"description": "Add extra tokens to the tokenizer"}, + ) + added_tokens_overrides: dict[int, str] | None = Field( + default=None, + json_schema_extra={ + "description": "Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer. Only works for tokens that are not part of the base vocab (aka are added_tokens). Can be checked if they exist in tokenizer.json added_tokens." 
+ }, + ) - special_tokens: SpecialTokensConfig | None = None - tokens: list[str] | None = None - added_tokens_overrides: dict[int, str] | None = None - - torch_compile: Literal["auto"] | bool | None = None - torch_compile_backend: str | None = None + torch_compile: Literal["auto"] | bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to use torch.compile and which backend to use. setting to `auto` will enable torch compile when torch>=2.6.0" + }, + ) + torch_compile_backend: str | None = Field( + default=None, + json_schema_extra={"description": "Backend to use for torch.compile"}, + ) torch_compile_mode: Literal["default", "reduce-overhead", "max-autotune"] | None = ( None ) - max_steps: int | None = None - warmup_steps: int | None = None - warmup_ratio: float | None = None - eval_steps: int | float | None = None - evals_per_epoch: int | None = None - eval_strategy: str | None = None - save_steps: int | float | None = None - saves_per_epoch: int | None = None - save_strategy: str | None = None - save_total_limit: int | None = None - logging_steps: int | None = None - early_stopping_patience: int | None = None + max_steps: int | None = Field( + default=None, + json_schema_extra={ + "description": "Maximum number of iterations to train for. It precedes num_epochs which means that if both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps" + }, + ) + warmup_steps: int | None = Field( + default=None, + json_schema_extra={ + "description": "Number of warmup steps. Cannot use with warmup_ratio" + }, + ) + warmup_ratio: float | None = Field( + default=None, + json_schema_extra={"description": "Warmup ratio. Cannot use with warmup_steps"}, + ) + eval_steps: int | float | None = Field( + default=None, + json_schema_extra={ + "description": "Leave empty to eval at each epoch, integer for every N steps. float for fraction of total steps" + }, + ) + evals_per_epoch: int | None = Field( + default=None, + json_schema_extra={ + "description": "Number of times per epoch to run evals, mutually exclusive with eval_steps" + }, + ) + eval_strategy: str | None = Field( + default=None, + json_schema_extra={ + "description": "Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer from `eval_steps`" + }, + ) + + save_steps: int | float | None = Field( + default=None, + json_schema_extra={ + "description": "Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps" + }, + ) + saves_per_epoch: int | None = Field( + default=None, + json_schema_extra={ + "description": "Number of times per epoch to save a checkpoint, mutually exclusive with save_steps" + }, + ) + save_strategy: str | None = Field( + default=None, + json_schema_extra={ + "description": "Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better result is achieved, leave empty to infer from `save_steps`" + }, + ) + save_total_limit: int | None = Field( + default=None, json_schema_extra={"description": "Checkpoints saved at a time"} + ) + save_first_step: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to checkpoint a model after the first step of training. Defaults to False." 
+ }, + ) + + logging_steps: int | None = Field( + default=None, json_schema_extra={"description": "Logging frequency"} + ) + early_stopping_patience: int | None = Field( + default=None, + json_schema_extra={ + "description": "Stop training after this many evaluation losses have increased in a row. https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback" + }, + ) load_best_model_at_end: bool | None = False - save_only_model: bool | None = False - use_tensorboard: bool | None = None - profiler_steps: int | None = None - include_tokens_per_second: bool | None = None + save_only_model: bool | None = Field( + default=False, + json_schema_extra={ + "description": "Save only the model weights, skipping the optimizer. Using this means you can't resume from checkpoints." + }, + ) + use_tensorboard: bool | None = Field( + default=None, json_schema_extra={"description": "Use tensorboard for logging"} + ) + profiler_steps: int | None = Field( + default=None, + json_schema_extra={ + "description": "Enable the pytorch profiler to capture the first N steps of training to the output_dir. see https://pytorch.org/blog/understanding-gpu-memory-1/ for more information. Snapshots can be visualized @ https://pytorch.org/memory_viz" + }, + ) + profiler_steps_start: int | None = Field( + default=0, + json_schema_extra={ + "description": "Which step to start the profiler at. Useful for only capturing a few steps mid-run." + }, + ) + include_tokens_per_second: bool | None = Field( + default=None, + json_schema_extra={ + "description": "bool of whether to include tokens trainer per second in the training metrics. This iterates over the entire dataset once, so it takes some time." + }, + ) - neftune_noise_alpha: float | None = None + neftune_noise_alpha: float | None = Field( + default=None, + json_schema_extra={ + "description": "NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings. Currently only supported on Llama and Mistral" + }, + ) - orpo_alpha: float | None = None - rpo_alpha: float | None = None - simpo_gamma: float | None = None - cpo_alpha: float | None = None + orpo_alpha: float | None = Field( + default=None, + json_schema_extra={ + "description": "Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to `beta` in `ORPOConfig` due to trl mapping." 
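For the logging and profiling fields above, a hedged example of a config fragment (step counts are arbitrary):

logging_steps: 10
profiler_steps: 5              # capture a memory snapshot covering 5 steps
profiler_steps_start: 0        # begin profiling at step 0 (the default)
include_tokens_per_second: true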
+ }, + ) + rpo_alpha: float | None = Field( + default=None, + json_schema_extra={ + "description": "Weighting of NLL term in loss from RPO paper" + }, + ) + simpo_gamma: float | None = Field( + default=None, + json_schema_extra={"description": "Target reward margin for the SimPO loss"}, + ) + cpo_alpha: float | None = Field( + default=None, json_schema_extra={"description": "Weight of the BC regularizer"} + ) - kto_desirable_weight: float | None = None - kto_undesirable_weight: float | None = None - rl_beta: float | None = None + kto_desirable_weight: float | None = Field( + default=None, + json_schema_extra={"description": "Factor for desirable loss term in KTO loss"}, + ) + kto_undesirable_weight: float | None = Field( + default=None, + json_schema_extra={ + "description": "Factor for undesirable loss term in KTO loss" + }, + ) + rl_beta: float | None = Field( + default=None, + json_schema_extra={"description": "The beta parameter for the RL training"}, + ) - max_memory: dict[int | Literal["cpu", "disk"], int | str] | None = None - gpu_memory_limit: int | str | None = None - low_cpu_mem_usage: bool | None = None + max_memory: dict[int | Literal["cpu", "disk"], int | str] | None = Field( + default=None, + json_schema_extra={ + "description": "Defines the max memory usage per gpu on the system. Passed through to transformers when loading the model." + }, + ) + gpu_memory_limit: int | str | None = Field( + default=None, + json_schema_extra={ + "description": "Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset" + }, + ) + low_cpu_mem_usage: bool | None = Field( + default=None, + json_schema_extra={"description": "Whether to use low_cpu_mem_usage"}, + ) chat_template: ( ChatTemplate | Annotated[str, StringConstraints(pattern="^tokenizer_default_fallback_")] - ) | None = None - chat_template_jinja: str | None = None - eot_tokens: list[str] | None = None - default_system_message: str | None = None + ) | None = Field( + default=None, + json_schema_extra={ + "description": "The name of the chat template to use for training, following values are supported: tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default value. alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py. tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not available in the tokenizer. jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field. The selected chat template will be saved to the tokenizer_config.json for easier inferencing" + }, + ) + chat_template_jinja: str | None = Field( + default=None, + json_schema_extra={ + "description": "Custom jinja template or path to jinja file for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null." + }, + ) + chat_template_kwargs: dict[str, Any] | None = Field( + default=None, + json_schema_extra={ + "description": "Additional kwargs to pass to the chat template. This is useful for customizing the chat template. 
For example, you can pass `thinking=False` to add a generation prompt to the chat template." + }, + ) + eot_tokens: list[str] | None = Field( + default=None, + json_schema_extra={ + "description": "Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the boundaries between conversation turns. For example: ['/INST', '', '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is useful for templates that use multiple delimiter tokens." + }, + ) + default_system_message: str | None = Field( + default=None, + json_schema_extra={ + "description": "Changes the default system message. Currently only supports chatml." + }, + ) fix_untrained_tokens: int | list[int] | None = None @@ -325,41 +938,50 @@ class AxolotlInputConfig( is_preprocess: bool | None = None preprocess_iterable: bool | None = None - total_num_tokens: int | None = None + total_num_tokens: int | None = Field( + default=None, + json_schema_extra={"description": "Total number of tokens - internal use"}, + ) total_supervised_tokens: int | None = None - sample_packing_eff_est: float | None = None + sample_packing_eff_est: float | None = Field( + default=None, + json_schema_extra={ + "description": "You can set these packing optimizations AFTER starting a training at least once. The trainer will provide recommended values for these values." + }, + ) axolotl_config_path: str | None = None - is_falcon_derived_model: bool | None = Field(default=None) - is_llama_derived_model: bool | None = Field(default=None) - is_mistral_derived_model: bool | None = Field(default=None) - is_qwen_derived_model: bool | None = Field(default=None) + is_falcon_derived_model: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Internal use only - Used to identify what the model is based on" + }, + ) + is_llama_derived_model: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Internal use only - Used to identify what the model is based on" + }, + ) + is_mistral_derived_model: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Internal use only - Used to identify what the model is based on. Please note that if you set this to true, `padding_side` will be set to 'left' by default" + }, + ) + is_qwen_derived_model: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Internal use only - Used to identify what the model is based on" + }, + ) - plugins: list[str] | None = Field(default=None) - - @field_validator("datasets", mode="before") - @classmethod - def deprecate_sharegpt_datasets(cls, datasets): - for _, ds_cfg in enumerate(datasets): - # Handle both dict and pydantic model cases - ds_type = ( - ds_cfg.get("type") - if isinstance(ds_cfg, dict) - else getattr(ds_cfg, "type", None) - ) - if not ds_type: - continue - - # skip if it's a dict (for custom user instruction prompt) - if isinstance(ds_type, dict): - continue - - if isinstance(ds_type, str) and ds_type.startswith("sharegpt"): - raise ValueError( - "`type: sharegpt.*` is deprecated. Please use `type: chat_template` instead." - ) - - return datasets + plugins: list[str] | None = Field( + default=None, + json_schema_extra={ + "description": "Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available plugins or doc below for more details.
https://docs.axolotl.ai/docs/custom_integrations.html" + }, + ) @field_serializer("datasets") def datasets_serializer( @@ -369,897 +991,9 @@ class AxolotlInputConfig( return [ds_config.model_dump(exclude_none=True) for ds_config in ds_configs] return None - @model_validator(mode="before") - @classmethod - def check_attention_fields(cls, data): - fields = ( - "xformers_attention", - "sdp_attention", - "s2_attention", - "flash_attention", - "flex_attention", - ) - non_empty_count = sum(1 for field in fields if data.get(field)) - - if non_empty_count > 1: - raise ValueError(f"Only one of {', '.join(fields)} must be set") - return data - - @model_validator(mode="before") - @classmethod - def check_batch_size_fields(cls, data): - fields = ("micro_batch_size", "gradient_accumulation_steps", "batch_size") - non_empty_count = sum(1 for field in fields if data.get(field)) - - if non_empty_count < 2: - raise ValueError(f"At least two of {', '.join(fields)} must be set") - return data - - @model_validator(mode="before") - @classmethod - def check_pretraining_w_max_steps(cls, data): - if data.get("pretraining_dataset") and not data.get("max_steps"): - raise ValueError( - "max_steps must be set when using iterable pretraining_dataset, Trainer can't infer length and schedule optimizer/learning rate without it!" - ) - return data - - @model_validator(mode="before") - @classmethod - def check_pretraining_w_group_by_length(cls, data): - if data.get("pretraining_dataset") and data.get("group_by_length"): - LOG.warning( - "You probably want to disable group_by_length as it will force a streamed dataset to download completely." - ) - return data - - @model_validator(mode="before") - @classmethod - def check_pretraining_split_batches_accelerate(cls, data): - # alternatively set ACCELERATE_SPLIT_BATCHES=False - if data.get("pretraining_dataset"): - accelerator_config = data.get("accelerator_config", {}) - if not accelerator_config: - data["accelerator_config"] = { - "split_batches": False, - "dispatch_batches": False, - } - else: - if accelerator_config.get("split_batches") is None: - data["accelerator_config"]["split_batches"] = False - if accelerator_config.get("dispatch_batches") is None: - data["accelerator_config"]["dispatch_batches"] = False - return data - - @model_validator(mode="before") - @classmethod - def check_gptq_w_revision(cls, data): - if data.get("gptq") and data.get("revision_of_model"): - raise ValueError( - "revision_of_model is not supported for GPTQ models. " - + "Please download the model from HuggingFace Hub manually for correct branch, " - + "point to its path, and remove revision_of_model from the config." 
- ) - return data - - @model_validator(mode="before") - @classmethod - # pylint: disable=duplicate-code - def check_chat_template_config(cls, data): - # if chat_template is set to jinja, chat_template_jinja is required - if data.get("chat_template") == ChatTemplate.jinja and not data.get( - "chat_template_jinja" - ): - raise ValueError( - "chat_template_jinja is required when chat_template is set to jinja" - ) - - # If chat_template_jinja is set, set chat_template to jinja - if data.get("chat_template_jinja") and not data.get("chat_template"): - data["chat_template"] = ChatTemplate.jinja - - return data - - @model_validator(mode="before") - @classmethod - def check_sample_packing_wo_flash(cls, data): - if ( - data.get("sample_packing") - and not data.get("flash_attention") - and not data.get("sdp_attention") - and not data.get("flex_attention") - and not data.get("xformers_attention") - ): - LOG.warning( - "sample_packing without flash, sdp, xformers or flex attention does not handle cross sample decontamination." - ) - - return data - - @model_validator(mode="before") - @classmethod - def check_sample_packing_with_s2attn(cls, data): - if data.get("sample_packing") and data.get("s2_attention"): - raise ValueError( - "Received `sample_packing=true` and `s2_attention=true`; however, \ - shifted-sparse attention does not currently support sample packing." - ) - return data - - @model_validator(mode="before") - @classmethod - def check_batch_flattening_fa(cls, data): - if data.get("batch_flattening"): - batch_flattening_auto = data.get("batch_flattening") == "auto" - if not data.get("flash_attention") and not batch_flattening_auto: - raise ValueError("batch_flattening requires flash attention") - if data.get("sample_packing") and not batch_flattening_auto: - raise ValueError("batch_flattening not compatible with sample_packing") - if data.get("micro_batch_size") == 1 and not batch_flattening_auto: - LOG.warning("batch_flattening has no effect with micro_batch_size == 1") - - if ( - batch_flattening_auto - and data.get("flash_attention") - and not data.get("sample_packing") - and data.get("micro_batch_size") > 1 - ): - data["batch_flattening"] = True - elif batch_flattening_auto: - data["batch_flattening"] = False - - return data - - @model_validator(mode="before") - @classmethod - def check_sample_packing_w_rl(cls, data): - if data.get("sample_packing") and data.get("rl"): - raise ValueError("`sample_packing: true` does not work with RLHF training") - return data - - @model_validator(mode="before") - @classmethod - def hint_sample_packing_padding(cls, data): - if data.get("sample_packing"): - pad_to_sequence_len = data.get("pad_to_sequence_len") - if pad_to_sequence_len is False: - LOG.warning( - "`pad_to_sequence_len: true` is recommended when using sample_packing" - ) - elif pad_to_sequence_len is None: - LOG.info( - "Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing" - ) - data["pad_to_sequence_len"] = True - return data - - @model_validator(mode="before") - @classmethod - def hint_reward_model_pad(cls, data): - if data.get("reward_model") and not data.get("pad_to_sequence_len"): - LOG.warning( - "`pad_to_sequence_len: true` is recommended when using reward_model" - ) - if data.get("pad_to_sequence_len") is None: - data["pad_to_sequence_len"] = True - return data - - @model_validator(mode="before") - @classmethod - def check_gas_bsz(cls, data): - if data.get("gradient_accumulation_steps") and data.get("batch_size"): - raise ValueError( - "please set only one of 
gradient_accumulation_steps or batch_size" - ) - return data - - @model_validator(mode="before") - @classmethod - def hint_eval_train_mbsz(cls, data): - if ( - data.get("eval_batch_size") - and data.get("micro_batch_size") - and data.get("eval_batch_size") != data.get("micro_batch_size") - ): - LOG.warning( - "eval_batch_size != micro_batch_size. This can lead to VRAM instability." - ) - return data - - @model_validator(mode="before") - @classmethod - def check_push_ds_auth(cls, data): - if ( - data.get("push_dataset_to_hub") - and data.get("hf_use_auth_token") is not True - ): - raise ValueError( - "Require cfg.hf_use_auth_token to be True for push_dataset_to_hub" - ) - return data - - @model_validator(mode="after") - def check_falcon_fsdp(self): - if (self.base_model and "falcon" in self.base_model.lower()) and self.fsdp: - raise ValueError("FSDP is not supported for falcon models") - return self - - @model_validator(mode="after") - def check_mpt_checkpointing(self): - if ( - self.base_model and "mpt" in self.base_model.lower() - ) and self.gradient_checkpointing: - raise ValueError("gradient_checkpointing is not supported for MPT models") - return self - - @model_validator(mode="after") - def check_offload_grad_checkpointing(self): - if self.gradient_checkpointing and self.gradient_checkpointing == "unsloth": - LOG.warning( - "`unsloth` is deprecated for gradient_checkpointing, use `offload`" - ) - self.gradient_checkpointing = "offload" - return self - - @model_validator(mode="after") - def check_better_transformers(self): - if self.flash_optimum is True: - if self.adapter: - LOG.warning( - "BetterTransformers probably doesn't work with PEFT adapters" - ) - if self.fp16 or self.bf16: - raise ValueError("AMP is not supported with BetterTransformer") - if self.float16 is not True and self.bfloat16 is not True: - LOG.warning( - "You should probably set bfloat16 or float16 to true to " - "load the model in float16 for BetterTransformers" - ) - return self - - @model_validator(mode="after") - def check_adamw_optimizer_params(self): - if any([self.adam_beta1, self.adam_beta2, self.adam_epsilon]) and ( - not self.optimizer or "adamw" not in str(self.optimizer).lower() - ): - LOG.warning("adamw hyperparameters found, but no adamw optimizer set") - return self - - @model_validator(mode="before") - @classmethod - def check_lr_groups(cls, data): - if data.get("lr_groups") and data.get("loraplus_lr_ratio"): - raise ValueError("lr_groups and loraplus_lr_ratio cannot be used together.") - return data - - @model_validator(mode="before") - @classmethod - def check_saves(cls, data): - if ( - data.get("save_strategy") - and data.get("save_steps") - and data.get("save_strategy") != "steps" - ): - raise ValueError( - "save_strategy and save_steps mismatch. Please set save_strategy to 'steps' or remove save_steps." - ) - if data.get("saves_per_epoch") and data.get("save_steps"): - raise ValueError( - "save_steps and saves_per_epoch are mutually exclusive and cannot be used together." - ) - return data - - @model_validator(mode="before") - @classmethod - def check_push_save(cls, data): - if data.get("hub_model_id") and ( - data.get("save_strategy") not in ["steps", "epoch", None] - ): - LOG.warning( - "hub_model_id is set without any models being saved. To save a model, set save_strategy." 
- ) - return data - - @model_validator(mode="before") - @classmethod - def check_evals(cls, data): - if ( - data.get("eval_strategy") - and data.get("eval_steps") - and data.get("eval_strategy") != "steps" - ): - raise ValueError( - "eval_strategy and eval_steps mismatch. Please set eval_strategy to 'steps' or remove eval_steps." - ) - - if ( - data.get("val_set_size") == 0 - and (data.get("eval_steps") or data.get("eval_strategy")) - and not data.get("test_datasets") - and data.get("eval_strategy") != "no" - ): - raise ValueError( - "eval_steps and eval_strategy are not supported with val_set_size == 0" - ) - if data.get("evals_per_epoch") and data.get("eval_steps"): - raise ValueError( - "eval_steps and evals_per_epoch are mutually exclusive and cannot be used together." - ) - if ( - data.get("evals_per_epoch") - and data.get("eval_strategy") - and data.get("eval_strategy") != "steps" - ): - raise ValueError( - "eval_strategy must be empty or set to `steps` when used with evals_per_epoch." - ) - - if data.get("do_bench_eval") and not ( - data.get("evals_per_epoch") or data.get("eval_steps") - ): - raise ValueError( - "do_bench_eval requires evals_per_epoch or eval_steps to be set." - ) - return data - - @model_validator(mode="before") - @classmethod - def check_test_datasets_bench(cls, data): - if ( - data.get("do_bench_eval") - and not data.get("test_datasets") - and not data.get("val_set_size") - ): - LOG.warning( - "`do_bench_eval` needs a test dataset to run evals, adding an empty test_dataset." - ) - data["test_datasets"] = [{"path": "axolotl-ai-co/empty-test-ds"}] - return data - - @model_validator(mode="before") - @classmethod - def check_eval_packing(cls, data): - # TODO also should check test_datasets and val_set_size as we can skip - # if there are no eval datasets/splits - if ( - data.get("sample_packing") - and data.get("eval_table_size") - and data.get("eval_sample_packing") is not False - ): - raise ValueError( - "eval_table_size and eval_sample_packing are not supported together with sample_packing. Please set 'eval_sample_packing' to false." 
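A small YAML sketch consistent with the evaluation constraints spelled out in the validator messages above (values are illustrative): eval_steps requires a non-zero val_set_size or test_datasets, and eval_strategy must be 'steps' when eval_steps is set.

val_set_size: 0.02
eval_strategy: steps
eval_steps: 200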
- ) - if ( - data.get("sample_packing") - and data.get("eval_sample_packing") is None - and not data.get("eval_table_size") - ): - LOG.info( - "explicitly setting `eval_sample_packing` to match `sample_packing`" - ) - data["eval_sample_packing"] = True - - if ( - data.get("sample_packing") - and data.get("eval_sample_packing") is False - and data.get("remove_unused_columns") is None - ): - LOG.info( - "setting `remove_unused_columns: false` for when sample_packing and eval_sample_packing don't match" - ) - data["remove_unused_columns"] = False - - return data - - @model_validator(mode="before") - @classmethod - def check_mm_prepare(cls, data): - if data.get("skip_prepare_dataset"): - if data.get("remove_unused_columns") is None: - LOG.info( - "setting `remove_unused_columns: false` for skip_prepare_dataset" - ) - data["remove_unused_columns"] = False - - return data - - @model_validator(mode="before") - @classmethod - def check_warmup(cls, data): - if data.get("warmup_steps") and data.get("warmup_ratio"): - raise ValueError("warmup_steps and warmup_ratio are mutually exclusive") - return data - - @model_validator(mode="before") - @classmethod - def check_neftune(cls, data): - if data.get("noisy_embedding_alpha") and not data.get("neftune_noise_alpha"): - data["neftune_noise_alpha"] = data["noisy_embedding_alpha"] - del data["noisy_embedding_alpha"] - elif data.get("noisy_embedding_alpha") and not data.get("neftune_noise_alpha"): - raise ValueError( - "noisy_embedding_alpha is deprecated, use neftune_noise_alpha; both are set, please remove the deprecated noisy_embedding_alpha setting" - ) - return data - - @field_validator("neftune_noise_alpha") - @classmethod - def validate_neftune_noise_alpha(cls, neftune_noise_alpha): - if neftune_noise_alpha is not None and neftune_noise_alpha <= 0.0: - raise ValueError("neftune_noise_alpha must be > 0.0") - return neftune_noise_alpha - - @model_validator(mode="after") - def check_rl_beta(self): - if self.dpo_beta and not self.rl_beta: - self.rl_beta = self.dpo_beta - del self.dpo_beta - return self - - @model_validator(mode="after") - def check_simpo_warmup(self): - if self.rl is RLType.SIMPO and self.warmup_ratio: - raise ValueError( - "warmup_ratio is not supported with the simpo trainer. Please use `warmup_steps` instead" - ) - return self - - @model_validator(mode="before") - @classmethod - def check_frozen(cls, data): - if ( - data.get("adapter") - and data.get("peft_layers_to_transform") - and data.get("unfrozen_parameters") - ): - raise ValueError( - "`unfrozen_parameters` used with `peft_layers_to_transform` can have unexpected behavior." - ) - - return data - - @model_validator(mode="before") - @classmethod - def check_peft_layers_pattern(cls, data): - if data.get("peft_layers_pattern") and not data.get("peft_layers_to_transform"): - raise ValueError( - "peft_layers_pattern requires peft_layers_to_transform to be set" - ) - return data - - @model_validator(mode="after") - def check_fft_possible_bad_config(self): - if ( - # pylint: disable=too-many-boolean-expressions - not (self.bf16 or self.bfloat16) - and (self.fp16 or self.float16) - and not self.adapter - and not self.flash_attention - and self.sample_packing - ): - LOG.warning( - "Full fine tune w/o FA2 w/ sample packing and fp16/float16 is likely to raise errors. Try LoRA." - ) - # ValueError: Attempting to unscale FP16 gradients. 
- # OR - # RuntimeError: expected mat1 and mat2 to have the same dtype, but got: float != c10::Half - return self - - @model_validator(mode="after") - def check_fused_lora(self): - if self.adapter in ["lora", "qlora"] and ( - self.flash_attn_fuse_qkv or self.flash_attn_fuse_mlp - ): - raise ValueError("Fused modules are not supported with LoRA/QLoRA") - return self - - @model_validator(mode="after") - def hint_lora_8bit(self): - loftq = ( - self.peft and self.peft.loftq_config and self.peft.loftq_config.loftq_bits - ) - if not self.load_in_8bit and self.adapter == "lora" and not loftq: - LOG.warning("We recommend setting `load_in_8bit: true` for LORA finetuning") - return self - - @model_validator(mode="after") - def check_early_stopping(self): - if self.early_stopping_patience: - if not self.save_steps or not self.eval_steps: - raise ValueError( - "`early_stopping_patience` requires save_steps and eval_steps to be set. eval_steps should evenly divide save_steps." - ) - if self.save_steps % self.eval_steps != 0: - raise ValueError( - "`early_stopping_patience` requires that eval_steps should evenly divide save_steps." - ) - return self - - @model_validator(mode="after") - def check_relora(self): - if self.relora_steps: - if self.adapter not in ("lora", "qlora"): - raise ValueError("cfg.adapter must be lora or qlora to use ReLoRA") - - if self.fsdp: - raise ValueError("fsdp not supported with ReLoRA") - - if self.deepspeed: - raise ValueError("deepspeed not supported with ReLoRA") - - if self.lr_scheduler == "one_cycle": - raise ValueError( - "ReLoRA is not compatible with the one_cycle scheduler" - ) - - if self.flash_attn_fuse_qkv or self.flash_attn_fuse_mlp: - raise ValueError("Fused modules are not supported with ReLoRA") - return self - - @model_validator(mode="before") - @classmethod - def check_mem_mismatch(cls, data): - if ( - data.get("max_memory") is not None - and data.get("gpu_memory_limit") is not None - ): - raise ValueError( - "max_memory and gpu_memory_limit are mutually exclusive and cannot be used together." - ) - return data - - @model_validator(mode="before") - @classmethod - def check_use_reentrant_mismatch(cls, data): - if ( - data.get("unfrozen_parameters") - and data.get("gradient_checkpointing_kwargs") - and data.get("gradient_checkpointing_kwargs", {}).get("use_reentrant") - is True - ): - # https://github.com/huggingface/transformers/issues/21381 - raise ValueError( - "`use_reentrant` must be false when used with partially frozen model." - ) - return data - - @model_validator(mode="before") - @classmethod - def warn_qlora_zero3_w_use_reentrant(cls, data): - if ( - data.get("adapter") == "qlora" - and data.get("gradient_checkpointing_kwargs", {}) - and data.get("gradient_checkpointing_kwargs", {}).get("use_reentrant") - is False - and data.get("deepspeed", "") is not None - and "zero3" in data.get("deepspeed", "") - ): - # may result in: - # torch.utils.checkpoint.CheckpointError: torch.utils.checkpoint: - # Recomputed values for the following tensors have different metadata - # than during the forward pass. 
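Following the early-stopping rule quoted above (save_steps and eval_steps must both be set, and eval_steps must evenly divide save_steps), an illustrative fragment might look like:

eval_steps: 100
save_steps: 200              # a multiple of eval_steps, as required
early_stopping_patience: 3   # stop after 3 consecutive worsening eval losses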
- LOG.warning( - "qlora + zero3 with use_reentrant: false may result in a CheckpointError about recomputed values" - ) - return data - - @model_validator(mode="before") - @classmethod - def check_val_w_test_datasets(cls, data): - if data.get("test_datasets") and data.get("val_set_size"): - raise ValueError( - "non-zero val_set_size should not be used with test_datasets configuration" - ) - return data - - @model_validator(mode="before") - @classmethod - def check_eval_strategy(cls, data): - if ( - data.get("evaluation_strategy") is not None - and data.get("eval_strategy") is None - ): - LOG.info( - "explicitly setting `eval_strategy` from the `evaluation_strategy`" - ) - data["eval_strategy"] = data.get("evaluation_strategy") - return data - - @model_validator(mode="before") - @classmethod - def check_fsdp_offload_w_8bit_optimizer(cls, data): - if ( - data.get("fsdp") - and "8bit" in data.get("optimizer", "") - and data.get("fsdp_config") - and data["fsdp_config"].get("fsdp_offload_params") - and str(data["fsdp_config"].get("fsdp_version")) != "2" - ): - raise ValueError( - f"FSDP Offload not compatible with {data.get('optimizer')}" - ) - if ( - data.get("fsdp") - and "8bit" in data.get("optimizer", "") - and data.get("fsdp_config") - and str(data["fsdp_config"].get("fsdp_version")) == "2" - ): - if data.get("optimizer", "") in ["adamw_8bit", "adamw_bnb_8bit"]: - # CUDA ops errors with bnb 8bit optimizer + FSDP2 - raise ValueError( - f"FSDP2 not compatible with {data.get('optimizer')}, use `adamw_torch_8bit` instead" - ) - - return data - - @model_validator(mode="before") - @classmethod - def check_fsdp_sharded_state_dict_w_safetensors(cls, data): - if ( - data.get("fsdp") - and data.get("save_safetensors") - and data.get("fsdp_config") - and data["fsdp_config"].get("fsdp_state_dict_type") == "SHARDED_STATE_DICT" - ): - raise ValueError( - "FSDP SHARDED_STATE_DICT not compatible with save_safetensors" - ) - return data - - @model_validator(mode="before") - @classmethod - def check_causal_lm_evals(cls, data): - if data.get("do_causal_lm_eval") and data.get("eval_sample_packing"): - raise ValueError( - "do_causal_lm_eval is enabled, eval_sample_packing must be set to False" - ) - - if data.get("eval_causal_lm_metrics"): - if not isinstance(data.get("eval_causal_lm_metrics"), list): - raise ValueError("eval_causal_lm_metrics must be a list") - # only ["sacrebleu", "comet", "ter", "chrf"] supported - if set(data.get("eval_causal_lm_metrics")) - SUPPORTED_METRICS: - raise ValueError( - f"eval_causal_lm_metrics must be one of {SUPPORTED_METRICS}" - ) - return data - - @model_validator(mode="before") - @classmethod - def check_dataset_or_pretraining_dataset(cls, data): - if data.get("datasets") is None and data.get("pretraining_dataset") is None: - raise ValueError("either datasets or pretraining_dataset is required") - return data - - @model_validator(mode="before") - @classmethod - def check_xentropy_patch_conflicts(cls, data): - if data.get("flash_attn_cross_entropy") and data.get( - "unsloth_cross_entropy_loss" - ): - raise ValueError( - "flash_attn_cross_entropy and unsloth_cross_entropy_loss cannot be both enabled" - ) - return data - - @model_validator(mode="before") - @classmethod - def check_qlora_unsloth(cls, data): - if ( - data.get("unsloth_lora_mlp") - or data.get("unsloth_lora_qkv") - or data.get("unsloth_lora_o") - ): - if data.get("adapter") == "lora" and data.get("load_in_8bit"): - raise ValueError( - "unsloth_lora_mlp, unsloth_lora_qkv, and unsloth_lora_o are not compatible with 
8-bit LoRA" - ) - return data - - @model_validator(mode="before") - @classmethod - def check_lora_8bit(cls, data): - if ( - data.get("lora_mlp_kernel") - or data.get("lora_qkv_kernel") - or data.get("lora_o_kernel") - ): - if data.get("adapter") == "lora" and data.get("load_in_8bit"): - raise ValueError( - "lora_mlp_kernel, lora_mlp_kernel, and lora_mlp_kernel are not compatible with 8-bit LoRA" - ) - return data - - @model_validator(mode="before") - @classmethod - def check_lora_axolotl_unsloth(cls, data): - is_lora_kernel = any( - data.get(k) for k in ["lora_mlp_kernel", "lora_qkv_kernel", "lora_o_kernel"] - ) - is_unsloth_lora = any( - data.get(k) - for k in ["unsloth_lora_mlp", "unsloth_lora_qkv", "unsloth_lora_o"] - ) - if is_lora_kernel and is_unsloth_lora: - raise ValueError( - "both lora_mlp_kernel and unsloth_lora_mlp cannot be true (similarly for lora_qkv_kernel, lora_o_kernel)" - ) - return data - - @model_validator(mode="before") - @classmethod - def check_torch_compile_deepspeed(cls, data): - if data.get("deepspeed") and data.get("torch_compile"): - raise ValueError( - "torch_compile should be set within your deepspeed config file" - ) - return data - - @model_validator(mode="before") - @classmethod - def check_npu_config(cls, data): - if is_torch_npu_available(): - # check attention config - attn_list = ["flash_attention", "sdp_attention", "s2_attention"] - for attn in attn_list: - if data.get(attn): - raise NotImplementedError( - f"{attn} is currently not supported in Ascend npu, please disable this configuration." - ) - - # check quant config - if data.get("optimizer") is not None and "bit" in data.get("optimizer"): - optimizer = data.get("optimizer") - raise NotImplementedError( - f"{optimizer} is currently not supported in Ascend npu, choose another one please." - ) - - quant_list = ["load_in_8bit", "load_in_4bit"] - for quant in quant_list: - if data.get(quant): - raise NotImplementedError( - f"Quantification is currently not supported in Ascend npu, please disable {quant}." - ) - - # check dtype config - if data.get("tf32"): - raise NotImplementedError( - "tf32 dtype is currently not supported in Ascend npu, please disable this configuration" - ) - - return data - - @model_validator(mode="before") - @classmethod - def check_rl_config_gradient_checkpointing(cls, data): - # TODO: SalmanMohammadi - # Distributed RL with QLoRA + gradient checkpointing - # and use_reentrant = True is broken upstream in TRL - # pylint: disable=too-many-boolean-expressions - if ( - data.get("rl") - and data.get("gradient_checkpointing") - and data.get("gradient_checkpointing_kwargs") - and data.get("gradient_checkpointing_kwargs").get("use_reentrant") - and data.get("load_in_4bit") - and data.get("adapter") == "qlora" - and data.get("capabilities") - and data.get("capabilities").get("n_gpu", 1) > 1 - ): - raise ValueError( - "The `use_reentrant: True` implementation of gradient checkpointing " - "is not supported for distributed RL training with QLoRA. Please set " - "`use_reentrant: False` in `gradient_checkpointing_kwargs`." 
- ) - return data - - @model_validator(mode="before") - @classmethod - def check_kto_config(cls, data): - if data.get("rl") == "kto": - if data.get("sample_packing") or data.get("eval_sample_packing"): - raise ValueError("sample_packing is not supported with kto") - - if data.get("remove_unused_columns") is not False: - raise ValueError("Set `remove_unused_columns: False` when using kto") - - return data - - # @model_validator(mode="before") - # @classmethod - # def check_grpo_peft_liger(cls, data): - # if ( - # data.get("rl") == "grpo" - # and data.get("trl", {}) - # and data.get("trl").get("use_liger_loss") - # and data.get("adapter") - # ): - # raise ValueError("PEFT + GRPO + Liger is not yet supported") - # return data - # - @model_validator(mode="before") - @classmethod - def check_grpo_liger_sequence_parallel(cls, data): - if ( - data.get("rl") == "grpo" - and data.get("trl", {}) - and data.get("trl").get("use_liger_loss") - and data.get("sequence_parallel_degree", 1) > 1 - ): - raise ValueError("GRPO + SP + Liger not currently supported") - return data - - @model_validator(mode="after") - def check_sequence_parallel_degree(self): - if not self.sequence_parallel_degree: - self.sequence_parallel_degree = 1 - elif self.sequence_parallel_degree > 1: - if not self.flash_attention: - raise ValueError( - "flash_attention: true must be set with sequence_parallel_degree > 1" - ) - - if self.sample_packing and self.micro_batch_size > 1: - raise ValueError( - "micro_batch_size must be set to 1 when sample_packing is enabled " - "due to a `ring-flash-attn` requirement" - ) - - try: - import ring_flash_attn # noqa: F401 # pylint:disable=unused-import - except ImportError as exception: - raise ImportError( - "sequence_parallel_degree > 1 but ring_flash_attn is not installed. " - "Please install it with `pip install axolotl[ring-flash-attn] " - "or `pip install ring-flash-attn>=0.1.4`." - ) from exception - - # TODO: monkeypatch / callback to average losses correctly across SP ranks - # / fix gradient scaling across SP ranks. Losses, grads should be scaled - # according to the proportion of non-padding tokens per rank. - LOG.warning( - "Sequence parallelism (SP) is enabled with " - f"sequence_parallel_degree={self.sequence_parallel_degree}. " - "Please note that logged losses may differ slightly to the non-SP " - "losses due to transformers Trainer implementation details. " - "Please see https://github.com/axolotl-ai-cloud/axolotl/pull/2495#issuecomment-2784022042 " - "for more details." 
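The KTO checks above translate to a config along these lines (a sketch, assuming a preference dataset is configured elsewhere):

rl: kto
sample_packing: false          # sample packing is not supported with KTO
remove_unused_columns: false   # required when using KTO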
- ) - - return self - - @model_validator(mode="after") - def validate_ring_attn_func(self): - if getattr(self, "sequence_parallel_degree", 1) == 1: - return self - - if self.ring_attn_func is not None: - self.ring_attn_func = RingAttnFunc(self.ring_attn_func) - else: - # Default ring attention function selection - sample_packing = getattr(self, "sample_packing", False) - self.ring_attn_func = ( - RingAttnFunc.VARLEN_LLAMA3 - if sample_packing - else RingAttnFunc.BATCH_RING - ) - - return self - - @model_validator(mode="before") - @classmethod - def check_muon_deepspeed_fsdp(cls, data): - if data.get("optimizer") == "muon" and ( - data.get("deepspeed") or data.get("fsdp") or data.get("fsdp_config") - ): - raise ValueError( - "Muon optimizer is currently incompatible with DeepSpeed and FSDP" - ) - return data - class AxolotlConfigWCapabilities(AxolotlInputConfig): - """wrapper to valdiate gpu capabilities with the configured options""" + """wrapper to validate GPU capabilities with the configured options""" capabilities: GPUCapabilities env_capabilities: EnvCapabilities @@ -1303,13 +1037,7 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig): return data - @model_validator(mode="before") - @classmethod - def check_fsdp_deepspeed(cls, data): - if data.get("deepspeed") and data.get("fsdp"): - raise ValueError("deepspeed and fsdp cannot be used together.") - return data - + # pylint: disable=duplicate-code @model_validator(mode="before") @classmethod def check_multigpu_unsloth(cls, data): @@ -1325,6 +1053,7 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig): ) return data + # pylint: disable=duplicate-code @model_validator(mode="before") @classmethod def check_multigpu_lora_kernels(cls, data): @@ -1334,11 +1063,9 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig): or data.get("lora_o_kernel") ): capabilities = data.get("capabilities") - is_fsdp = data.get("fsdp") is not None - is_fsdp2 = ( - data.get("fsdp_config") is not None - and str(data.get("fsdp_config").get("fsdp_version")) == "2" - ) + is_fsdp = data.get("fsdp_config") is not None + is_fsdp2 = is_fsdp and str(data.get("fsdp_version")) == "2" + if capabilities and capabilities.get("n_gpu", 0) > 1 and not is_fsdp2: if is_fsdp: raise ValueError( @@ -1372,11 +1099,8 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig): # Check multi-GPU compatibility capabilities = data.get("capabilities") is_multi_gpu = capabilities and capabilities.get("n_gpu", 0) > 1 - is_fsdp = data.get("fsdp") is not None - is_fsdp2 = ( - data.get("fsdp_config") is not None - and str(data.get("fsdp_config").get("fsdp_version")) == "2" - ) + is_fsdp = data.get("fsdp_config") is not None + is_fsdp2 = is_fsdp and str(data.get("fsdp_version")) == "2" if ( not is_multi_gpu @@ -1468,9 +1192,77 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig): def check_min_torch_version(self): if self.env_capabilities and self.env_capabilities.torch_version: torch_version = self.env_capabilities.torch_version - if version.parse(torch_version) < version.parse("2.5.1"): + if version.parse(torch_version) < version.parse("2.6.0"): LOG.warning( - f"torch=={torch_version} may not be supported in future versions. Please consider upgrading to torch>=2.5.1." + f"torch=={torch_version} is not supported. Please upgrade to torch>=2.6.0."
) return self + + @model_validator(mode="before") + @classmethod + def check_qat_config(cls, data): + qat_cfg = data.get("qat", {}) + if not qat_cfg: + return data + + if data.get("peft"): + raise ValueError("QAT and PEFT cannot be used together.") + + if data.get("load_in_8bit"): + raise ValueError("QAT and load_in_8bit cannot be used together.") + + if data.get("load_in_4bit"): + raise ValueError("QAT and load_in_4bit cannot be used together.") + + env_capabilities = data.get("env_capabilities", {}) + torch_version = env_capabilities.get("torch_version") + + if torch_version is None: + import torch + + torch_version = str(torch.__version__).split("+", maxsplit=1)[0] + + if version.parse(torch_version) < version.parse("2.6.0"): + raise ValueError("QAT is not supported on torch version < 2.6.0") + + return data + + @model_validator(mode="before") + @classmethod + def check_fsdp_torch_version(cls, data): + env_capabilities = data.get("env_capabilities", {}) + torch_version = env_capabilities.get("torch_version") + + if torch_version is None: + import torch + + torch_version = str(torch.__version__).split("+", maxsplit=1)[0] + + if data.get("fsdp_config") and str(data.get("fsdp_version")) == "2": + if version.parse(torch_version) < version.parse("2.7.0"): + raise ValueError("FSDP2 is not supported on torch version < 2.7.0") + + return data + + @model_validator(mode="before") + @classmethod + def default_dataloader_opts(cls, data): + if ( + data.get("dataloader_num_workers") is None + and data.get("dataloader_pin_memory") is None + and data.get("dataloader_prefetch_factor") is None + ): + data["dataloader_num_workers"] = data.get("capabilities").get("n_gpu", 1) + data["dataloader_pin_memory"] = True + data["dataloader_prefetch_factor"] = 256 + + return data + + @model_validator(mode="before") + @classmethod + def default_dataset_processes(cls, data): + if data.get("dataset_processes") is None: + data["dataset_processes"] = get_default_process_count() + + return data diff --git a/src/axolotl/utils/schemas/datasets.py b/src/axolotl/utils/schemas/datasets.py index cc5d6daba..d9c8042d4 100644 --- a/src/axolotl/utils/schemas/datasets.py +++ b/src/axolotl/utils/schemas/datasets.py @@ -1,6 +1,8 @@ """Pydantic models for datasets-related configuration""" -from pydantic import BaseModel, model_validator +from typing import Literal + +from pydantic import BaseModel, Field, model_validator from axolotl.utils.schemas.enums import ChatTemplate from axolotl.utils.schemas.utils import handle_legacy_message_fields_logic @@ -9,56 +11,189 @@ from axolotl.utils.schemas.utils import handle_legacy_message_fields_logic class UserDefinedPrompterType(BaseModel): """Structure for user defined prompt types""" - system_prompt: str | None = None - system_format: str | None = None + system_prompt: str | None = Field( + default=None, + json_schema_extra={"description": "Custom user instruction prompt"}, + ) + system_format: str | None = Field( + default=None, + json_schema_extra={"description": "Use {system} as key to be replaced"}, + ) field_system: str | None = None field_instruction: str | None = None field_input: str | None = None field_output: str | None = None - format: str | None = None - no_input_format: str | None = None - field: str | None = None + format: str | None = Field( + default=None, + json_schema_extra={ + "description": "Customizable to be single line or multi-line. Use {instruction}/{input} as key to be replaced. 
'format' can include {input}" + }, + ) + no_input_format: str | None = Field( + default=None, + json_schema_extra={"description": "'no_input_format' cannot include {input}"}, + ) class SFTDataset(BaseModel): """SFT configuration subset""" - path: str | None = None - split: str | None = None - type: str | UserDefinedPrompterType | None = None + path: str | None = Field( + default=None, + json_schema_extra={ + "description": "HuggingFace dataset repo | s3:// | gs:// | path to local file or directory" + }, + ) + split: str | None = Field( + default=None, + json_schema_extra={"description": "name of dataset split to load from"}, + ) + type: str | UserDefinedPrompterType | None = Field( + default=None, + json_schema_extra={ + "description": "The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]" + }, + ) input_transform: str | None = None - shards: int | None = None - shards_idx: int | None = None - preprocess_shards: int | None = None + shards: int | None = Field( + default=None, + json_schema_extra={ + "description": "split dataset into N pieces (use with shards_idx)" + }, + ) + shards_idx: int | None = Field( + default=None, + json_schema_extra={"description": "the index of sharded dataset to use"}, + ) + preprocess_shards: int | None = Field( + default=None, + json_schema_extra={ + "description": "process dataset in N sequential chunks for memory efficiency (exclusive with `shards`)" + }, + ) conversation: str | None = None # Do not make this too strict or it will break the validator to choose different dataset class - chat_template: ChatTemplate | str | None = None - chat_template_jinja: str | None = None - data_files: str | list[str] | None = None + chat_template: ChatTemplate | str | None = Field( + default=None, + json_schema_extra={ + "description": "The name of the chat template to use for training, following values are supported: tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default. alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py. tokenizer_default_fallback_*: where * is the name of the chat template to fallback to if the tokenizer does not have a chat template else default to tokenizer. E.g. tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field." + }, + ) + chat_template_jinja: str | None = Field( + default=None, + json_schema_extra={ + "description": "Custom jinja chat template or path to jinja file. Used only if `chat_template: jinja` or empty." 
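Putting the dataset-level fields above together, a hypothetical datasets entry might look like the following (the dataset path is a placeholder):

datasets:
  - path: my-org/my-sft-dataset   # hypothetical HF dataset repo
    type: chat_template
    split: train
chat_template: chatml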
+ }, + ) + data_files: str | list[str] | None = Field( + default=None, json_schema_extra={"description": "path to source data files"} + ) input_format: str | None = None - name: str | None = None - ds_type: str | None = None - field: str | None = None + name: str | None = Field( + default=None, + json_schema_extra={"description": "name of dataset configuration to load"}, + ) + ds_type: str | None = Field( + default=None, + json_schema_extra={"description": "defines the datatype when path is a file"}, + ) + field: str | None = Field( + default=None, + json_schema_extra={ + "description": "For `completion` datasets only, uses the provided field instead of `text` column" + }, + ) field_human: str | None = None field_model: str | None = None - field_messages: str | None = None + field_messages: str | None = Field( + default=None, + json_schema_extra={ + "description": 'Key containing the messages (default: "messages")' + }, + ) + field_tools: str | None = Field( + default=None, + json_schema_extra={ + "description": 'Key containing the tools (default: "tools"). Must be a list[dict] and follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).' + }, + ) + field_thinking: str | None = Field( + default=None, + json_schema_extra={ + "description": 'Key containing the reasoning trace (default: "reasoning_content").' + }, + ) + template_thinking_key: str | None = Field( + default=None, + json_schema_extra={ + "description": "The key the chat template expects that indicates the reasoning trace." + }, + ) # deprecated, use message_property_mappings message_field_role: str | None = None # deprecated, use message_property_mappings message_field_content: str | None = None - message_property_mappings: dict[str, str] | None = None - message_field_training: str | None = None - message_field_training_detail: str | None = None - split_thinking: bool | None = None + message_property_mappings: dict[str, str] | None = Field( + default=None, + json_schema_extra={ + "description": "Mapping of properties from the input dataset to the chat template. (default: message_property_mappings={'role':'role', 'content':'content'}) If a property exists in the template but not in this mapping, the system will attempt to load it directly from the message using the property name as the key. Example: In the mapping below, 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and used as 'content' in the chat template." + }, + ) + message_field_training: str | None = Field( + default=None, + json_schema_extra={ + "description": "The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`." + }, + ) + message_field_training_detail: str | None = Field( + default=None, + json_schema_extra={ + "description": "The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn. The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train)." 
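As a sketch of the message-mapping fields above, assuming a hypothetical dataset whose turns use 'from'/'value' keys under a 'conversations' column:

datasets:
  - path: my-org/conversations-data   # hypothetical
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from       # dataset's 'from' key supplies the role
      content: value   # dataset's 'value' key supplies the content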
+ }, + ) + split_thinking: bool | None = Field( + default=None, + json_schema_extra={ + "description": "(for Qwen3 template only) Whether to split the assistant content based on a reasoning trace inside delimited tags" + }, + ) logprobs_field: str | None = None temperature: float | None = None - roles_to_train: list[str] | None = None - train_on_eos: str | None = None - roles: dict[str, list[str]] | None = None - drop_system_message: bool | None = None - trust_remote_code: bool | None = False - revision: str | None = None + roles_to_train: list[str] | None = Field( + default=None, + json_schema_extra={ + "description": "Roles to train on. The tokens from these roles will be considered for the loss." + }, + ) + train_on_eos: Literal["all", "turn", "last"] | None = Field( + default=None, + json_schema_extra={ + "description": "Which EOS tokens to train on in the conversation. Possible values are: all: train on all EOS tokens, turn (default): train on the EOS token at the end of each trainable turn, last: train on the last EOS token in the conversation" + }, + ) + roles: dict[str, list[str]] | None = Field( + default=None, + json_schema_extra={ + "description": 'Roles mapping in the messages. The format is {target_role: [source_roles]}. All source roles will be mapped to the target role. The default is: user: ["human", "user"], assistant: ["gpt", "assistant"], system: ["system"], tool: ["tool"]' + }, + ) + drop_system_message: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to drop the system turn from the dataset. Only works with chat_template. This does not drop the default system message from chat_template if it exists. If you wish to, we recommend using a custom jinja template with the default system message removed or adding a system turn with empty content." + }, + ) + trust_remote_code: bool | None = Field( + default=False, + json_schema_extra={"description": "Trust remote code for untrusted source"}, + ) + revision: str | None = Field( + default=None, + json_schema_extra={ + "description": "The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets." 
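A hedged example of the turn-masking options above (dataset path is a placeholder):

datasets:
  - path: my-org/chat-data   # hypothetical
    type: chat_template
    roles_to_train: ["assistant"]
    train_on_eos: turn         # train on the EOS closing each trainable turn
    drop_system_message: false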
+ }, + ) @model_validator(mode="before") @classmethod diff --git a/src/axolotl/utils/schemas/deprecated.py b/src/axolotl/utils/schemas/deprecated.py index d42d6ff9e..972fe0ccf 100644 --- a/src/axolotl/utils/schemas/deprecated.py +++ b/src/axolotl/utils/schemas/deprecated.py @@ -1,11 +1,12 @@ """Pydantic models for deprecated and remapped configuration parameters""" -import logging from typing import Any from pydantic import BaseModel, Field, field_validator -LOG = logging.getLogger(__name__) +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) class DeprecatedParameters(BaseModel): @@ -59,10 +60,30 @@ class RemappedParameters(BaseModel): """Parameters that have been remapped to other names""" overrides_of_model_config: dict[str, Any] | None = Field( - default=None, alias="model_config" + default=None, + alias="model_config", + json_schema_extra={ + "description": "optional overrides to the base model configuration" + }, ) overrides_of_model_kwargs: dict[str, Any] | None = Field( - default=None, alias="model_kwargs" + default=None, + alias="model_kwargs", + json_schema_extra={ + "description": "optional overrides the base model loading from_pretrained" + }, + ) + type_of_model: str | None = Field( + default=None, + alias="model_type", + json_schema_extra={ + "description": "If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too" + }, + ) + revision_of_model: str | None = Field( + default=None, + alias="model_revision", + json_schema_extra={ + "description": "You can specify to choose a specific model revision from huggingface hub" + }, ) - type_of_model: str | None = Field(default=None, alias="model_type") - revision_of_model: str | None = Field(default=None, alias="model_revision") diff --git a/src/axolotl/utils/schemas/enums.py b/src/axolotl/utils/schemas/enums.py index ff8471dfd..cf2a8b484 100644 --- a/src/axolotl/utils/schemas/enums.py +++ b/src/axolotl/utils/schemas/enums.py @@ -1,68 +1,93 @@ """Enums for Axolotl input config""" +# pylint: disable=invalid-name + from enum import Enum +import torch + + +class TorchIntDType(Enum): + """Torch integer data types - `getattr` guards against torch < 2.6 which does not support int4""" + + uint1 = getattr(torch, "uint1", None) + uint2 = getattr(torch, "uint2", None) + uint3 = getattr(torch, "uint3", None) + uint4 = getattr(torch, "uint4", None) + uint5 = getattr(torch, "uint5", None) + uint6 = getattr(torch, "uint6", None) + uint7 = getattr(torch, "uint7", None) + int4 = getattr(torch, "int4", None) + int8 = getattr(torch, "int8", None) + class RLType(str, Enum): """RL trainer type configuration subset""" - DPO = "dpo" # pylint: disable=invalid-name - GRPO = "grpo" # pylint: disable=invalid-name - IPO = "ipo" # pylint: disable=invalid-name - ORPO = "orpo" # pylint: disable=invalid-name - KTO = "kto" # pylint: disable=invalid-name - SIMPO = "simpo" # pylint: disable=invalid-name + DPO = "dpo" + GRPO = "grpo" + IPO = "ipo" + ORPO = "orpo" + KTO = "kto" + SIMPO = "simpo" class ChatTemplate(str, Enum): """Chat templates configuration subset""" - alpaca = "alpaca" # pylint: disable=invalid-name - chatml = "chatml" # pylint: disable=invalid-name - mistral_v1 = "mistral_v1" # pylint: disable=invalid-name - mistral_v2v3 = "mistral_v2v3" # pylint: disable=invalid-name - mistral_v3_tekken = "mistral_v3_tekken" # pylint: disable=invalid-name - mistral_v7_tekken = "mistral_v7_tekken" # pylint: disable=invalid-name - gemma = "gemma" # pylint: disable=invalid-name - cohere = "cohere" # pylint: 
disable=invalid-name - llama3 = "llama3" # pylint: disable=invalid-name - llama3_2_vision = "llama3_2_vision" # pylint: disable=invalid-name - llama4 = "llama4" # pylint: disable=invalid-name - phi_3 = "phi_3" # pylint: disable=invalid-name - phi_35 = "phi_35" # pylint: disable=invalid-name - deepseek_v2 = "deepseek_v2" # pylint: disable=invalid-name - deepseek_v3 = "deepseek_v3" # pylint: disable=invalid-name - jamba = "jamba" # pylint: disable=invalid-name - jinja = "jinja" # pylint: disable=invalid-name - qwen_25 = "qwen_25" # pylint: disable=invalid-name - qwen3 = "qwen3" # pylint: disable=invalid-name - tokenizer_default = "tokenizer_default" # pylint: disable=invalid-name - exaone = "exaone" # pylint: disable=invalid-name - metharme = "metharme" # pylint: disable=invalid-name - pixtral = "pixtral" # pylint: disable=invalid-name - llava = "llava" # pylint: disable=invalid-name - qwen2_vl = "qwen2_vl" # pylint: disable=invalid-name - gemma3 = "gemma3" # pylint: disable=invalid-name + alpaca = "alpaca" + chatml = "chatml" + mistral_v1 = "mistral_v1" + mistral_v2v3 = "mistral_v2v3" + mistral_v3_tekken = "mistral_v3_tekken" + mistral_v7_tekken = "mistral_v7_tekken" + gemma = "gemma" + cohere = "cohere" + llama3 = "llama3" + llama3_2_vision = "llama3_2_vision" + llama4 = "llama4" + phi_3 = "phi_3" + phi_35 = "phi_35" + deepseek_v2 = "deepseek_v2" + deepseek_v3 = "deepseek_v3" + jamba = "jamba" + jinja = "jinja" + qwen_25 = "qwen_25" + qwen3 = "qwen3" + falcon_h1 = "falcon_h1" + tokenizer_default = "tokenizer_default" + exaone = "exaone" + metharme = "metharme" + pixtral = "pixtral" + llava = "llava" + qwen2_vl = "qwen2_vl" + gemma3 = "gemma3" + gemma3n = "gemma3n" + command_a = "command_a" + command_a_tool_use = "command_a_tool_use" + command_a_rag = "command_a_rag" + aya = "aya" class CustomSupportedOptimizers(str, Enum): """Custom supported optimizers""" - optimi_adamw = "optimi_adamw" # pylint: disable=invalid-name - ao_adamw_4bit = "ao_adamw_4bit" # pylint: disable=invalid-name - ao_adamw_8bit = "ao_adamw_8bit" # pylint: disable=invalid-name - ao_adamw_fp8 = "ao_adamw_fp8" # pylint: disable=invalid-name - adopt_adamw = "adopt_adamw" # pylint: disable=invalid-name - came_pytorch = "came_pytorch" # pylint: disable=invalid-name - muon = "muon" # pylint: disable=invalid-name + optimi_adamw = "optimi_adamw" + ao_adamw_4bit = "ao_adamw_4bit" + ao_adamw_8bit = "ao_adamw_8bit" + ao_adamw_fp8 = "ao_adamw_fp8" + adopt_adamw = "adopt_adamw" + came_pytorch = "came_pytorch" + muon = "muon" + dion = "dion" class RingAttnFunc(str, Enum): """Enum class for supported `ring-flash-attn` implementations""" - # VARLEN_RING = "varlen_ring" - # VARLEN_ZIGZAG = "varlen_zigzag" VARLEN_LLAMA3 = "varlen_llama3" BATCH_RING = "batch_ring" + # VARLEN_RING = "varlen_ring" + # VARLEN_ZIGZAG = "varlen_zigzag" # BATCH_ZIGZAG = "batch_zigzag" # BATCH_STRIPE = "batch_stripe" diff --git a/src/axolotl/utils/schemas/integrations.py b/src/axolotl/utils/schemas/integrations.py index 9d8f9c190..7332c7d39 100644 --- a/src/axolotl/utils/schemas/integrations.py +++ b/src/axolotl/utils/schemas/integrations.py @@ -1,21 +1,33 @@ """Pydantic models for Axolotl integrations""" -import logging from typing import Any from pydantic import BaseModel, Field, model_validator -LOG = logging.getLogger(__name__) +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) class MLFlowConfig(BaseModel): """MLFlow configuration subset""" use_mlflow: bool | None = None - mlflow_tracking_uri: str | None = None - 
mlflow_experiment_name: str | None = None - mlflow_run_name: str | None = None - hf_mlflow_log_artifacts: bool | None = None + mlflow_tracking_uri: str | None = Field( + default=None, json_schema_extra={"description": "URI to mlflow"} + ) + mlflow_experiment_name: str | None = Field( + default=None, json_schema_extra={"description": "Your experiment name"} + ) + mlflow_run_name: str | None = Field( + default=None, json_schema_extra={"description": "Your run name"} + ) + hf_mlflow_log_artifacts: bool | None = Field( + default=None, + json_schema_extra={ + "description": "set to true to copy each saved checkpoint on each save to mlflow artifact registry" + }, + ) class LISAConfig(BaseModel): @@ -39,13 +51,33 @@ class WandbConfig(BaseModel): """Wandb configuration subset""" use_wandb: bool | None = None - wandb_name: str | None = None - wandb_run_id: str | None = None - wandb_mode: str | None = None - wandb_project: str | None = None - wandb_entity: str | None = None + wandb_name: str | None = Field( + default=None, + json_schema_extra={"description": "Set the name of your wandb run"}, + ) + wandb_run_id: str | None = Field( + default=None, json_schema_extra={"description": "Set the ID of your wandb run"} + ) + wandb_mode: str | None = Field( + default=None, + json_schema_extra={ + "description": '"offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb' + }, + ) + wandb_project: str | None = Field( + default=None, json_schema_extra={"description": "Your wandb project name"} + ) + wandb_entity: str | None = Field( + default=None, + json_schema_extra={"description": "A wandb Team name if using a Team"}, + ) wandb_watch: str | None = None - wandb_log_model: str | None = None + wandb_log_model: str | None = Field( + default=None, + json_schema_extra={ + "description": '"checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training' + }, + ) @model_validator(mode="before") @classmethod @@ -63,14 +95,52 @@ class WandbConfig(BaseModel): class CometConfig(BaseModel): """Comet configuration subset""" - use_comet: bool | None = None - comet_api_key: str | None = None - comet_workspace: str | None = None - comet_project_name: str | None = None - comet_experiment_key: str | None = None - comet_mode: str | None = None - comet_online: bool | None = None - comet_experiment_config: dict[str, Any] | None = None + use_comet: bool | None = Field( + default=None, + json_schema_extra={"description": "Enable or disable Comet integration."}, + ) + comet_api_key: str | None = Field( + default=None, + json_schema_extra={ + "description": "API key for Comet. Recommended to set via `comet login`." + }, + ) + comet_workspace: str | None = Field( + default=None, + json_schema_extra={ + "description": "Workspace name in Comet. Defaults to the user's default workspace." + }, + ) + comet_project_name: str | None = Field( + default=None, + json_schema_extra={ + "description": "Project name in Comet. Defaults to Uncategorized." + }, + ) + comet_experiment_key: str | None = Field( + default=None, + json_schema_extra={ + "description": "Identifier for the experiment. Used to append data to an existing experiment or control the key of new experiments. Default to a random key." + }, + ) + comet_mode: str | None = Field( + default=None, + json_schema_extra={ + "description": 'Create a new experiment ("create") or log to an existing one ("get"). Default ("get_or_create") auto-selects based on configuration.' 
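The field-level `json_schema_extra={"description": ...}` pattern applied throughout these integration schemas surfaces each description in the generated JSON schema. A minimal sketch of that behavior, assuming Pydantic v2; `ExampleTrackerConfig` is a hypothetical stand-in model, not part of the diff:

from pydantic import BaseModel, Field

class ExampleTrackerConfig(BaseModel):
    wandb_project: str | None = Field(
        default=None, json_schema_extra={"description": "Your wandb project name"}
    )

schema = ExampleTrackerConfig.model_json_schema()
# schema["properties"]["wandb_project"]["description"] -> "Your wandb project name"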
+ }, + ) + comet_online: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Set to True to log data to Comet server, or False for offline storage. Default is True." + }, + ) + comet_experiment_config: dict[str, Any] | None = Field( + default=None, + json_schema_extra={ + "description": "Dictionary for additional configuration settings, see the doc for more details." + }, + ) class GradioConfig(BaseModel): diff --git a/src/axolotl/utils/schemas/model.py b/src/axolotl/utils/schemas/model.py index 5f1d26e84..eb751bfcc 100644 --- a/src/axolotl/utils/schemas/model.py +++ b/src/axolotl/utils/schemas/model.py @@ -1,10 +1,12 @@ """Pydantic models for model input / output, etc. configuration""" -import logging +from typing import Any, Literal from pydantic import BaseModel, Field, field_validator -LOG = logging.getLogger(__name__) +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) class ModelInputConfig(BaseModel): @@ -12,19 +14,77 @@ class ModelInputConfig(BaseModel): model_config = {"protected_namespaces": ()} - base_model: str - base_model_config: str | None = None + base_model: str = Field( + json_schema_extra={ + "description": "This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This can also be a relative path to a model on disk" + } + ) + base_model_config: str | None = Field( + default=None, + json_schema_extra={ + "description": "If the base_model repo on hf hub doesn't include configuration .json files, You can set that here, or leave this empty to default to base_model" + }, + ) cls_model_config: str | None = None - tokenizer_config: str | None = None - tokenizer_use_fast: bool | None = None - tokenizer_legacy: bool | None = None + tokenizer_config: str | None = Field( + default=None, + json_schema_extra={ + "description": "Optional tokenizer configuration path in case you want to use a different tokenizer than the one defined in the base model" + }, + ) + tokenizer_use_fast: bool | None = Field( + default=None, + json_schema_extra={ + "description": "use_fast option for tokenizer loading from_pretrained, default to True" + }, + ) + tokenizer_legacy: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to use the legacy tokenizer setting, defaults to True" + }, + ) + tokenizer_use_mistral_common: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to use mistral-common tokenizer. If set to True, it will use the mistral-common tokenizer." + }, + ) tokenizer_type: str | None = Field( - default=None, json_schema_extra={"description": "transformers tokenizer class"} + default=None, + json_schema_extra={ + "description": "Corresponding tokenizer for the model AutoTokenizer is a good choice" + }, ) processor_type: str | None = Field( default=None, json_schema_extra={"description": "transformers processor class"} ) - trust_remote_code: bool | None = None + trust_remote_code: bool | None = Field( + default=None, + json_schema_extra={"description": "Trust remote code for untrusted source"}, + ) + + experimental_skip_move_to_device: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Don't move the model to the device before sharding. " + "This is an experimental feature that may be included in the future as the default." + }, + ) + + use_kernels: bool | None = Field( + default=None, + json_schema_extra={"description": "Use custom kernels, e.g. 
MegaBlocks."}, + ) + + model_quantization_config: Literal["Mxfp4Config"] | None = Field( + default=None, + json_schema_extra={"description": "Model loading quantization config"}, + ) + model_quantization_config_kwargs: dict[str, Any] | None = Field( + default=None, + json_schema_extra={"description": "kwargs for model quantization config"}, + ) @field_validator("trust_remote_code") @classmethod @@ -39,10 +99,23 @@ class ModelInputConfig(BaseModel): class ModelOutputConfig(BaseModel): """model save configuration subset""" - output_dir: str = Field(default="./model-out") - hub_model_id: str | None = None - hub_strategy: str | None = None - save_safetensors: bool | None = True + output_dir: str = Field( + default="./model-out", + json_schema_extra={"description": "Where to save the full-finetuned model to"}, + ) + hub_model_id: str | None = Field( + default=None, json_schema_extra={"description": "push checkpoints to hub"} + ) + hub_strategy: str | None = Field( + default=None, + json_schema_extra={"description": "how to push checkpoints to hub"}, + ) + save_safetensors: bool | None = Field( + default=True, + json_schema_extra={ + "description": "Save model as safetensors (require safetensors package). Default True" + }, + ) class SpecialTokensConfig(BaseModel): diff --git a/src/axolotl/utils/schemas/peft.py b/src/axolotl/utils/schemas/peft.py index 5d408e1fe..de29521cb 100644 --- a/src/axolotl/utils/schemas/peft.py +++ b/src/axolotl/utils/schemas/peft.py @@ -9,7 +9,7 @@ class LoftQConfig(BaseModel): """LoftQ configuration subset""" loftq_bits: int = Field( - default=4, json_schema_extra={"description": "Quantization bits for LoftQ"} + default=4, json_schema_extra={"description": "typically 4 bits"} ) # loftq_iter: int = Field(default=1, json_schema_extra={"description": "Alternating iterations for LoftQ"}) @@ -17,31 +17,79 @@ class LoftQConfig(BaseModel): class PeftConfig(BaseModel): """peftq configuration subset""" - loftq_config: LoftQConfig | None = None + loftq_config: LoftQConfig | None = Field( + default=None, + json_schema_extra={ + "description": "Configuration options for loftq initialization for LoRA" + }, + ) class LoraConfig(BaseModel): """Peft / LoRA configuration subset""" - load_in_8bit: bool | None = Field(default=False) - load_in_4bit: bool | None = Field(default=False) + load_in_8bit: bool | None = Field( + default=False, + json_schema_extra={ + "description": "This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer" + }, + ) + load_in_4bit: bool | None = Field( + default=False, json_schema_extra={"description": "Use bitsandbytes 4 bit"} + ) - adapter: str | None = None - lora_model_dir: str | None = None + adapter: str | None = Field( + default=None, + json_schema_extra={ + "description": "If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model" + }, + ) + lora_model_dir: str | None = Field( + default=None, + json_schema_extra={ + "description": "If you already have a lora model trained that you want to load, put that here. This means after training, if you want to test the model, you should set this to the value of `output_dir`. Note that if you merge an adapter to the base model, a new subdirectory `merged` will be created under the `output_dir`." 
+ }, + ) lora_r: int | None = None lora_alpha: int | None = None lora_fan_in_fan_out: bool | None = None lora_target_modules: str | list[str] | None = None - lora_target_linear: bool | None = None - lora_modules_to_save: list[str] | None = None + lora_target_parameters: str | list[str] | None = None + lora_target_linear: bool | None = Field( + default=None, + json_schema_extra={"description": "If true, will target all linear modules"}, + ) + lora_modules_to_save: list[str] | None = Field( + default=None, + json_schema_extra={ + "description": "If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens. For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities." + }, + ) lora_dropout: float | None = 0.0 - peft_layers_to_transform: list[int] | None = None + peft_layers_to_transform: list[int] | None = Field( + default=None, + json_schema_extra={ + "description": "The layer indices to transform, otherwise, apply to all layers" + }, + ) peft_layers_pattern: list[str] | None = None peft: PeftConfig | None = None - peft_use_dora: bool | None = None - peft_use_rslora: bool | None = None - peft_layer_replication: list[tuple[int, int]] | None = None - peft_init_lora_weights: bool | str | None = None + peft_use_dora: bool | None = Field( + default=None, json_schema_extra={"description": "Whether to use DoRA."} + ) + peft_use_rslora: bool | None = Field( + default=None, json_schema_extra={"description": "Whether to use RSLoRA."} + ) + peft_layer_replication: list[tuple[int, int]] | None = Field( + default=None, + json_schema_extra={"description": "List of layer indices to replicate."}, + ) + peft_init_lora_weights: bool | str | None = Field( + default=None, + json_schema_extra={ + "description": "How to initialize LoRA weights. Default to True which is MS original implementation." + }, + ) qlora_sharded_model_loading: bool | None = Field( default=False, @@ -49,9 +97,24 @@ class LoraConfig(BaseModel): "description": "load qlora model in sharded format for FSDP using answer.ai technique." }, ) - lora_on_cpu: bool | None = None - gptq: bool | None = None - bnb_config_kwargs: dict[str, Any] | None = None + lora_on_cpu: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge" + }, + ) + gptq: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether you are training a 4-bit GPTQ quantized model" + }, + ) + bnb_config_kwargs: dict[str, Any] | None = Field( + default=None, + json_schema_extra={ + "description": "optional overrides to the bnb 4bit quantization configuration" + }, + ) loraplus_lr_ratio: float | None = Field( default=None, @@ -62,7 +125,7 @@ class LoraConfig(BaseModel): loraplus_lr_embedding: float | None = Field( default=1e-6, json_schema_extra={ - "description": "loraplus learning rate for lora embedding layers." + "description": "loraplus learning rate for lora embedding layers. Default value is 1e-6." 
}, ) @@ -125,8 +188,21 @@ class LoraConfig(BaseModel): class ReLoRAConfig(BaseModel): """ReLoRA configuration subset""" - relora_steps: int | None = None - relora_warmup_steps: int | None = None - relora_anneal_steps: int | None = None - relora_prune_ratio: float | None = None - relora_cpu_offload: bool | None = None + relora: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Whether to use ReLoRA. Use with jagged_restart_*steps options." + }, + ) + relora_prune_ratio: float | None = Field( + default=None, + json_schema_extra={ + "description": "threshold for optimizer magnitude when pruning" + }, + ) + relora_cpu_offload: bool | None = Field( + default=None, + json_schema_extra={ + "description": "True to perform lora weight merges on cpu during restarts, for modest gpu memory savings" + }, + ) diff --git a/src/axolotl/utils/schemas/quantization.py b/src/axolotl/utils/schemas/quantization.py new file mode 100644 index 000000000..090640c7b --- /dev/null +++ b/src/axolotl/utils/schemas/quantization.py @@ -0,0 +1,74 @@ +""" +QAT Config Schema +""" + +from typing import Any + +from pydantic import BaseModel, Field, field_validator + +from axolotl.utils.schemas.enums import TorchIntDType + + +class QATConfig(BaseModel): + """ + QAT Config Schema + """ + + activation_dtype: TorchIntDType | None = Field( + default=None, + description='Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"', + ) + weight_dtype: TorchIntDType = Field( + default=TorchIntDType.int8, + description='Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8"', + ) + quantize_embedding: bool | None = Field( + default=False, description="Quantize embedding" + ) + group_size: int | None = Field( + default=32, + description="The number of elements in each group for per-group fake quantization", + ) + fake_quant_after_n_steps: int | None = Field( + default=None, description="The number of steps to apply fake quantization after" + ) + + @field_validator("activation_dtype", "weight_dtype", mode="before") + @classmethod + def validate_dtype(cls, v: Any) -> TorchIntDType | None: + if v == "int4": + return TorchIntDType.int4 + if v == "int8": + return TorchIntDType.int8 + raise ValueError(f"Invalid dtype: '{v}'. Must be one of: ['int4', 'int8']") + + +class PTQConfig(BaseModel): + """ + PTQ Config Schema + """ + + weight_dtype: TorchIntDType = Field( + default=TorchIntDType.int8, + description="Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8", + ) + activation_dtype: TorchIntDType | None = Field( + default=None, + description='Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"', + ) + quantize_embedding: bool | None = Field( + default=None, description="Whether to quantize the embedding layer." + ) + group_size: int | None = Field( + default=32, + description="The number of elements in each group for per-group fake quantization", + ) + + @field_validator("activation_dtype", "weight_dtype", mode="before") + @classmethod + def validate_dtype(cls, v: Any) -> TorchIntDType | None: + if v == "int4": + return TorchIntDType.int4 + if v == "int8": + return TorchIntDType.int8 + raise ValueError(f"Invalid dtype: '{v}'. 
Must be one of: ['int4', 'int8']") diff --git a/src/axolotl/utils/schemas/training.py b/src/axolotl/utils/schemas/training.py index 69547c17f..b1788dcaa 100644 --- a/src/axolotl/utils/schemas/training.py +++ b/src/axolotl/utils/schemas/training.py @@ -1,15 +1,15 @@ """Pydantic models for training hyperparameters""" -import logging from typing import Any, Literal from pydantic import BaseModel, Field, field_validator from transformers import SchedulerType from transformers.training_args import OptimizerNames +from axolotl.utils.logging import get_logger from axolotl.utils.schemas.enums import CustomSupportedOptimizers -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) class LrGroup(BaseModel): @@ -23,10 +23,17 @@ class LrGroup(BaseModel): class HyperparametersConfig(BaseModel): """Training hyperparams configuration subset""" - gradient_accumulation_steps: int | None = Field(default=1) + gradient_accumulation_steps: int | None = Field( + default=1, + json_schema_extra={ + "description": "If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps." + }, + ) micro_batch_size: int | None = Field( default=1, - json_schema_extra={"description": "per gpu micro batch size for training"}, + json_schema_extra={ + "description": "The number of samples to include in each batch. This is the number of samples sent to each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps" + }, ) batch_size: int | None = Field( default=None, @@ -41,45 +48,119 @@ class HyperparametersConfig(BaseModel): }, ) - auto_find_batch_size: bool | None = None + auto_find_batch_size: bool | None = Field( + default=None, + json_schema_extra={ + "description": "whether to find batch size that fits in memory. Passed to underlying transformers Trainer" + }, + ) - train_on_inputs: bool | None = False - group_by_length: bool | None = None + train_on_inputs: bool | None = Field( + default=False, + json_schema_extra={ + "description": "Whether to mask out or include the human's prompt from the training labels" + }, + ) + group_by_length: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Group similarly sized data to minimize padding. May be slower to start, as it must download and sort the entire dataset. Note that training loss may have an oscillating pattern with this enabled." + }, + ) learning_rate: str | float embedding_lr: float | None = None embedding_lr_scale: float | None = None - weight_decay: float | None = 0.0 - optimizer: (OptimizerNames | CustomSupportedOptimizers) | None = ( - OptimizerNames.ADAMW_TORCH_FUSED + weight_decay: float | None = Field( + default=0.0, json_schema_extra={"description": "Specify weight decay"} + ) + optimizer: (OptimizerNames | CustomSupportedOptimizers) | None = Field( + default=OptimizerNames.ADAMW_TORCH_FUSED, + json_schema_extra={"description": "Specify optimizer"}, ) optim_args: (str | dict[str, Any]) | None = Field( default=None, - json_schema_extra={"description": "Optional arguments to supply to optimizer."}, + json_schema_extra={ + "description": "Dictionary of arguments to pass to the optimizer" + }, ) optim_target_modules: (list[str] | Literal["all_linear"]) | None = Field( default=None, json_schema_extra={ - "description": "The target modules to optimize, i.e. the module names that you would like to train." + "description": "The target modules to optimize, i.e. 
the module names that you would like to train, right now this is used only for GaLore algorithm" + }, + ) + torchdistx_path: str | None = Field( + default=None, + json_schema_extra={ + "description": "Path to torch distx for optim 'adamw_anyprecision'" }, ) - torchdistx_path: str | None = None lr_scheduler: (SchedulerType | Literal["one_cycle"] | Literal["rex"]) | None = ( SchedulerType.COSINE ) - lr_scheduler_kwargs: dict[str, Any] | None = None + lr_scheduler_kwargs: dict[str, Any] | None = Field( + default=None, + json_schema_extra={ + "description": "Specify a scheduler and kwargs to use with the optimizer" + }, + ) lr_quadratic_warmup: bool | None = None - cosine_min_lr_ratio: float | None = None - cosine_constant_lr_ratio: float | None = None - lr_div_factor: float | None = None + cosine_min_lr_ratio: float | None = Field( + default=None, + json_schema_extra={ + "description": "decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr" + }, + ) + cosine_constant_lr_ratio: float | None = Field( + default=None, + json_schema_extra={ + "description": "freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step" + }, + ) + lr_div_factor: float | None = Field( + default=None, json_schema_extra={"description": "Learning rate div factor"} + ) lr_groups: list[LrGroup] | None = None - adam_epsilon: float | None = None - adam_epsilon2: float | None = None - adam_beta1: float | None = None - adam_beta2: float | None = None - adam_beta3: float | None = None - max_grad_norm: float | None = None + adam_epsilon: float | None = Field( + default=None, json_schema_extra={"description": "adamw hyperparams"} + ) + adam_epsilon2: float | None = Field( + default=None, json_schema_extra={"description": "only used for CAME Optimizer"} + ) + adam_beta1: float | None = Field( + default=None, json_schema_extra={"description": "adamw hyperparams"} + ) + adam_beta2: float | None = Field( + default=None, json_schema_extra={"description": "adamw hyperparams"} + ) + adam_beta3: float | None = Field( + default=None, json_schema_extra={"description": "only used for CAME Optimizer"} + ) + + dion_lr: float | None = Field( + default=None, json_schema_extra={"description": "Dion Optimizer learning rate"} + ) + dion_momentum: float | None = Field( + default=None, json_schema_extra={"description": "Dion Optimizer momentum"} + ) + dion_rank_fraction: float | None = Field( + default=1.0, + json_schema_extra={ + "description": "Dion Optimizer: r/d fraction for low-rank approximation. Used to compute the low-rank dimension." + }, + ) + dion_rank_multiple_of: int | None = Field( + default=1, + json_schema_extra={ + "description": "Dion Optimizer: Round up the low-rank dimension to a multiple of this number. This may be useful to ensure even sharding." 
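A quick worked example of the batch-size relationship noted in the `micro_batch_size` description earlier in this hunk; the numbers below are hypothetical, not defaults:

micro_batch_size = 2
gradient_accumulation_steps = 4
num_gpus = 8

per_gpu_batch_size = micro_batch_size * gradient_accumulation_steps  # 8 samples per optimizer step, per GPU
global_batch_size = per_gpu_batch_size * num_gpus                    # 64 samples per optimizer step overall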
+ }, + + max_grad_norm: float | None = Field( + default=None, json_schema_extra={"description": "Gradient clipping max norm"} + ) num_epochs: float = Field(default=1.0) @field_validator("batch_size") @@ -99,3 +180,24 @@ class HyperparametersConfig(BaseModel): if learning_rate and isinstance(learning_rate, str): learning_rate = float(learning_rate) return learning_rate + + +class JaggedLRConfig(BaseModel): + """JaggedLR configuration subset, can be used w/ ReLoRA training""" + + jagged_restart_steps: int | None = Field( + default=None, + json_schema_extra={"description": "how often to reset for jagged restarts"}, + ) + jagged_restart_warmup_steps: int | None = Field( + default=None, + json_schema_extra={ + "description": "how many warmup steps to take after reset for jagged restarts" + }, + ) + jagged_restart_anneal_steps: int | None = Field( + default=None, + json_schema_extra={ + "description": "how many anneal steps to take before reset for jagged restarts" + }, + ) diff --git a/src/axolotl/utils/schemas/trl.py b/src/axolotl/utils/schemas/trl.py index 37b71dba8..980474e87 100644 --- a/src/axolotl/utils/schemas/trl.py +++ b/src/axolotl/utils/schemas/trl.py @@ -1,5 +1,7 @@ """Pydantic models for TRL trainer configuration""" +from typing import Literal + from pydantic import BaseModel, Field @@ -10,12 +12,14 @@ class TRLConfig(BaseModel): beta: float | None = Field( default=None, - json_schema_extra={"description": "Beta for RL training"}, + json_schema_extra={ + "description": "Beta parameter for RL training. Same as `rl_beta`." + }, ) max_completion_length: int | None = Field( default=None, json_schema_extra={ - "description": "Maximum length of the completion for RL training" + "description": "Maximum length of the completion for RL training." }, ) @@ -23,81 +27,83 @@ class TRLConfig(BaseModel): # Ref: https://github.com/huggingface/trl/blob/26d86757a7c7e24e397ea44f57ecce6031dfac01/trl/trainer/grpo_config.py#L23 use_vllm: bool = Field( default=False, - json_schema_extra={"description": "Whether to use VLLM for RL training"}, + json_schema_extra={"description": "Whether to use vLLM for RL training."}, + ) + vllm_mode: Literal["server", "colocate"] | None = Field( + default=None, + json_schema_extra={ + "description": "vLLM mode to use, one of 'server' or 'colocate'" + }, + ) vllm_server_host: str | None = Field( default="0.0.0.0", # nosec B104 - json_schema_extra={"description": "Host of the vLLM server to connect to"}, + json_schema_extra={"description": "Host of the vLLM server to connect to."}, ) vllm_server_port: int | None = Field( default=8000, - json_schema_extra={"description": "Port of the vLLM server to connect to"}, + json_schema_extra={"description": "Port of the vLLM server to connect to."}, ) vllm_server_timeout: int | None = Field( default=None, json_schema_extra={ - "description": "Total timeout duration in seconds to wait for the vLLM server to be up. If the server is not up " - "after the timeout, a `ConnectionError` is raised." + "description": "Total timeout (in seconds) to wait for the vLLM server to respond." }, ) vllm_guided_decoding_regex: str | None = Field( default=None, - json_schema_extra={ - "description": "Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled." 
- }, + json_schema_extra={"description": "Regex for vLLM guided decoding."}, ) reward_funcs: list[str] | None = Field( default=None, - json_schema_extra={"description": "List of reward functions to load"}, + json_schema_extra={ + "description": "List of reward functions to load. Paths must be importable from current dir." + }, ) reward_weights: list[float] | None = Field( default=None, json_schema_extra={ - "description": "Weights for each reward function. Must match the number of reward functions." + "description": "List of reward weights for the reward functions." }, ) num_generations: int | None = Field( default=None, - json_schema_extra={ - "description": "Number of generations to sample. The global batch size (num_processes * per_device_batch_size) must be divisible by this value." - }, + json_schema_extra={"description": "Number of generations to sample."}, ) log_completions: bool | None = Field( default=False, - json_schema_extra={"description": "Whether to log completions"}, + json_schema_extra={"description": "Whether to log completions."}, ) num_completions_to_print: int | None = Field( default=None, json_schema_extra={ - "description": "Number of completions to print. If `log_completions` is `True`, this will be the number of completions logged." + "description": "Number of completions to print when log_completions is True." }, ) + importance_sampling_level: Literal["sequence", "token"] | None = Field( + default=None, + json_schema_extra={ + "description": "Controls whether importance sampling ratios are computed at the `'token'` or `'sequence'` level. " + "For GSPO, use `sequence`, default is None which corresponds to the original GRPO paper." + }, + ) + sync_ref_model: bool | None = Field( default=False, - json_schema_extra={ - "description": ( - "Whether to sync the reference model every `ref_model_sync_steps` " - "steps, using the `ref_model_mixup_alpha` parameter." - ) - }, + json_schema_extra={"description": "Whether to sync the reference model."}, ) ref_model_mixup_alpha: float | None = Field( default=0.9, - json_schema_extra={ - "description": "Mixup alpha for the reference model. Requires `sync_ref_model=True`." - }, + json_schema_extra={"description": "Mixup alpha for the reference model."}, ) ref_model_sync_steps: int | None = Field( default=64, - json_schema_extra={ - "description": "Sync steps for the reference model. Requires `sync_ref_model=True`." - }, + json_schema_extra={"description": "Sync steps for the reference model."}, ) scale_rewards: bool = Field( default=True, json_schema_extra={ - "description": "Whether to scale the rewards for GRPO by dividing them by their standard deviation." + "description": "Whether to scale rewards by their standard deviation." }, ) @@ -124,13 +130,13 @@ class TRLConfig(BaseModel): repetition_penalty: float | None = Field( default=None, json_schema_extra={ - "description": "Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far." + "description": "Penalty for tokens that appear in prompt and generated text." }, ) num_iterations: int | None = Field( default=None, json_schema_extra={ - "description": "Number of iterations per batch (denoted as μ in the algorithm) for GRPO." + "description": "Number of iterations per batch (μ) for GRPO." }, ) epsilon: float | None = Field( @@ -152,12 +158,12 @@ class TRLConfig(BaseModel): loss_type: str | None = Field( default=None, json_schema_extra={ - "description": "Specifies the loss formulation to use. 
Supported values are `grpo`, `bnpo`, and `dr_grpo`." + "description": "Loss formulation to use. Supported values: grpo, bnpo, dr_grpo." }, ) mask_truncated_completions: bool = Field( default=False, json_schema_extra={ - "description": "When enabled, truncated completions are excluded from the loss calculation." + "description": "Whether to exclude truncated completions from loss calculation." }, ) diff --git a/src/axolotl/utils/schemas/utils.py b/src/axolotl/utils/schemas/utils.py index bf74390f6..b46c8f847 100644 --- a/src/axolotl/utils/schemas/utils.py +++ b/src/axolotl/utils/schemas/utils.py @@ -1,8 +1,8 @@ """Utilities for Axolotl Pydantic models""" -import logging +from axolotl.utils.logging import get_logger -LOG = logging.getLogger(__name__) +LOG = get_logger(__name__) def handle_legacy_message_fields_logic(data: dict) -> dict: diff --git a/src/axolotl/utils/schemas/validation.py b/src/axolotl/utils/schemas/validation.py new file mode 100644 index 000000000..72991c947 --- /dev/null +++ b/src/axolotl/utils/schemas/validation.py @@ -0,0 +1,1339 @@ +"""Module with validation methods for config pydantic model.""" + +# pylint: disable=too-many-boolean-expressions + +import json +import tempfile +from pathlib import Path + +from pydantic import ( + field_validator, + model_validator, +) +from transformers.utils.import_utils import is_torch_npu_available + +from axolotl.utils.logging import get_logger +from axolotl.utils.schemas.enums import ChatTemplate, RingAttnFunc, RLType + +# pylint: disable=too-many-lines + +LOG = get_logger(__name__) + +SUPPORTED_METRICS = {"sacrebleu", "comet", "ter", "chrf", "perplexity"} + + +class DatasetValidationMixin: + """Validation methods related to dataset configuration.""" + + @field_validator("seed", mode="after") + @classmethod + def set_default_seed(cls, seed): + if seed is None: + LOG.info("`seed` not set in config; setting to 42") + seed = 42 + return seed + + @field_validator("datasets", mode="before") + @classmethod + def deprecate_sharegpt_datasets(cls, datasets): + for _, ds_cfg in enumerate(datasets): + ds_type = ( + ds_cfg.get("type") + if isinstance(ds_cfg, dict) + else getattr(ds_cfg, "type", None) + ) + if not ds_type: + continue + + if isinstance(ds_type, dict): + continue + + if isinstance(ds_type, str) and ds_type.startswith("sharegpt"): + raise ValueError( + "`type: sharegpt.*` is deprecated. Please use `type: chat_template` instead." 
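A small sketch of what the `deprecate_sharegpt_datasets` validator above rejects and accepts; the dataset path is a hypothetical placeholder:

rejected = [{"path": "my-org/my-dataset", "type": "sharegpt"}]       # raises ValueError
accepted = [{"path": "my-org/my-dataset", "type": "chat_template"}]  # passes through unchanged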
+ ) + + return datasets + + @model_validator(mode="before") + @classmethod + def check_dataset_or_pretraining_dataset(cls, data): + if data.get("datasets") is None and data.get("pretraining_dataset") is None: + raise ValueError("either datasets or pretraining_dataset is required") + return data + + @model_validator(mode="before") + @classmethod + def check_push_ds_auth(cls, data): + if ( + data.get("push_dataset_to_hub") + and data.get("hf_use_auth_token") is not True + ): + raise ValueError( + "Require cfg.hf_use_auth_token to be True for push_dataset_to_hub" + ) + return data + + @model_validator(mode="before") + @classmethod + def check_val_w_test_datasets(cls, data): + if data.get("test_datasets") and data.get("val_set_size"): + raise ValueError( + "non-zero val_set_size should not be used with test_datasets configuration" + ) + return data + + @model_validator(mode="before") + @classmethod + def check_test_datasets_bench(cls, data): + if ( + data.get("do_bench_eval") + and not data.get("test_datasets") + and not data.get("val_set_size") + ): + LOG.warning( + "`do_bench_eval` needs a test dataset to run evals, adding an empty test_dataset." + ) + data["test_datasets"] = [{"path": "axolotl-ai-co/empty-test-ds"}] + return data + + @model_validator(mode="before") + @classmethod + def check_eval_packing(cls, data): + # TODO also should check test_datasets and val_set_size as we can skip + # if there are no eval datasets/splits + if ( + data.get("sample_packing") + and data.get("eval_table_size") + and data.get("eval_sample_packing") is not False + ): + raise ValueError( + "eval_table_size and eval_sample_packing are not supported together with sample_packing. Please set 'eval_sample_packing' to false." + ) + if ( + data.get("sample_packing") + and data.get("eval_sample_packing") is None + and not data.get("eval_table_size") + ): + LOG.info( + "explicitly setting `eval_sample_packing` to match `sample_packing`", + main_process_only=True, + ) + data["eval_sample_packing"] = True + + if ( + data.get("sample_packing") + and data.get("eval_sample_packing") is False + and data.get("remove_unused_columns") is None + ): + LOG.info( + "setting `remove_unused_columns: false` for when sample_packing and eval_sample_packing don't match" + ) + data["remove_unused_columns"] = False + + return data + + @model_validator(mode="before") + @classmethod + def check_mm_prepare(cls, data): + if data.get("skip_prepare_dataset"): + if data.get("remove_unused_columns") is None: + LOG.info( + "setting `remove_unused_columns: false` for skip_prepare_dataset" + ) + data["remove_unused_columns"] = False + + return data + + +class AttentionValidationMixin: + """Validation methods related to attention mechanisms.""" + + @model_validator(mode="before") + @classmethod + def check_attention_fields(cls, data): + fields = ( + "xformers_attention", + "sdp_attention", + "s2_attention", + "flash_attention", + "flex_attention", + ) + non_empty_count = sum(1 for field in fields if data.get(field)) + + if non_empty_count > 1: + raise ValueError(f"Only one of {', '.join(fields)} must be set") + return data + + @model_validator(mode="before") + @classmethod + def check_sample_packing_without_attention(cls, data): + if ( + data.get("sample_packing") + and not data.get("flash_attention") + and not data.get("sdp_attention") + and not data.get("flex_attention") + and not data.get("xformers_attention") + ): + LOG.warning( + "sample_packing without flash, sdp, xformers or flex attention does not handle cross sample decontamination." 
+ ) + return data + + @model_validator(mode="before") + @classmethod + def check_sample_packing_with_s2attn(cls, data): + if data.get("sample_packing") and data.get("s2_attention"): + raise ValueError( + "Received `sample_packing=true` and `s2_attention=true`; however, \ + shifted-sparse attention does not currently support sample packing." + ) + return data + + +class TrainingValidationMixin: + """Validation methods related to training configuration.""" + + @model_validator(mode="before") + @classmethod + def check_batch_size_fields(cls, data): + fields = ("micro_batch_size", "gradient_accumulation_steps", "batch_size") + non_empty_count = sum(1 for field in fields if data.get(field)) + + if non_empty_count < 2: + raise ValueError(f"At least two of {', '.join(fields)} must be set") + return data + + @model_validator(mode="before") + @classmethod + def hint_sample_packing_padding(cls, data): + if data.get("sample_packing"): + pad_to_sequence_len = data.get("pad_to_sequence_len") + if pad_to_sequence_len is False: + LOG.warning( + "`pad_to_sequence_len: true` is recommended when using sample_packing" + ) + elif pad_to_sequence_len is None: + LOG.info( + "Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing" + ) + data["pad_to_sequence_len"] = True + return data + + @model_validator(mode="before") + @classmethod + def hint_reward_model_pad(cls, data): + if data.get("reward_model") and not data.get("pad_to_sequence_len"): + LOG.warning( + "`pad_to_sequence_len: true` is recommended when using reward_model" + ) + if data.get("pad_to_sequence_len") is None: + data["pad_to_sequence_len"] = True + return data + + @model_validator(mode="before") + @classmethod + def check_gas_bsz(cls, data): + if data.get("gradient_accumulation_steps") and data.get("batch_size"): + raise ValueError( + "please set only one of gradient_accumulation_steps or batch_size" + ) + return data + + @model_validator(mode="before") + @classmethod + def hint_eval_train_mbsz(cls, data): + if ( + data.get("eval_batch_size") + and data.get("micro_batch_size") + and data.get("eval_batch_size") != data.get("micro_batch_size") + ): + LOG.warning( + "eval_batch_size != micro_batch_size. This can lead to VRAM instability." + ) + return data + + @model_validator(mode="before") + @classmethod + def check_warmup(cls, data): + if data.get("warmup_steps") and data.get("warmup_ratio"): + raise ValueError("warmup_steps and warmup_ratio are mutually exclusive") + return data + + @model_validator(mode="before") + @classmethod + def check_saves(cls, data): + if ( + data.get("save_strategy") + and data.get("save_steps") + and data.get("save_strategy") != "steps" + ): + raise ValueError( + "save_strategy and save_steps mismatch. Please set save_strategy to 'steps' or remove save_steps." + ) + if data.get("saves_per_epoch") and data.get("save_steps"): + raise ValueError( + "save_steps and saves_per_epoch are mutually exclusive and cannot be used together." + ) + return data + + @model_validator(mode="before") + @classmethod + def check_push_save(cls, data): + if data.get("hub_model_id") and ( + data.get("save_strategy") not in ["steps", "epoch", None] + ): + LOG.warning( + "hub_model_id is set without any models being saved. To save a model, set save_strategy." 
+ ) + return data + + @model_validator(mode="before") + @classmethod + def check_evals(cls, data): + if ( + data.get("eval_strategy") + and data.get("eval_steps") + and data.get("eval_strategy") != "steps" + ): + raise ValueError( + "eval_strategy and eval_steps mismatch. Please set eval_strategy to 'steps' or remove eval_steps." + ) + + if ( + data.get("val_set_size") == 0 + and (data.get("eval_steps") or data.get("eval_strategy")) + and not data.get("test_datasets") + and data.get("eval_strategy") != "no" + ): + raise ValueError( + "eval_steps and eval_strategy are not supported with val_set_size == 0" + ) + if data.get("evals_per_epoch") and data.get("eval_steps"): + raise ValueError( + "eval_steps and evals_per_epoch are mutually exclusive and cannot be used together." + ) + if ( + data.get("evals_per_epoch") + and data.get("eval_strategy") + and data.get("eval_strategy") != "steps" + ): + raise ValueError( + "eval_strategy must be empty or set to `steps` when used with evals_per_epoch." + ) + + if data.get("do_bench_eval") and not ( + data.get("evals_per_epoch") or data.get("eval_steps") + ): + raise ValueError( + "do_bench_eval requires evals_per_epoch or eval_steps to be set." + ) + return data + + @model_validator(mode="before") + @classmethod + def check_neftune(cls, data): + if data.get("noisy_embedding_alpha") and not data.get("neftune_noise_alpha"): + data["neftune_noise_alpha"] = data["noisy_embedding_alpha"] + del data["noisy_embedding_alpha"] + elif data.get("noisy_embedding_alpha") and data.get("neftune_noise_alpha"): + raise ValueError( + "noisy_embedding_alpha is deprecated, use neftune_noise_alpha; both are set, please remove the deprecated noisy_embedding_alpha setting" + ) + return data + + @model_validator(mode="after") + def check_fft_possible_bad_config(self): + if ( + # pylint: disable=too-many-boolean-expressions + not (self.bf16 or self.bfloat16) + and (self.fp16 or self.float16) + and not self.adapter + and not self.flash_attention + and self.sample_packing + ): + LOG.warning( + "Full fine tune w/o FA2 w/ sample packing and fp16/float16 is likely to raise errors. Try LoRA." + ) + # ValueError: Attempting to unscale FP16 gradients. + # OR + # RuntimeError: expected mat1 and mat2 to have the same dtype, but got: float != c10::Half + return self + + @model_validator(mode="before") + @classmethod + def check_fp8_config(cls, data): + if data.get("fp8") and not data.get("torch_compile"): + LOG.warning( + "torch_compile is strongly recommended for FP8 training in order to " + "see speed improvements. Please consider setting `torch_compile: " + "true` in your config." + ) + if data.get("fp8") and ( + data.get("fsdp_config", {}).get("activation_checkpointing", False) is True + or data.get("fsdp_config", {}).get("fsdp_activation_checkpointing", False) + is True + ): + LOG.warning( + "FP8 + FSDP2 + activation checkpointing may be slower than BF16 " + "training. Please consider setting `activation_checkpointing: false` " + "in your FSDP config." + ) + if ( + data.get("fp8_enable_fsdp_float8_all_gather") + and not data.get("fsdp_version", None) == 2 + ): + raise ValueError( + "fp8_enable_fsdp_float8_all_gather requires FSDP2 (fsdp_version: 2) " + "to be used." 
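A hedged sketch of a config fragment that satisfies the FP8 checks in `check_fp8_config` above; the keys mirror the validator logic, the values are illustrative:

fp8_cfg = {
    "fp8": True,
    "torch_compile": True,  # strongly recommended for FP8 speed improvements
    "fp8_enable_fsdp_float8_all_gather": True,  # requires fsdp_version: 2
    "fsdp_version": 2,
}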
+ ) + + return data + + @model_validator(mode="before") + @classmethod + def check_use_reentrant_mismatch(cls, data): + if ( + data.get("unfrozen_parameters") + and data.get("gradient_checkpointing_kwargs") + and data.get("gradient_checkpointing_kwargs", {}).get("use_reentrant") + is True + ): + # https://github.com/huggingface/transformers/issues/21381 + raise ValueError( + "`use_reentrant` must be false when used with partially frozen model." + ) + return data + + @model_validator(mode="before") + @classmethod + def check_eval_strategy(cls, data): + if ( + data.get("evaluation_strategy") is not None + and data.get("eval_strategy") is None + ): + LOG.info( + "explicitly setting `eval_strategy` from the `evaluation_strategy`" + ) + data["eval_strategy"] = data.get("evaluation_strategy") + return data + + @model_validator(mode="before") + @classmethod + def check_causal_lm_evals(cls, data): + if data.get("do_causal_lm_eval") and data.get("eval_sample_packing"): + raise ValueError( + "do_causal_lm_eval is enabled, eval_sample_packing must be set to False" + ) + + if data.get("eval_causal_lm_metrics"): + if not isinstance(data.get("eval_causal_lm_metrics"), list): + raise ValueError("eval_causal_lm_metrics must be a list") + # only ["sacrebleu", "comet", "ter", "chrf"] supported + if set(data.get("eval_causal_lm_metrics")) - SUPPORTED_METRICS: + raise ValueError( + f"eval_causal_lm_metrics must be one of {SUPPORTED_METRICS}" + ) + return data + + @model_validator(mode="before") + @classmethod + def check_tokenizer_use_mistral_common(cls, data): + if data.get("tokenizer_use_mistral_common") is None: + if any( + "magistral" in name.lower() + for name in [ + data.get("base_model", ""), + data.get("base_model_config", ""), + data.get("tokenizer_config", ""), + ] + ): + LOG.warning( + "tokenizer_use_mistral_common auto inferred to True for Magistral models. Please set it to True explicitly if you want to use mistral-common tokenizer." + ) + data["tokenizer_use_mistral_common"] = True + + return data + + @field_validator("tokenizer_use_mistral_common", mode="after") + @classmethod + def check_mistral_common_import(cls, tokenizer_use_mistral_common): + if tokenizer_use_mistral_common: + try: + import mistral_common # noqa: F401 # pylint:disable=unused-import + except ImportError as exception: + raise ImportError( + "mistral-common is required for mistral models. Please install it with `pip install axolotl` or `pip install -e .`." 
+ ) from exception + + return tokenizer_use_mistral_common + + @model_validator(mode="before") + @classmethod + def check_mistral_common_incompatible_options(cls, data): + if not data.get("tokenizer_use_mistral_common"): + return data + + # NOTE: mistral-common tokenizer is not compatible with editing tokenizer at the moment + + if data.get("added_tokens_overrides"): + raise ValueError( + "added_tokens_overrides is not supported with mistral-common tokenizer" + ) + + if data.get("special_tokens"): + raise ValueError( + "special_tokens override is not supported with mistral-common tokenizer" + ) + + if data.get("tokens"): + raise ValueError( + "tokens override is not supported with mistral-common tokenizer" + ) + + if data.get("chat_template"): + raise ValueError( + "Setting chat_template is not supported with mistral-common tokenizer" + ) + + return data + + @model_validator(mode="before") + @classmethod + def pretrain_with_tps(cls, data): + if data.get("pretraining_dataset") and data.get( + "include_tokens_per_second", False + ): + # combining these would raise `TypeError: cannot pickle 'dict_keys' object` + # due to trying to count the number of tokens total in the dataset + raise ValueError( + "pretraining_dataset and include_tokens_per_second cannot be used together." + ) + + return data + + +class LoRAValidationMixin: + """Validation methods related to LoRA/QLoRA configuration.""" + + @model_validator(mode="before") + @classmethod + def check_lr_groups(cls, data): + if data.get("lr_groups") and data.get("loraplus_lr_ratio"): + raise ValueError("lr_groups and loraplus_lr_ratio cannot be used together.") + return data + + @model_validator(mode="before") + @classmethod + def check_frozen(cls, data): + if ( + data.get("adapter") + and data.get("peft_layers_to_transform") + and data.get("unfrozen_parameters") + ): + raise ValueError( + "`unfrozen_parameters` used with `peft_layers_to_transform` can have unexpected behavior." 
+ ) + return data + + @model_validator(mode="before") + @classmethod + def check_peft_layers_pattern(cls, data): + if data.get("peft_layers_pattern") and not data.get("peft_layers_to_transform"): + raise ValueError( + "peft_layers_pattern requires peft_layers_to_transform to be set" + ) + return data + + @model_validator(mode="before") + @classmethod + def check_qlora_unsloth(cls, data): + if ( + data.get("unsloth_lora_mlp") + or data.get("unsloth_lora_qkv") + or data.get("unsloth_lora_o") + ): + if data.get("adapter") == "lora" and data.get("load_in_8bit"): + raise ValueError( + "unsloth_lora_mlp, unsloth_lora_qkv, and unsloth_lora_o are not compatible with 8-bit LoRA" + ) + return data + + @model_validator(mode="before") + @classmethod + def check_lora_axolotl_unsloth(cls, data): + is_lora_kernel = any( + data.get(k) for k in ["lora_mlp_kernel", "lora_qkv_kernel", "lora_o_kernel"] + ) + is_unsloth_lora = any( + data.get(k) + for k in ["unsloth_lora_mlp", "unsloth_lora_qkv", "unsloth_lora_o"] + ) + if is_lora_kernel and is_unsloth_lora: + raise ValueError( + "lora_mlp_kernel and unsloth_lora_mlp cannot both be true (similarly for lora_qkv_kernel, lora_o_kernel)" + ) + return data + + @model_validator(mode="after") + def check_fused_lora(self): + if self.adapter in ["lora", "qlora"] and self.flash_attn_fuse_mlp: + raise ValueError("Fused modules are not supported with LoRA/QLoRA") + return self + + @model_validator(mode="before") + @classmethod + def warn_qlora_zero3_w_use_reentrant(cls, data): + if ( + data.get("adapter") == "qlora" + and data.get("gradient_checkpointing_kwargs", {}) + and data.get("gradient_checkpointing_kwargs", {}).get("use_reentrant") + is False + and data.get("deepspeed", "") is not None + and "zero3" in data.get("deepspeed", "") + ): + # may result in: + # torch.utils.checkpoint.CheckpointError: torch.utils.checkpoint: + # Recomputed values for the following tensors have different metadata + # than during the forward pass. + LOG.warning( + "qlora + zero3 with use_reentrant: false may result in a CheckpointError about recomputed values" + ) + return data + + @model_validator(mode="before") + @classmethod + def check_lora_kernels_8bit(cls, data): + if ( + data.get("lora_mlp_kernel") + or data.get("lora_qkv_kernel") + or data.get("lora_o_kernel") + ): + if data.get("adapter") == "lora" and data.get("load_in_8bit"): + raise ValueError( + "lora_mlp_kernel, lora_qkv_kernel, and lora_o_kernel are not " + "compatible with 8-bit LoRA at the moment." + ) + return data + + @model_validator(mode="before") + @classmethod + def check_lora_kernels_dora(cls, data): + if ( + data.get("lora_mlp_kernel") + or data.get("lora_qkv_kernel") + or data.get("lora_o_kernel") + ) and data.get("peft_use_dora"): + raise ValueError( + "lora_mlp_kernel, lora_qkv_kernel, and lora_o_kernel are not " + "compatible with DoRA at the moment." + ) + return data + + @model_validator(mode="before") + @classmethod + def check_lora_kernels_rl(cls, data): + if ( + data.get("lora_mlp_kernel") + or data.get("lora_qkv_kernel") + or data.get("lora_o_kernel") + ) and data.get("rl"): + raise ValueError( + "lora_mlp_kernel, lora_qkv_kernel, and lora_o_kernel are not " + "compatible with RL at the moment." 
+ ) + return data + + +class RLValidationMixin: + """Validation methods related to RL training configuration.""" + + @model_validator(mode="before") + @classmethod + def check_sample_packing_w_rl(cls, data): + if data.get("sample_packing") and data.get("rl"): + raise ValueError("`sample_packing: true` does not work with RLHF training") + return data + + @model_validator(mode="before") + @classmethod + def check_kto_config(cls, data): + if data.get("rl") == "kto": + if data.get("sample_packing") or data.get("eval_sample_packing"): + raise ValueError("sample_packing is not supported with kto") + + if data.get("remove_unused_columns") is not False: + raise ValueError("Set `remove_unused_columns: False` when using kto") + return data + + @model_validator(mode="before") + @classmethod + def check_grpo_liger_sequence_parallel(cls, data): + if ( + data.get("rl") == "grpo" + and data.get("trl", {}) + and data.get("trl").get("use_liger_loss") + and data.get("context_parallel_size", 1) > 1 + ): + raise ValueError("GRPO + SP + Liger not currently supported") + return data + + @model_validator(mode="before") + @classmethod + def check_rl_config_gradient_checkpointing(cls, data): + # TODO: SalmanMohammadi + # Distributed RL with QLoRA + gradient checkpointing + # and use_reentrant = True is broken upstream in TRL + # pylint: disable=too-many-boolean-expressions + if ( + data.get("rl") + and data.get("gradient_checkpointing") + and data.get("gradient_checkpointing_kwargs") + and data.get("gradient_checkpointing_kwargs").get("use_reentrant") + and data.get("load_in_4bit") + and data.get("adapter") == "qlora" + and data.get("capabilities") + and data.get("capabilities").get("n_gpu", 1) > 1 + ): + raise ValueError( + "The `use_reentrant: True` implementation of gradient checkpointing " + "is not supported for distributed RL training with QLoRA. Please set " + "`use_reentrant: False` in `gradient_checkpointing_kwargs`." 
+ ) + return data + + +class OptimizationValidationMixin: + """Validation methods related to optimization and performance.""" + + @model_validator(mode="after") + def check_adamw_optimizer_params(self): + if any([self.adam_beta1, self.adam_beta2, self.adam_epsilon]) and ( + not self.optimizer or "adamw" not in str(self.optimizer).lower() + ): + LOG.warning("adamw hyperparameters found, but no adamw optimizer set") + return self + + @model_validator(mode="before") + @classmethod + def check_muon_deepspeed_fsdp(cls, data): + if data.get("optimizer") == "muon" and ( + data.get("deepspeed") or data.get("fsdp") or data.get("fsdp_config") + ): + raise ValueError( + "Muon optimizer is currently incompatible with DeepSpeed and FSDP" + ) + return data + + @model_validator(mode="before") + @classmethod + def check_batch_flattening_fa(cls, data): + if data.get("batch_flattening"): + batch_flattening_auto = data.get("batch_flattening") == "auto" + if not data.get("flash_attention") and not batch_flattening_auto: + raise ValueError("batch_flattening requires flash attention") + if data.get("sample_packing") and not batch_flattening_auto: + raise ValueError("batch_flattening not compatible with sample_packing") + if data.get("micro_batch_size") == 1 and not batch_flattening_auto: + LOG.warning("batch_flattening has no effect with micro_batch_size == 1") + + if ( + batch_flattening_auto + and data.get("flash_attention") + and not data.get("sample_packing") + and data.get("micro_batch_size") > 1 + ): + data["batch_flattening"] = True + elif batch_flattening_auto: + data["batch_flattening"] = False + + return data + + @model_validator(mode="before") + @classmethod + def check_torch_compile_deepspeed(cls, data): + if data.get("deepspeed") and data.get("torch_compile"): + raise ValueError( + "torch_compile should be set within your deepspeed config file" + ) + return data + + @model_validator(mode="before") + @classmethod + def check_xentropy_patch_conflicts(cls, data): + if data.get("flash_attn_cross_entropy") and data.get( + "unsloth_cross_entropy_loss" + ): + raise ValueError( + "flash_attn_cross_entropy and unsloth_cross_entropy_loss cannot both be enabled" + ) + return data + + @model_validator(mode="before") + @classmethod + def check_fsdp_version(cls, data): + fsdp_config = data.get("fsdp_config", {}) + if fsdp_config and str(data.get("fsdp_version")) != "2": + LOG.info( + "FSDP1 will be deprecated in an upcoming release of Axolotl. " + "We recommend that you use FSDP version 2 for better performance and compatibility. " + "Please see https://docs.axolotl.ai/docs/multi-gpu.html#sec-fsdp " + "for more details on migrating your config." + ) + return data + + @model_validator(mode="after") + def check_fsdp2_base_model_quant_ram_efficient_loading(self): + fsdp_config = self.fsdp_config if hasattr(self, "fsdp_config") else None + fsdp_version = self.fsdp_version if hasattr(self, "fsdp_version") else None + load_in_8bit = self.load_in_8bit if hasattr(self, "load_in_8bit") else None + load_in_4bit = self.load_in_4bit if hasattr(self, "load_in_4bit") else None + if fsdp_config and fsdp_version == 2: + if fsdp_config.get("cpu_ram_efficient_loading") and ( + load_in_8bit or load_in_4bit + ): + raise ValueError( + "FSDP2 does not support load_in_8bit or load_in_4bit with cpu_ram_efficient_loading. Please do one of the following: use DeepSpeed, " + "set fsdp_version to 1, or disable cpu_ram_efficient_loading." 
+ ) + return self + + @model_validator(mode="before") + @classmethod + def check_fsdp2_base_model_quant_rl(cls, data): + if data.get("fsdp_version") == 2 and data.get("rl") in [ + RLType.DPO, + RLType.KTO, + RLType.ORPO, + RLType.IPO, + ]: + if data.get("load_in_8bit") or data.get("load_in_4bit"): + raise ValueError( + f"FSDP2 does not support load_in_8bit or load_in_4bit with {data.get('rl')}. Please use DeepSpeed or set `fsdp_version` to 1." + ) + + return data + + @model_validator(mode="before") + @classmethod + def check_fsdp_version_in_fsdp_config(cls, data): + if data.get("fsdp_config"): + if data.get("fsdp_config", {}).get("fsdp_version"): + LOG.warning( + "Configuring `fsdp_version` in `fsdp_config` is deprecated. " + "Please configure `fsdp_version` as a top-level field." + ) + data["fsdp_version"] = data.get("fsdp_config").pop("fsdp_version") + return data + + @model_validator(mode="before") + @classmethod + def check_fsdp_config_kwargs_prefix(cls, data): + if fsdp_config := data.get("fsdp_config"): + should_fix = False + for key, _ in fsdp_config.items(): + if key.startswith("fsdp_"): + should_fix = True + LOG.warning_once( + "Configuring FSDP fields with the `fsdp_` prefix is deprecated. " + "Please omit the `fsdp_` prefix from the any fields in `fsdp_config`." + ) + if should_fix: + update_fsdp_config = {} + for key, value in fsdp_config.items(): + if key.startswith("fsdp_") and key != "fsdp_version": + update_fsdp_config[key.replace("fsdp_", "")] = value + else: + update_fsdp_config[key] = value + data["fsdp_config"] = update_fsdp_config + return data + + @model_validator(mode="after") + def check_fsdp_offload_w_8bit_optimizer(self): + if ( + hasattr(self, "fsdp_config") + and self.fsdp_config + and self.optimizer + and "8bit" in self.optimizer.value + and self.fsdp_config["offload_params"] + and str(self.fsdp_version) != "2" + ): + raise ValueError( + f"FSDP Offload not compatible with {str(self.optimizer.value)}" + ) + return self + + @model_validator(mode="after") + def check_fsdp2_w_8bit_optimizer(self): + if ( + hasattr(self, "fsdp_config") + and self.fsdp_config + and self.optimizer + and "8bit" in self.optimizer.value + and str(self.fsdp_version) == "2" + ): + if self.optimizer in ["adamw_8bit", "adamw_bnb_8bit"]: + # CUDA ops errors with bnb 8bit optimizer + FSDP2 + raise ValueError( + f"FSDP2 not compatible with {self.optimizer.value}, use `adamw_torch_8bit` instead" + ) + + return self + + @model_validator(mode="after") + def lr_groups_ao_optimizer(self): + if ( + self.loraplus_lr_ratio is not None + or self.embedding_lr_scale is not None + or self.embedding_lr is not None + or self.lr_groups is not None + ) and self.optimizer.value in ["adamw_torch_8bit", "adamw_torch_4bit"]: + # TODO(wing): remove this once ao>0.12.0 + # requires https://github.com/pytorch/ao/pull/2606 in an ao release + raise ValueError( + "lr groups (`loraplus_lr_ratio`, `embedding_lr_scale`, `embedding_lr`, `lr_groups`) are not " + "supported with ao low-bit optimizers until ao>0.12.0. " + "Please refer to https://github.com/pytorch/ao/pull/2606." 
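A hedged sketch of the key migration performed by `check_fsdp_config_kwargs_prefix` above; the specific keys and values are illustrative:

legacy_fsdp_config = {"fsdp_offload_params": True, "fsdp_state_dict_type": "FULL_STATE_DICT"}
migrated_fsdp_config = {"offload_params": True, "state_dict_type": "FULL_STATE_DICT"}
# `fsdp_version` itself is handled separately by `check_fsdp_version_in_fsdp_config`,
# which moves it out of `fsdp_config` into a top-level field.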
+ ) + return self + + @model_validator(mode="before") + @classmethod + def check_tensor_parallel_size_update_ds_json(cls, data): + tensor_parallel_size = data.get("tensor_parallel_size") + if tensor_parallel_size is not None and tensor_parallel_size > 1: + if data.get("deepspeed"): + with open(data.get("deepspeed"), "r", encoding="utf-8") as ds_fin: + ds_config = json.load(ds_fin) + should_save = False + if "tensor_parallel" not in ds_config: + ds_config["tensor_parallel"] = { + "autotp_size": tensor_parallel_size + } + should_save = True + if ( + "gather_16bit_weights_on_model_save" + not in ds_config["zero_optimization"] + ): + ds_config["zero_optimization"][ + "gather_16bit_weights_on_model_save" + ] = True + should_save = True + if should_save: + temp_dir = tempfile.mkdtemp() + with open( + Path(temp_dir) / "autotp_ds.json", "w", encoding="utf-8" + ) as ds_fout: + json.dump(ds_config, ds_fout, indent=4) + data["deepspeed"] = str(Path(temp_dir) / "autotp_ds.json") + + return data + + @model_validator(mode="before") + @classmethod + def check_deepcompile(cls, data): + deepcompile = data.get("deepcompile") + if deepcompile: + if not data.get("deepspeed"): + raise ValueError("DeepCompile is only supported with DeepSpeed") + with open(data.get("deepspeed"), "r", encoding="utf-8") as ds_fin: + ds_config = json.load(ds_fin) + if "compile" not in ds_config: + ds_config["compile"] = {"deepcompile": True} + temp_dir = tempfile.mkdtemp() + with open( + Path(temp_dir) / "deepcompile_ds.json", "w", encoding="utf-8" + ) as ds_fout: + json.dump(ds_config, ds_fout, indent=4) + data["deepspeed"] = str(Path(temp_dir) / "deepcompile_ds.json") + + return data + + +class SystemValidationMixin: + """Validation methods related to system and hardware configuration.""" + + @model_validator(mode="before") + @classmethod + def check_mem_mismatch(cls, data): + if ( + data.get("max_memory") is not None + and data.get("gpu_memory_limit") is not None + ): + raise ValueError( + "max_memory and gpu_memory_limit are mutually exclusive and cannot be used together." + ) + return data + + @model_validator(mode="before") + @classmethod + def check_fsdp_deepspeed(cls, data): + if data.get("deepspeed") and data.get("fsdp"): + raise ValueError("deepspeed and fsdp cannot be used together.") + return data + + @model_validator(mode="before") + @classmethod + def check_model_quantization_config_vs_bnb(cls, data): + if data.get("model_quantization_config"): + if data.get("load_in_8bit") or data.get("load_in_4bit"): + raise ValueError( + "model_quantization_config and load_in_8bit or load_in_4bit cannot be used together." + ) + return data + + @model_validator(mode="before") + @classmethod + def check_npu_config(cls, data): + if is_torch_npu_available(): + # check attention config + attn_list = ["flash_attention", "sdp_attention", "s2_attention"] + for attn in attn_list: + if data.get(attn): + raise NotImplementedError( + f"{attn} is currently not supported in Ascend npu, please disable this configuration." + ) + + # check quant config + if data.get("optimizer") is not None and "bit" in data.get("optimizer"): + optimizer = data.get("optimizer") + raise NotImplementedError( + f"{optimizer} is currently not supported in Ascend npu, please choose another one." + ) + + quant_list = ["load_in_8bit", "load_in_4bit"] + for quant in quant_list: + if data.get(quant): + raise NotImplementedError( + f"Quantization is currently not supported in Ascend npu, please disable {quant}."
+ ) + + # check dtype config + if data.get("tf32"): + raise NotImplementedError( + "tf32 dtype is currently not supported in Ascend npu, please disable this configuration" + ) + + return data + + +class ChatTemplateValidationMixin: + """Validation methods related to chat template configuration.""" + + @model_validator(mode="before") + @classmethod + def check_chat_template_config(cls, data): + # if chat_template is set to jinja, chat_template_jinja is required + if data.get("chat_template") == ChatTemplate.jinja and not data.get( + "chat_template_jinja" + ): + raise ValueError( + "chat_template_jinja is required when chat_template is set to jinja" + ) + + # If chat_template_jinja is set, set chat_template to jinja + if data.get("chat_template_jinja") and not data.get("chat_template"): + data["chat_template"] = ChatTemplate.jinja + + return data + + +class PretrainingValidationMixin: + """Validation methods related to pretraining configuration.""" + + @model_validator(mode="before") + @classmethod + def check_pretraining_w_max_steps(cls, data): + if data.get("pretraining_dataset") and not data.get("max_steps"): + raise ValueError( + "max_steps must be set when using iterable pretraining_dataset, Trainer can't infer length and schedule optimizer/learning rate without it!" + ) + return data + + @model_validator(mode="before") + @classmethod + def check_pretraining_w_group_by_length(cls, data): + if data.get("pretraining_dataset") and data.get("group_by_length"): + LOG.warning( + "You probably want to disable group_by_length as it will force a streamed dataset to download completely." + ) + return data + + @model_validator(mode="before") + @classmethod + def check_pretraining_split_batches_accelerate(cls, data): + # alternatively set ACCELERATE_SPLIT_BATCHES=False + if data.get("pretraining_dataset"): + accelerator_config = data.get("accelerator_config", {}) + if not accelerator_config: + data["accelerator_config"] = { + "split_batches": False, + "dispatch_batches": False, + } + else: + if accelerator_config.get("split_batches") is None: + data["accelerator_config"]["split_batches"] = False + if accelerator_config.get("dispatch_batches") is None: + data["accelerator_config"]["dispatch_batches"] = False + return data + + +class ModelCompatibilityValidationMixin: + """Validation methods for specific model compatibility.""" + + @model_validator(mode="after") + def check_falcon_fsdp(self): + if (self.base_model and "falcon" in self.base_model.lower()) and self.fsdp: + raise ValueError("FSDP is not supported for falcon models") + return self + + @model_validator(mode="after") + def check_mpt_checkpointing(self): + if ( + self.base_model and "mpt" in self.base_model.lower() + ) and self.gradient_checkpointing: + raise ValueError("gradient_checkpointing is not supported for MPT models") + return self + + @model_validator(mode="after") + def check_gradient_checkpointing_w_offload(self): + if self.gradient_checkpointing == "offload": + LOG.warning( + "`offload` is deprecated for gradient_checkpointing, use `activation_offloading: true` or `activation_offloading: legacy`" + ) + self.gradient_checkpointing = True + LOG.warning( + "`offload` now uses a new stream implementation; to use the previous implementation, use `activation_offloading: legacy`" + ) + self.activation_offloading = True + if self.gradient_checkpointing == "offload_disk": + LOG.warning( + "`offload_disk` is deprecated for gradient_checkpointing, use `activation_offloading: disk`" + ) + self.gradient_checkpointing = True + 
self.activation_offloading = "disk" + return self + + @model_validator(mode="after") + def check_activation_offloading_wo_gc(self): + if self.activation_offloading and not self.gradient_checkpointing: + raise ValueError("activation_offloading requires gradient_checkpointing") + return self + + @model_validator(mode="after") + def check_better_transformers(self): + if self.flash_optimum is True: + if self.adapter: + LOG.warning( + "BetterTransformers probably doesn't work with PEFT adapters" + ) + if self.fp16 or self.bf16: + raise ValueError("AMP is not supported with BetterTransformer") + if self.float16 is not True and self.bfloat16 is not True: + LOG.warning( + "You should probably set bfloat16 or float16 to true to " + "load the model in float16 for BetterTransformers" + ) + return self + + @model_validator(mode="before") + @classmethod + def check_gptq_w_revision(cls, data): + if data.get("gptq") and data.get("revision_of_model"): + raise ValueError( + "revision_of_model is not supported for GPTQ models. " + + "Please download the model from HuggingFace Hub manually for correct branch, " + + "point to its path, and remove revision_of_model from the config." + ) + return data + + @model_validator(mode="before") + @classmethod + def check_gpt_oss_fsdp_loading(cls, data): + if data.get("model_quantization_config", "") == "Mxfp4Config": + if ( + data.get("fsdp_config", {}).get("cpu_ram_efficient_loading", False) + is True + ): + raise ValueError( + "FSDP cpu_ram_efficient_loading is not supported for Mxfp4Config model quantization." + ) + return data + + +class ComplexValidationMixin: + """Complex validation methods that involve multiple systems.""" + + @field_validator("neftune_noise_alpha") + @classmethod + def validate_neftune_noise_alpha(cls, neftune_noise_alpha): + if neftune_noise_alpha is not None and neftune_noise_alpha <= 0.0: + raise ValueError("neftune_noise_alpha must be > 0.0") + return neftune_noise_alpha + + @model_validator(mode="after") + def check_rl_beta(self): + if self.dpo_beta and not self.rl_beta: + self.rl_beta = self.dpo_beta + del self.dpo_beta + return self + + @model_validator(mode="after") + def check_simpo_warmup(self): + if self.rl is RLType.SIMPO and self.warmup_ratio: + raise ValueError( + "warmup_ratio is not supported with the simpo trainer. Please use `warmup_steps` instead" + ) + return self + + @model_validator(mode="after") + def check_relora(self): + if self.relora: + if not self.jagged_restart_steps: + raise ValueError("jagged_restart_steps must be set to use ReLoRA") + if self.adapter not in ("lora", "qlora"): + raise ValueError("cfg.adapter must be lora or qlora to use ReLoRA") + + if self.fsdp or self.fsdp_config: + raise ValueError("fsdp not supported with ReLoRA") + + if self.deepspeed: + raise ValueError("deepspeed not supported with ReLoRA") + + if self.lr_scheduler == "one_cycle": + raise ValueError( + "ReLoRA is not compatible with the one_cycle scheduler" + ) + + if self.flash_attn_fuse_mlp: + raise ValueError("Fused modules are not supported with ReLoRA") + return self + + @model_validator(mode="after") + def check_early_stopping(self): + if self.early_stopping_patience: + if not self.save_steps or not self.eval_steps: + raise ValueError( + "`early_stopping_patience` requires save_steps and eval_steps to be set. eval_steps should evenly divide save_steps." + ) + if self.save_steps % self.eval_steps != 0: + raise ValueError( + "`early_stopping_patience` requires that eval_steps should evenly divide save_steps." 
+ ) + return self + + @model_validator(mode="after") + def check_tensor_parallel_size(self): + if not self.tensor_parallel_size: + self.tensor_parallel_size = 1 + return self + + @model_validator(mode="after") + def check_context_parallel_size(self): + if self.sequence_parallel_degree and not self.context_parallel_size: + LOG.warning( + "`sequence_parallel_degree` is deprecated, use `context_parallel_size`" + ) + self.context_parallel_size = self.sequence_parallel_degree + if not self.context_parallel_size: + self.context_parallel_size = 1 + elif self.context_parallel_size > 1: + if not self.flash_attention: + raise ValueError( + "flash_attention: true must be set with context_parallel_size > 1" + ) + + if self.sample_packing and self.micro_batch_size > 1: + raise ValueError( + "micro_batch_size must be set to 1 when sample_packing is enabled " + "due to a `ring-flash-attn` requirement" + ) + + try: + import transformers.modeling_flash_attention_utils + + # pylint: disable=protected-access + transformers.modeling_flash_attention_utils._flash_supports_window_size = ( + transformers.modeling_flash_attention_utils._flash_supports_window + ) + import ring_flash_attn # noqa: F401 # pylint:disable=unused-import + except ImportError as exception: + raise ImportError( + "context_parallel_size > 1 but ring_flash_attn is not installed. " + "Please install it with `pip install axolotl[ring-flash-attn]` " + "or `pip install ring-flash-attn>=0.1.4`." + ) from exception + + LOG.warning( + "Sequence parallelism (SP) is enabled with " + f"context_parallel_size={self.context_parallel_size}. " + "Please note that logged losses may differ slightly from the non-SP " + "losses due to transformers Trainer implementation details. " + "Please see https://github.com/axolotl-ai-cloud/axolotl/pull/2495#issuecomment-2784022042 " + "for more details."
+ ) + + return self + + @model_validator(mode="after") + def validate_ring_attn_func(self): + if getattr(self, "context_parallel_size", 1) == 1: + return self + + if self.ring_attn_func is not None: + self.ring_attn_func = RingAttnFunc(self.ring_attn_func) + else: + # Default ring attention function selection + sample_packing = getattr(self, "sample_packing", False) + self.ring_attn_func = ( + RingAttnFunc.VARLEN_LLAMA3 + if sample_packing + else RingAttnFunc.BATCH_RING + ) + + return self + + +class DistributedValidationMixin: + """validation for distributed training.""" + + @model_validator(mode="after") + def check_tensor_parallel_optimizer(self): + if self.tensor_parallel_size > 1: + if self.optimizer in ["paged_adamw_8bit", "adamw_8bit", "adamw_bnb_8bit"]: + raise ValueError( + "tensor_parallel_size is not supported with paged_adamw_8bit, adamw_8bit, and adamw_bnb_8bit optimizers" + ) + + return self + + +class GRPOVllmValidationMixin: + """Validation mixin for vllm when using GRPO.""" + + @model_validator(mode="after") + def check_vllm_mode_set(self): + if self.trl and self.trl.use_vllm and not self.trl.vllm_mode: + LOG.warning( + "vllm_mode must be set to either `server` or `colocate` when using vllm, using default value `server`" + ) + self.trl.vllm_mode = "server" + return self + + +# pylint: disable=too-many-ancestors +class ValidationMixin( + DatasetValidationMixin, + AttentionValidationMixin, + TrainingValidationMixin, + LoRAValidationMixin, + RLValidationMixin, + OptimizationValidationMixin, + SystemValidationMixin, + ChatTemplateValidationMixin, + PretrainingValidationMixin, + ModelCompatibilityValidationMixin, + ComplexValidationMixin, + GRPOVllmValidationMixin, +): + """Full validation mixin for Axolotl configuration.""" diff --git a/src/axolotl/utils/schemas/vllm.py b/src/axolotl/utils/schemas/vllm.py index 48441de5e..518b8f62d 100644 --- a/src/axolotl/utils/schemas/vllm.py +++ b/src/axolotl/utils/schemas/vllm.py @@ -18,6 +18,10 @@ class VllmConfig(BaseModel): default=None, json_schema_extra={"description": "Tensor parallel size for VLLM"}, ) + data_parallel_size: int | None = Field( + default=None, + json_schema_extra={"description": "Data parallel size for VLLM"}, + ) gpu_memory_utilization: float | None = Field( default=0.9, json_schema_extra={"description": "GPU memory utilization for VLLM"}, @@ -44,3 +48,12 @@ class VllmConfig(BaseModel): default=8000, json_schema_extra={"description": "Port of the vLLM server to start on"}, ) + + enable_reasoning: bool | None = Field( + default=None, + json_schema_extra={"description": "Enable reasoning for VLLM"}, + ) + reasoning_parser: str | None = Field( + default=None, + json_schema_extra={"description": "Reasoning parser for VLLM"}, + ) diff --git a/src/axolotl/utils/tokenization.py b/src/axolotl/utils/tokenization.py index e0b21a9f0..3526bd5b5 100644 --- a/src/axolotl/utils/tokenization.py +++ b/src/axolotl/utils/tokenization.py @@ -1,10 +1,10 @@ """Module for tokenization utilities""" -import logging - from termcolor import colored -LOG = logging.getLogger("axolotl") +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) def check_dataset_labels( diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 556eee09f..2d1cb14c4 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -11,18 +11,16 @@ from typing import List, Optional import numpy as np import torch import torch.cuda -from accelerate.logging import get_logger from datasets import IterableDataset, 
disable_caching, enable_caching from torch.utils.data import DataLoader, RandomSampler, SequentialSampler from transformers.utils import is_torch_bf16_gpu_available -from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder -from axolotl.monkeypatch.trainer_eval_guard import patch_evaluation_loop_for_fsdp2 -from axolotl.utils.distributed import reduce_and_broadcast +from axolotl.utils.distributed import init_distributed_state, reduce_and_broadcast from axolotl.utils.environment import check_cuda_p2p_ib_support +from axolotl.utils.logging import get_logger from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths -LOG = get_logger("axolotl") +LOG = get_logger(__name__) @torch.jit.script @@ -508,6 +506,7 @@ def process_pretraining_datasets_for_packing( if not skip_position_ids: train_dataset = train_dataset.map( add_position_ids, + batched=True, desc="Add position_id column (Pretraining Sample Packing)", ) if drop_attention_mask: @@ -528,7 +527,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True): .apply(len) .values ) - LOG.debug(f"total_num_tokens: {total_num_tokens:_}", main_process_only=True) + LOG.debug(f"total_num_tokens: {total_num_tokens:_}") if update: cfg.total_num_tokens = total_num_tokens @@ -546,10 +545,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True): .apply(lambda x: np.sum(np.array(x) != -100)) .sum() ) - LOG.debug( - f"`total_supervised_tokens: {total_supervised_tokens:_}`", - main_process_only=True, - ) + LOG.debug(f"`total_supervised_tokens: {total_supervised_tokens:_}`") if update: cfg.total_supervised_tokens = total_supervised_tokens @@ -571,11 +567,11 @@ def calculate_total_num_steps(cfg, train_dataset, update=True): - 1 ) * cfg.num_epochs - * cfg.sequence_parallel_degree + * cfg.context_parallel_size + * cfg.tensor_parallel_size ) LOG.debug( - f"total_num_tokens: {cfg.total_num_tokens:_}, total_num_steps: {total_num_steps:_}", - main_process_only=True, + f"total_num_tokens: {cfg.total_num_tokens:_}, total_num_steps: {total_num_steps:_}" ) else: if cfg.flash_attention and not cfg.multipack_real_batches: @@ -597,6 +593,8 @@ def calculate_total_num_steps(cfg, train_dataset, update=True): bin_size=cfg.sample_packing_bin_size, sequential=cfg.sample_packing_sequentially, drop_last=True, + num_processes=cfg.dataset_processes, + mp_start_method=cfg.sample_packing_mp_start_method or "fork", ) data_loader = DataLoader( @@ -604,14 +602,20 @@ def calculate_total_num_steps(cfg, train_dataset, update=True): batch_sampler=sampler, ) data_loader_len = len(data_loader) * cfg.micro_batch_size // cfg.batch_size - LOG.debug(f"data_loader_len: {data_loader_len}", main_process_only=True) + LOG.debug(f"data_loader_len: {data_loader_len}") # FIXME: is there a bug here somewhere? 
the total num steps depends # on the agreed on value for sample_packing_eff_est total_num_steps = int( math.floor( - data_loader_len * cfg.num_epochs * cfg.sequence_parallel_degree + data_loader_len + * cfg.num_epochs + * cfg.context_parallel_size + * cfg.tensor_parallel_size ) ) + if cfg.dataloader_drop_last: + # drop the last batch for each epoch + total_num_steps -= int(math.ceil(cfg.num_epochs)) def calc_sample_packing_eff_est(estimates: List[float]): LOG.info(f"sample_packing_eff_est across ranks: {repr(estimates)}") @@ -626,20 +630,18 @@ def calculate_total_num_steps(cfg, train_dataset, update=True): ) if update: cfg.sample_packing_eff_est = sample_packing_eff_est - LOG.debug( - f"sample_packing_eff_est: {cfg.sample_packing_eff_est}", - main_process_only=True, - ) + LOG.debug(f"sample_packing_eff_est: {cfg.sample_packing_eff_est}") else: total_num_steps = int( math.ceil( len(train_dataset) * cfg.num_epochs - * cfg.sequence_parallel_degree + * cfg.context_parallel_size + * cfg.tensor_parallel_size / cfg.batch_size ) ) - LOG.debug(f"total_num_steps: {total_num_steps}", main_process_only=True) + LOG.debug(f"total_num_steps: {total_num_steps}") return total_num_steps @@ -663,47 +665,89 @@ def setup_deepspeed_env(cfg, stage=None): os.environ["ACCELERATE_USE_DEEPSPEED"] = "true" os.environ["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = cfg.deepspeed + os.environ["ACCELERATE_GRADIENT_ACCUMULATION_STEPS"] = str( + cfg.gradient_accumulation_steps + ) if stage: os.environ["ACCELERATE_DEEPSPEED_ZERO_STAGE"] = str(stage) if stage == 3: os.environ["ACCELERATE_DEEPSPEED_ZERO3_INIT"] = "true" + + # NOTE(djsaunde): The distribued state cannot be initialized prior to the + # ACCELERATE_USE_DEEPSPEED assignment, but it must be initialized some time prior + # to model load. 
+ if ( + int(os.environ.get("WORLD_SIZE", "1")) == 1 + and os.environ.get("AXOLOTL_IS_PREPROCESS", "0") != "1" + ): + os.environ["WORLD_SIZE"] = "1" # force it in case not set + os.environ["LOCAL_RANK"] = "0" # force it in case not set + os.environ["RANK"] = os.environ.get("LOCAL_RANK", "0") + import deepspeed.comm as dist + + dist.init_distributed( + dist_backend="nccl", auto_mpi_discovery=False, dist_init_required=True + ) + init_distributed_state() + # If we don't assign this, it doesn't actually get set in the accelerate weakref _ = HfTrainerDeepSpeedConfig(cfg.deepspeed) def setup_fsdp_envs(cfg): os.environ["ACCELERATE_USE_FSDP"] = "true" - if str(cfg.fsdp_config.fsdp_version) == "2": + + # TODO @SalmanMohammadi remove FSDP1 args in 0.12 + if str(cfg.fsdp_version) == "2": os.environ["FSDP_VERSION"] = "2" - if cfg.fsdp_config.fsdp_activation_checkpointing: + if cfg.fsdp_config.activation_checkpointing: os.environ["FSDP_ACTIVATION_CHECKPOINTING"] = "true" - if cfg.fsdp_config.fsdp_offload_params: + if cfg.fsdp_config.offload_params: os.environ["FSDP_OFFLOAD_PARAMS"] = "true" - if cfg.fsdp_config.fsdp_sync_module_states: + if cfg.fsdp_config.sync_module_states: os.environ["FSDP_SYNC_MODULE_STATES"] = "true" - if cfg.fsdp_config.fsdp_cpu_ram_efficient_loading: + if cfg.fsdp_config.cpu_ram_efficient_loading: os.environ["FSDP_CPU_RAM_EFFICIENT_LOADING"] = "true" - if cfg.fsdp_config.fsdp_use_orig_params: + if cfg.fsdp_config.use_orig_params: os.environ["FSDP_USE_ORIG_PARAMS"] = "true" - if cfg.fsdp_config.fsdp_state_dict_type: - os.environ["FSDP_STATE_DICT_TYPE"] = cfg.fsdp_config.fsdp_state_dict_type - if cfg.fsdp_config.fsdp_auto_wrap_policy: - os.environ["FSDP_AUTO_WRAP_POLICY"] = cfg.fsdp_config.fsdp_auto_wrap_policy - if cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap: + if cfg.fsdp_config.state_dict_type: + os.environ["FSDP_STATE_DICT_TYPE"] = cfg.fsdp_config.state_dict_type + if cfg.fsdp_config.auto_wrap_policy: + os.environ["FSDP_AUTO_WRAP_POLICY"] = cfg.fsdp_config.auto_wrap_policy + if cfg.fsdp_config.transformer_layer_cls_to_wrap: os.environ["FSDP_TRANSFORMER_CLS_TO_WRAP"] = ( - cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap - ) - if cfg.fsdp_config.fsdp_reshard_after_forward is not None: - os.environ["FSDP_RESHARD_AFTER_FORWARD"] = ( - "true" if cfg.fsdp_config.fsdp_reshard_after_forward else "false" + cfg.fsdp_config.transformer_layer_cls_to_wrap ) + if cfg.fsdp_config.reshard_after_forward: + os.environ["FSDP_RESHARD_AFTER_FORWARD"] = "true" + + +def setup_parallelism_envs(cfg): + set_accelerate_parallelism_config = False + if cfg.tensor_parallel_size and cfg.tensor_parallel_size > 1: + set_accelerate_parallelism_config = True + os.environ["PARALLELISM_CONFIG_TP_SIZE"] = str(cfg.tensor_parallel_size) + if cfg.dp_shard_size and cfg.dp_shard_size > 1: + set_accelerate_parallelism_config = True + os.environ["PARALLELISM_CONFIG_DP_SHARD_SIZE"] = str(cfg.dp_shard_size) + if cfg.dp_replicate_size and cfg.dp_replicate_size > 1: + set_accelerate_parallelism_config = True + os.environ["PARALLELISM_CONFIG_DP_REPLICATE_SIZE"] = str(cfg.dp_replicate_size) + if cfg.context_parallel_size and cfg.context_parallel_size > 1: + set_accelerate_parallelism_config = True + os.environ["PARALLELISM_CONFIG_CP_SIZE"] = str(cfg.context_parallel_size) + os.environ["ACCELERATE_ALLOW_CP_STANDALONE"] = "true" + if set_accelerate_parallelism_config: + os.environ["ACCELERATE_USE_PARALLELISM_CONFIG"] = "true" def prepare_optim_env(cfg): if not check_cuda_p2p_ib_support(): if 
os.getenv("NCCL_P2P_DISABLE") is None: os.environ["NCCL_P2P_DISABLE"] = "1" - if cfg.fsdp: + # TODO @SalmanMohammadi remove the cfg.fsdp check in 0.12 + if cfg.fsdp or cfg.fsdp_config: + cfg.fsdp = True if not cfg.fsdp else cfg.fsdp setup_fsdp_envs(cfg) elif cfg.deepspeed: stage = None @@ -715,6 +759,7 @@ def prepare_optim_env(cfg): stage = deepspeed_config.get("zero_optimization", {}).get("stage", None) setup_deepspeed_env(cfg, stage=stage) + setup_parallelism_envs(cfg) setup_torch_compile_env(cfg) if cfg.fp8: @@ -731,6 +776,9 @@ def prepare_opinionated_env(cfg): if cfg.qlora_sharded_model_loading: # model loading is forked after the tokenizer os.environ["TOKENIZERS_PARALLELISM"] = "false" + if cfg.sample_packing: + # multipack parallel packing sampler defaults to using fork + os.environ["TOKENIZERS_PARALLELISM"] = "false" def setup_trainer( @@ -762,12 +810,8 @@ def setup_trainer( A trainer instance (either `HFRLTrainer` or `HFCausalTrainer`) configured based on the provided parameters. """ - if ( - cfg.torch_compile - and cfg.fsdp_config - and str(cfg.fsdp_config.fsdp_version) == "2" - ): - patch_evaluation_loop_for_fsdp2() + from axolotl.core.builders import HFCausalTrainerBuilder, HFRLTrainerBuilder + if cfg.rl: trainer_builder = HFRLTrainerBuilder(cfg, model, tokenizer, processor) trainer_builder.model_ref = model_ref diff --git a/src/axolotl/utils/wandb_.py b/src/axolotl/utils/wandb_.py index 327dd9b63..6484d435a 100644 --- a/src/axolotl/utils/wandb_.py +++ b/src/axolotl/utils/wandb_.py @@ -16,6 +16,3 @@ def setup_wandb_env_vars(cfg: DictDefault): # Enable wandb if project name is present if cfg.wandb_project and len(cfg.wandb_project) > 0: cfg.use_wandb = True - os.environ.pop("WANDB_DISABLED", None) # Remove if present - else: - os.environ["WANDB_DISABLED"] = "true" diff --git a/tests/cli/test_cli_base.py b/tests/cli/test_cli_base.py index 6dbae045f..e28bbb75c 100644 --- a/tests/cli/test_cli_base.py +++ b/tests/cli/test_cli_base.py @@ -17,16 +17,23 @@ class BaseCliTest: command: Command to test (train/evaluate) """ # Test missing config file - result = cli_runner.invoke(cli, [command, "--no-accelerate"]) + result = cli_runner.invoke(cli, [command, "--launcher", "python"]) assert result.exit_code != 0 # Test non-existent config file - result = cli_runner.invoke(cli, [command, "nonexistent.yml", "--no-accelerate"]) + result = cli_runner.invoke( + cli, [command, "nonexistent.yml", "--launcher", "python"] + ) assert result.exit_code != 0 assert "Error: Invalid value for 'CONFIG'" in result.output def _test_basic_execution( - self, cli_runner, tmp_path: Path, valid_test_config: str, command: str + self, + cli_runner, + tmp_path: Path, + valid_test_config: str, + command: str, + train: bool = True, ): """Test basic execution with accelerate. 
@@ -35,24 +42,37 @@ class BaseCliTest: tmp_path: Temporary path fixture valid_test_config: Valid config fixture command: Command to test (train/evaluate) + train: Whether to test training (default) or evaluation """ config_path = tmp_path / "config.yml" config_path.write_text(valid_test_config) - with patch("subprocess.run") as mock: + mock_fn = "os.execvpe" if command == "train" else "subprocess.run" + + with patch(mock_fn) as mock: result = cli_runner.invoke(cli, [command, str(config_path)]) assert mock.called - assert mock.call_args.args[0] == [ + + expected = [ "accelerate", "launch", "-m", f"axolotl.cli.{command}", str(config_path), - "--debug-num-examples", - "0", + "--debug=False", + "--debug-text-only=False", + "--debug-num-examples=0", ] - assert mock.call_args.kwargs == {"check": True} + if train: + expected.append("--shard=False") + + if command == "train": + assert mock.call_args.args[0] == "accelerate" + assert mock.call_args.args[1] == expected + else: + assert mock.call_args.args[0] == expected + assert mock.call_args.kwargs == {"check": True} assert result.exit_code == 0 def _test_cli_overrides(self, tmp_path: Path, valid_test_config: str): diff --git a/tests/cli/test_cli_evaluate.py b/tests/cli/test_cli_evaluate.py index d8eb41467..a191bf957 100644 --- a/tests/cli/test_cli_evaluate.py +++ b/tests/cli/test_cli_evaluate.py @@ -1,5 +1,7 @@ """Tests for evaluate CLI command.""" +# pylint: disable=duplicate-code + from unittest.mock import patch from axolotl.cli.main import cli @@ -18,7 +20,9 @@ class TestEvaluateCommand(BaseCliTest): def test_evaluate_basic_execution(self, cli_runner, tmp_path, valid_test_config): """Test basic successful execution""" - self._test_basic_execution(cli_runner, tmp_path, valid_test_config, "evaluate") + self._test_basic_execution( + cli_runner, tmp_path, valid_test_config, "evaluate", train=False + ) def test_evaluate_basic_execution_no_accelerate( self, cli_runner, tmp_path, valid_test_config @@ -27,13 +31,15 @@ class TestEvaluateCommand(BaseCliTest): config_path = tmp_path / "config.yml" config_path.write_text(valid_test_config) + # pylint: disable=duplicate-code with patch("axolotl.cli.evaluate.do_evaluate") as mock_evaluate: result = cli_runner.invoke( cli, [ "evaluate", str(config_path), - "--no-accelerate", + "--launcher", + "python", ], catch_exceptions=False, ) @@ -55,7 +61,8 @@ class TestEvaluateCommand(BaseCliTest): "2", "--sequence-len", "128", - "--no-accelerate", + "--launcher", + "python", ], catch_exceptions=False, ) @@ -65,3 +72,104 @@ class TestEvaluateCommand(BaseCliTest): cfg = mock_evaluate.call_args[0][0] assert cfg.micro_batch_size == 2 assert cfg.sequence_len == 128 + + def test_evaluate_with_launcher_args_torchrun( + self, cli_runner, tmp_path, valid_test_config + ): + """Test evaluate with torchrun launcher arguments""" + config_path = tmp_path / "config.yml" + config_path.write_text(valid_test_config) + + with patch("subprocess.run") as mock_subprocess: + result = cli_runner.invoke( + cli, + [ + "evaluate", + str(config_path), + "--launcher", + "torchrun", + "--", + "--nproc_per_node=2", + "--nnodes=1", + ], + catch_exceptions=False, + ) + + assert result.exit_code == 0 + mock_subprocess.assert_called_once() + + # Verify launcher args are passed to torchrun + called_cmd = mock_subprocess.call_args.args[0] + assert called_cmd[0] == "torchrun" + assert "--nproc_per_node=2" in called_cmd + assert "--nnodes=1" in called_cmd + assert "-m" in called_cmd + assert "axolotl.cli.evaluate" in called_cmd + + def 
test_evaluate_with_launcher_args_accelerate( + self, cli_runner, tmp_path, valid_test_config + ): + """Test evaluate with accelerate launcher arguments""" + config_path = tmp_path / "config.yml" + config_path.write_text(valid_test_config) + + with patch("subprocess.run") as mock_subprocess: + result = cli_runner.invoke( + cli, + [ + "evaluate", + str(config_path), + "--launcher", + "accelerate", + "--", + "--config_file=accelerate_config.yml", + "--num_processes=4", + ], + catch_exceptions=False, + ) + + assert result.exit_code == 0 + mock_subprocess.assert_called_once() + + # Verify launcher args are passed to accelerate + called_cmd = mock_subprocess.call_args.args[0] + assert called_cmd[0] == "accelerate" + assert called_cmd[1] == "launch" + assert "--config_file=accelerate_config.yml" in called_cmd + assert "--num_processes=4" in called_cmd + assert "-m" in called_cmd + assert "axolotl.cli.evaluate" in called_cmd + + def test_evaluate_backward_compatibility_no_launcher_args( + self, cli_runner, tmp_path, valid_test_config + ): + """Test that existing evaluate commands work without launcher args""" + config_path = tmp_path / "config.yml" + config_path.write_text(valid_test_config) + + with patch("subprocess.run") as mock_subprocess: + result = cli_runner.invoke( + cli, + [ + "evaluate", + str(config_path), + "--launcher", + "accelerate", + "--micro-batch-size", + "2", + ], + catch_exceptions=False, + ) + + assert result.exit_code == 0 + mock_subprocess.assert_called_once() + + # Verify no launcher args contamination + called_cmd = mock_subprocess.call_args.args[0] + assert called_cmd[0] == "accelerate" + assert called_cmd[1] == "launch" + # Should not contain any extra launcher args + launcher_section = called_cmd[2 : called_cmd.index("-m")] + assert ( + len(launcher_section) == 0 + ) # No launcher args between 'launch' and '-m' diff --git a/tests/cli/test_cli_inference.py b/tests/cli/test_cli_inference.py index b8effa3d2..3394c189d 100644 --- a/tests/cli/test_cli_inference.py +++ b/tests/cli/test_cli_inference.py @@ -1,5 +1,7 @@ """pytest tests for axolotl CLI inference command.""" +# pylint: disable=duplicate-code + from unittest.mock import patch from axolotl.cli.main import cli @@ -10,7 +12,7 @@ def test_inference_basic(cli_runner, config_path): with patch("axolotl.cli.inference.do_inference") as mock: result = cli_runner.invoke( cli, - ["inference", str(config_path), "--no-accelerate"], + ["inference", str(config_path), "--launcher", "python"], catch_exceptions=False, ) @@ -23,9 +25,124 @@ def test_inference_gradio(cli_runner, config_path): with patch("axolotl.cli.inference.do_inference_gradio") as mock: result = cli_runner.invoke( cli, - ["inference", str(config_path), "--no-accelerate", "--gradio"], + ["inference", str(config_path), "--launcher", "python", "--gradio"], catch_exceptions=False, ) assert mock.called assert result.exit_code == 0 + + +def test_inference_with_launcher_args_torchrun(cli_runner, config_path): + """Test inference with torchrun launcher arguments""" + with patch("subprocess.run") as mock_subprocess: + result = cli_runner.invoke( + cli, + [ + "inference", + str(config_path), + "--launcher", + "torchrun", + "--", + "--nproc_per_node=2", + "--nnodes=1", + ], + catch_exceptions=False, + ) + + assert result.exit_code == 0 + mock_subprocess.assert_called_once() + + # Verify launcher args are passed to torchrun + called_cmd = mock_subprocess.call_args.args[0] + assert called_cmd[0] == "torchrun" + assert "--nproc_per_node=2" in called_cmd + assert "--nnodes=1" in 
called_cmd + assert "-m" in called_cmd + assert "axolotl.cli.inference" in called_cmd + + +def test_inference_with_launcher_args_accelerate(cli_runner, config_path): + """Test inference with accelerate launcher arguments""" + with patch("subprocess.run") as mock_subprocess: + result = cli_runner.invoke( + cli, + [ + "inference", + str(config_path), + "--launcher", + "accelerate", + "--", + "--config_file=accelerate_config.yml", + "--num_processes=4", + ], + catch_exceptions=False, + ) + + assert result.exit_code == 0 + mock_subprocess.assert_called_once() + + # Verify launcher args are passed to accelerate + called_cmd = mock_subprocess.call_args.args[0] + assert called_cmd[0] == "accelerate" + assert called_cmd[1] == "launch" + assert "--config_file=accelerate_config.yml" in called_cmd + assert "--num_processes=4" in called_cmd + assert "-m" in called_cmd + assert "axolotl.cli.inference" in called_cmd + + +def test_inference_gradio_with_launcher_args(cli_runner, config_path): + """Test inference with gradio and launcher arguments""" + with patch("subprocess.run") as mock_subprocess: + result = cli_runner.invoke( + cli, + [ + "inference", + str(config_path), + "--launcher", + "accelerate", + "--gradio", + "--", + "--num_processes=2", + ], + catch_exceptions=False, + ) + + assert result.exit_code == 0 + mock_subprocess.assert_called_once() + + # Verify both gradio flag and launcher args are present + called_cmd = mock_subprocess.call_args.args[0] + assert called_cmd[0] == "accelerate" + assert called_cmd[1] == "launch" + assert "--num_processes=2" in called_cmd + assert "--gradio" in called_cmd + assert "-m" in called_cmd + assert "axolotl.cli.inference" in called_cmd + + +def test_inference_backward_compatibility_no_launcher_args(cli_runner, config_path): + """Test that existing inference commands work without launcher args""" + with patch("subprocess.run") as mock_subprocess: + result = cli_runner.invoke( + cli, + [ + "inference", + str(config_path), + "--launcher", + "accelerate", + ], + catch_exceptions=False, + ) + + assert result.exit_code == 0 + mock_subprocess.assert_called_once() + + # Verify no launcher args contamination + called_cmd = mock_subprocess.call_args.args[0] + assert called_cmd[0] == "accelerate" + assert called_cmd[1] == "launch" + # Should not contain any extra launcher args + launcher_section = called_cmd[2 : called_cmd.index("-m")] + assert len(launcher_section) == 0 # No launcher args between 'launch' and '-m' diff --git a/tests/cli/test_cli_interface.py b/tests/cli/test_cli_interface.py index 8b5fec17f..ebd91ea60 100644 --- a/tests/cli/test_cli_interface.py +++ b/tests/cli/test_cli_interface.py @@ -18,11 +18,10 @@ def test_build_command(): assert result == [ "accelerate", "launch", - "--learning-rate", - "0.0001", - "--batch-size", - "8", - "--debug", + "--learning-rate=0.0001", + "--batch-size=8", + "--debug=True", + "--use-fp16=False", ] @@ -38,7 +37,7 @@ def test_invalid_command_options(cli_runner): ], ) assert result.exit_code != 0 - assert "No such option" in result.output + assert "does not exist" in result.output def test_required_config_argument(cli_runner): diff --git a/tests/cli/test_cli_merge_sharded_fsdp_weights.py b/tests/cli/test_cli_merge_sharded_fsdp_weights.py index ec96b4ed4..4f6a973ea 100644 --- a/tests/cli/test_cli_merge_sharded_fsdp_weights.py +++ b/tests/cli/test_cli_merge_sharded_fsdp_weights.py @@ -11,9 +11,101 @@ def test_merge_sharded_fsdp_weights_no_accelerate(cli_runner, config_path): """Test merge_sharded_fsdp_weights command without 
accelerate""" with patch("axolotl.cli.merge_sharded_fsdp_weights.do_cli") as mock: result = cli_runner.invoke( - cli, ["merge-sharded-fsdp-weights", str(config_path), "--no-accelerate"] + cli, + ["merge-sharded-fsdp-weights", str(config_path), "--launcher", "python"], ) assert mock.called assert mock.call_args.kwargs["config"] == str(config_path) assert result.exit_code == 0 + + +def test_merge_sharded_fsdp_weights_with_launcher_args_torchrun( + cli_runner, config_path +): + """Test merge-sharded-fsdp-weights with torchrun launcher arguments""" + with patch("subprocess.run") as mock_subprocess: + result = cli_runner.invoke( + cli, + [ + "merge-sharded-fsdp-weights", + str(config_path), + "--launcher", + "torchrun", + "--", + "--nproc_per_node=2", + "--nnodes=1", + ], + catch_exceptions=False, + ) + + assert result.exit_code == 0 + mock_subprocess.assert_called_once() + + # Verify launcher args are passed to torchrun + called_cmd = mock_subprocess.call_args.args[0] + assert called_cmd[0] == "torchrun" + assert "--nproc_per_node=2" in called_cmd + assert "--nnodes=1" in called_cmd + assert "-m" in called_cmd + assert "axolotl.cli.merge_sharded_fsdp_weights" in called_cmd + + +def test_merge_sharded_fsdp_weights_with_launcher_args_accelerate( + cli_runner, config_path +): + """Test merge-sharded-fsdp-weights with accelerate launcher arguments""" + with patch("subprocess.run") as mock_subprocess: + result = cli_runner.invoke( + cli, + [ + "merge-sharded-fsdp-weights", + str(config_path), + "--launcher", + "accelerate", + "--", + "--config_file=accelerate_config.yml", + "--num_processes=4", + ], + catch_exceptions=False, + ) + + assert result.exit_code == 0 + mock_subprocess.assert_called_once() + + # Verify launcher args are passed to accelerate + called_cmd = mock_subprocess.call_args.args[0] + assert called_cmd[0] == "accelerate" + assert called_cmd[1] == "launch" + assert "--config_file=accelerate_config.yml" in called_cmd + assert "--num_processes=4" in called_cmd + assert "-m" in called_cmd + assert "axolotl.cli.merge_sharded_fsdp_weights" in called_cmd + + +def test_merge_sharded_fsdp_weights_backward_compatibility_no_launcher_args( + cli_runner, config_path +): + """Test that existing merge-sharded-fsdp-weights commands work without launcher args""" + with patch("subprocess.run") as mock_subprocess: + result = cli_runner.invoke( + cli, + [ + "merge-sharded-fsdp-weights", + str(config_path), + "--launcher", + "accelerate", + ], + catch_exceptions=False, + ) + + assert result.exit_code == 0 + mock_subprocess.assert_called_once() + + # Verify no launcher args contamination + called_cmd = mock_subprocess.call_args.args[0] + assert called_cmd[0] == "accelerate" + assert called_cmd[1] == "launch" + # Should not contain any extra launcher args + launcher_section = called_cmd[2 : called_cmd.index("-m")] + assert len(launcher_section) == 0 # No launcher args between 'launch' and '-m' diff --git a/tests/cli/test_cli_sweeps.py b/tests/cli/test_cli_sweeps.py index 40b360717..1b14f5aca 100644 --- a/tests/cli/test_cli_sweeps.py +++ b/tests/cli/test_cli_sweeps.py @@ -2,7 +2,7 @@ unit tests for generating sweep configurations """ -from axolotl.cli.main import generate_sweep_configs +from axolotl.cli.utils import generate_sweep_configs def test_generate_sweep_configs_no_pairs(): diff --git a/tests/cli/test_cli_train.py b/tests/cli/test_cli_train.py index 473913599..d4d90f57f 100644 --- a/tests/cli/test_cli_train.py +++ b/tests/cli/test_cli_train.py @@ -1,5 +1,7 @@ """Tests for train CLI command.""" +# 
pylint: disable=duplicate-code + from unittest.mock import MagicMock, patch from axolotl.cli.main import cli @@ -18,7 +20,9 @@ class TestTrainCommand(BaseCliTest): def test_train_basic_execution(self, cli_runner, tmp_path, valid_test_config): """Test basic successful execution""" - self._test_basic_execution(cli_runner, tmp_path, valid_test_config, "train") + self._test_basic_execution( + cli_runner, tmp_path, valid_test_config, "train", train=True + ) def test_train_basic_execution_no_accelerate( self, cli_runner, tmp_path, valid_test_config @@ -37,7 +41,8 @@ class TestTrainCommand(BaseCliTest): [ "train", str(config_path), - "--no-accelerate", + "--launcher", + "python", ], catch_exceptions=False, ) @@ -59,11 +64,10 @@ class TestTrainCommand(BaseCliTest): [ "train", str(config_path), - "--learning-rate", - "1e-4", - "--micro-batch-size", - "2", - "--no-accelerate", + "--learning-rate=1e-4", + "--micro-batch-size=2", + "--launcher", + "python", ], catch_exceptions=False, ) @@ -73,3 +77,177 @@ class TestTrainCommand(BaseCliTest): cfg = mock_train.call_args[1]["cfg"] assert cfg["learning_rate"] == 1e-4 assert cfg["micro_batch_size"] == 2 + + def test_train_with_launcher_args_torchrun( + self, cli_runner, tmp_path, valid_test_config + ): + """Test train with torchrun launcher arguments""" + config_path = tmp_path / "config.yml" + config_path.write_text(valid_test_config) + + with patch("os.execvpe") as mock_subprocess: + result = cli_runner.invoke( + cli, + [ + "train", + str(config_path), + "--launcher", + "torchrun", + "--", + "--nproc_per_node=2", + "--nnodes=1", + ], + catch_exceptions=False, + ) + + assert result.exit_code == 0 + mock_subprocess.assert_called_once() + + # Verify launcher args are passed to torchrun + called_cmd = mock_subprocess.call_args.args[1] + assert called_cmd[0] == "torchrun" + assert "--nproc_per_node=2" in called_cmd + assert "--nnodes=1" in called_cmd + assert "-m" in called_cmd + assert "axolotl.cli.train" in called_cmd + + def test_train_with_launcher_args_accelerate( + self, cli_runner, tmp_path, valid_test_config + ): + """Test train with accelerate launcher arguments""" + config_path = tmp_path / "config.yml" + config_path.write_text(valid_test_config) + + with patch("os.execvpe") as mock_subprocess: + result = cli_runner.invoke( + cli, + [ + "train", + str(config_path), + "--launcher", + "accelerate", + "--", + "--config_file=accelerate_config.yml", + "--num_processes=4", + ], + catch_exceptions=False, + ) + + assert result.exit_code == 0 + mock_subprocess.assert_called_once() + + # Verify launcher args are passed to accelerate + assert mock_subprocess.call_args.args[0] == "accelerate" + called_cmd = mock_subprocess.call_args.args[1] + assert called_cmd[0] == "accelerate" + assert called_cmd[1] == "launch" + assert "--config_file=accelerate_config.yml" in called_cmd + assert "--num_processes=4" in called_cmd + assert "-m" in called_cmd + assert "axolotl.cli.train" in called_cmd + + def test_train_backward_compatibility_no_launcher_args( + self, cli_runner, tmp_path, valid_test_config + ): + """Test that existing train commands work without launcher args""" + config_path = tmp_path / "config.yml" + config_path.write_text(valid_test_config) + + with patch("os.execvpe") as mock_subprocess: + result = cli_runner.invoke( + cli, + [ + "train", + str(config_path), + "--launcher", + "accelerate", + "--learning-rate", + "1e-4", + ], + catch_exceptions=False, + ) + + assert result.exit_code == 0 + mock_subprocess.assert_called_once() + + # Verify no launcher args 
contamination + assert mock_subprocess.call_args.args[0] == "accelerate" + called_cmd = mock_subprocess.call_args.args[1] + assert called_cmd[0] == "accelerate" + assert called_cmd[1] == "launch" + # Should not contain any extra launcher args + launcher_section = called_cmd[2 : called_cmd.index("-m")] + assert ( + len(launcher_section) == 0 + ) # No launcher args between 'launch' and '-m' + + def test_train_mixed_args_with_launcher_args( + self, cli_runner, tmp_path, valid_test_config + ): + """Test train with both regular CLI args and launcher args""" + config_path = tmp_path / "config.yml" + config_path.write_text(valid_test_config) + + with patch("os.execvpe") as mock_subprocess: + result = cli_runner.invoke( + cli, + [ + "train", + str(config_path), + "--launcher", + "torchrun", + "--learning-rate", + "2e-4", + "--micro-batch-size", + "4", + "--", + "--nproc_per_node=8", + ], + catch_exceptions=False, + ) + + assert result.exit_code == 0 + mock_subprocess.assert_called_once() + + assert mock_subprocess.call_args.args[0] == "torchrun" + called_cmd = mock_subprocess.call_args.args[1] + # Verify launcher args + assert "--nproc_per_node=8" in called_cmd + # Verify axolotl args are also present + assert "--learning-rate=2e-4" in called_cmd + assert "--micro-batch-size=4" in called_cmd + + def test_train_cloud_with_launcher_args( + self, cli_runner, tmp_path, valid_test_config + ): + """Test train with cloud and launcher arguments""" + config_path = tmp_path / "config.yml" + config_path.write_text(valid_test_config) + + cloud_path = tmp_path / "cloud.yml" + cloud_path.write_text("provider: modal\ngpu: a100") + + with patch("axolotl.cli.cloud.do_cli_train") as mock_cloud_train: + result = cli_runner.invoke( + cli, + [ + "train", + str(config_path), + "--cloud", + str(cloud_path), + "--launcher", + "torchrun", + "--", + "--nproc_per_node=4", + "--nnodes=2", + ], + catch_exceptions=False, + ) + + assert result.exit_code == 0 + mock_cloud_train.assert_called_once() + + # Verify cloud training was called with launcher args + call_kwargs = mock_cloud_train.call_args.kwargs + assert call_kwargs["launcher"] == "torchrun" + assert call_kwargs["launcher_args"] == ["--nproc_per_node=4", "--nnodes=2"] diff --git a/tests/cli/test_utils.py b/tests/cli/test_utils.py index 2dab5bba9..a3e4e9887 100644 --- a/tests/cli/test_utils.py +++ b/tests/cli/test_utils.py @@ -72,3 +72,160 @@ def test_fetch_from_github_network_error(): with patch("requests.get", side_effect=requests.RequestException): with pytest.raises(requests.RequestException): fetch_from_github("examples/", None) + + +def assert_launcher_args_in_command( + mock_subprocess_call, + launcher: str, + expected_launcher_args: list[str], + command_module: str, +): + """ + Helper function to verify launcher arguments are properly passed in subprocess calls. + + Args: + mock_subprocess_call: The mock subprocess.run call + launcher: Expected launcher ("accelerate", "torchrun", etc.) 
+ expected_launcher_args: List of expected launcher arguments + command_module: Expected module name (e.g., "axolotl.cli.train") + """ + assert mock_subprocess_call.called, "subprocess.run should have been called" + called_cmd = mock_subprocess_call.call_args.args[0] + + # Verify launcher + assert ( + called_cmd[0] == launcher + ), f"Expected launcher {launcher}, got {called_cmd[0]}" + + # Verify launcher args are present + for arg in expected_launcher_args: + assert ( + arg in called_cmd + ), f"Expected launcher arg '{arg}' not found in command: {called_cmd}" + + # Verify module is present + assert "-m" in called_cmd, "Expected -m flag for module execution" + assert ( + command_module in called_cmd + ), f"Expected module {command_module} not found in command: {called_cmd}" + + +def assert_no_launcher_args_contamination(mock_subprocess_call, launcher: str): + """ + Helper function to verify no unwanted launcher arguments are present. + + Args: + mock_subprocess_call: The mock subprocess.run call + launcher: Expected launcher ("accelerate", "torchrun", etc.) + """ + assert mock_subprocess_call.called, "subprocess.run should have been called" + called_cmd = mock_subprocess_call.call_args.args[0] + + if launcher == "accelerate": + # For accelerate, launcher args should be between 'launch' and '-m' + launch_idx = called_cmd.index("launch") + m_idx = called_cmd.index("-m") + launcher_section = called_cmd[launch_idx + 1 : m_idx] + assert ( + len(launcher_section) == 0 + ), f"Unexpected launcher args found: {launcher_section}" + elif launcher == "torchrun": + # For torchrun, launcher args should be between 'torchrun' and '-m' + torchrun_idx = called_cmd.index("torchrun") + m_idx = called_cmd.index("-m") + launcher_section = called_cmd[torchrun_idx + 1 : m_idx] + assert ( + len(launcher_section) == 0 + ), f"Unexpected launcher args found: {launcher_section}" + + +@pytest.fixture +def common_launcher_args(): + """Fixture providing common launcher argument combinations for testing.""" + return { + "torchrun": ["--nproc_per_node=2", "--nnodes=1"], + "accelerate": ["--config_file=accelerate_config.yml", "--num_processes=4"], + } + + +def test_add_default_rdzv_args_with_endpoint(): + """Test that default RDZV args are added when rdzv_endpoint is present.""" + from axolotl.cli.utils.train import _add_default_rdzv_args + + launcher_args = ["--nnodes=2", "--rdzv_endpoint=127.0.0.1:29400"] + result = _add_default_rdzv_args(launcher_args) + + # Should have added rdzv_backend + assert "--rdzv_backend" in result + assert "c10d" in result + + # Original args should still be present + assert "--nnodes=2" in result + assert "--rdzv_endpoint=127.0.0.1:29400" in result + + +def test_add_default_rdzv_args_with_existing_backend(): + """Test that existing rdzv_backend is not overridden.""" + from axolotl.cli.utils.train import _add_default_rdzv_args + + launcher_args = [ + "--nnodes=2", + "--rdzv_endpoint=127.0.0.1:29400", + "--rdzv_backend=static", + ] + result = _add_default_rdzv_args(launcher_args) + + # Should not add another rdzv_backend + backend_count = sum(1 for arg in result if "--rdzv_backend" in arg) + assert backend_count == 1 + assert "--rdzv_backend=static" in result + + +def test_add_default_rdzv_args_with_existing_id(): + """Test that existing rdzv_id is not overridden.""" + from axolotl.cli.utils.train import _add_default_rdzv_args + + launcher_args = [ + "--nnodes=2", + "--rdzv_endpoint=127.0.0.1:29400", + "--rdzv_id=my_job_123", + ] + result = _add_default_rdzv_args(launcher_args) + + # Should not 
add another rdzv_id + id_count = sum(1 for arg in result if "--rdzv_id" in arg) + assert id_count == 1 + assert "--rdzv_id=my_job_123" in result + + # Should still add rdzv_backend + assert "--rdzv_backend" in result + assert "c10d" in result + + +def test_add_default_rdzv_args_without_endpoint(): + """Test that no RDZV args are added when rdzv_endpoint is not present.""" + from axolotl.cli.utils.train import _add_default_rdzv_args + + launcher_args = ["--nnodes=2", "--nproc_per_node=4"] + result = _add_default_rdzv_args(launcher_args) + + # Should not add any rdzv args + assert "--rdzv_backend" not in result + assert result == launcher_args + + +def test_add_default_rdzv_args_with_all_existing(): + """Test that no defaults are added when all RDZV args are present.""" + from axolotl.cli.utils.train import _add_default_rdzv_args + + launcher_args = [ + "--nnodes=2", + "--rdzv_endpoint=127.0.0.1:29400", + "--rdzv_backend=static", + "--rdzv_id=existing_job", + ] + result = _add_default_rdzv_args(launcher_args) + + # Should not add any additional args + assert len(result) == len(launcher_args) + assert result == launcher_args diff --git a/tests/conftest.py b/tests/conftest.py index 8ab8fd6a4..9e1af318d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,26 +4,33 @@ shared pytest fixtures import functools import importlib +import logging import os import shutil import sys import tempfile import time from pathlib import Path +from typing import Generator import datasets import pytest import requests +import torch from huggingface_hub import snapshot_download from huggingface_hub.errors import LocalEntryNotFoundError from tokenizers import AddedToken from transformers import AutoTokenizer +from axolotl.utils.dict import DictDefault + from tests.hf_offline_utils import ( enable_hf_offline, hf_offline_context, ) +logging.getLogger("filelock").setLevel(logging.CRITICAL) + def retry_on_request_exceptions(max_retries=3, delay=1): # pylint: disable=duplicate-code @@ -411,7 +418,7 @@ def tokenizer_mistral_7b_instruct_chatml(tokenizer_mistral_7b_instruct): @pytest.fixture -def temp_dir(): +def temp_dir() -> Generator[str, None, None]: # Create a temporary directory _temp_dir = tempfile.mkdtemp() yield _temp_dir @@ -419,6 +426,11 @@ def temp_dir(): shutil.rmtree(_temp_dir) +@pytest.fixture(scope="function", autouse=True) +def torch_manual_seed(): + torch.manual_seed(42) + + @pytest.fixture(scope="function", autouse=True) def cleanup_monkeypatches(): from transformers import Trainer @@ -529,6 +541,22 @@ def dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff( return datasets.load_from_disk(ds_path)["train"] +@pytest.fixture(name="min_base_cfg") +def fixture_min_base_cfg(): + return DictDefault( + base_model="HuggingFaceTB/SmolLM2-135M", + learning_rate=1e-3, + datasets=[ + { + "path": "mhenrichsen/alpaca_2k_test", + "type": "alpaca", + }, + ], + micro_batch_size=1, + gradient_accumulation_steps=1, + ) + + # # pylint: disable=redefined-outer-name,unused-argument @pytest.mark.skipif( os.environ.get("AXOLOTL_IS_CI_CACHE_PRELOAD", "-1") != "1", diff --git a/tests/core/test_builders.py b/tests/core/test_builders.py new file mode 100644 index 000000000..fab01a644 --- /dev/null +++ b/tests/core/test_builders.py @@ -0,0 +1,604 @@ +"""Unit tests for axolotl.core.builders""" + +# pylint: disable=protected-access + +import sys +from pathlib import Path +from unittest.mock import patch + +import pytest + +from axolotl.common.datasets import load_datasets +from axolotl.core.builders import 
HFCausalTrainerBuilder, HFRLTrainerBuilder +from axolotl.loaders import ModelLoader, load_tokenizer +from axolotl.utils.config import normalize_config +from axolotl.utils.data import prepare_preference_datasets +from axolotl.utils.dict import DictDefault +from axolotl.utils.schemas.enums import RLType + +from tests.constants import ALPACA_MESSAGES_CONFIG_REVISION + + +@pytest.fixture(name="base_cfg") +def fixture_base_cfg(): + """ + Base config with all common arguments between SFT and RLHF + """ + cfg = DictDefault( + { + # Model and tokenizer settings + "base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", + "sequence_len": 2048, + "model_config_type": "llama", # example type + # Basic training settings + "micro_batch_size": 2, + "eval_batch_size": 2, + "num_epochs": 1, + "gradient_accumulation_steps": 1, + "max_steps": 100, + "val_set_size": 0, + # Optimizer settings + "optimizer": "adamw_torch_fused", + "learning_rate": 0.00005, + "weight_decay": 0.01, + "adam_beta1": 0.998, + "adam_beta2": 0.9, + "adam_epsilon": 0.00001, + "max_grad_norm": 1.0, + # LR scheduler settings + "lr_scheduler": "cosine", + "lr_scheduler_kwargs": {"foo": "bar"}, + "warmup_steps": 10, + "warmup_ratio": None, + "cosine_min_lr_ratio": 0.1, + "cosine_constant_lr_ratio": 0.2, + # Checkpointing and saving + "save_steps": 100, + "output_dir": "./model-out", + "save_safetensors": True, + "save_total_limit": 4, + "save_only_model": False, + # Hardware/performance settings + "gradient_checkpointing": False, + "gradient_checkpointing_kwargs": {"use_reentrant": False}, + "dataloader_num_workers": 1, + "dataloader_pin_memory": True, + "dataloader_prefetch_factor": 2, + "context_parallel_size": 1, + "tensor_parallel_size": 1, + # Dtype + "fp16": False, + "bf16": False, + "tf32": False, + # Logging and evaluation + "logging_steps": 10, + "eval_steps": 50, + "eval_strategy": "steps", + "save_strategy": "steps", + "include_tokens_per_second": True, + # Other common settings + "seed": 42, + "remove_unused_columns": True, + "ddp_timeout": 1800, + "ddp_bucket_cap_mb": 25, + "ddp_broadcast_buffers": False, + "dataset_processes": 4, + } + ) + + normalize_config(cfg) + return cfg + + +@pytest.fixture(name="dpo_cfg") +def fixture_dpo_cfg(base_cfg): + cfg = base_cfg.copy() + cfg.update( + { + "rl": RLType.DPO, + "dpo_use_weighting": True, + "dpo_use_logits_to_keep": True, + "dpo_label_smoothing": 0.1, + "beta": 0.1, # DPO beta + } + ) + return cfg + + +@pytest.fixture(name="orpo_cfg") +def fixture_orpo_cfg(base_cfg): + cfg = base_cfg.copy() + cfg.update( + { + "rl": RLType.ORPO, + "orpo_alpha": 0.1, + "max_prompt_len": 512, + } + ) + return cfg + + +@pytest.fixture(name="kto_cfg") +def fixture_kto_cfg(base_cfg): + cfg = base_cfg.copy() + cfg.update( + { + "rl": RLType.KTO, + "kto_desirable_weight": 1.0, + "kto_undesirable_weight": 1.0, + "max_prompt_len": 512, + } + ) + return cfg + + +@pytest.fixture(name="grpo_cfg") +def fixture_grpo_cfg(base_cfg): + cfg = base_cfg.copy() + cfg.update( + { + "rl": RLType.GRPO, + "trl": DictDefault( + { + "beta": 0.001, + "max_completion_length": 256, + "use_vllm": False, # run on CPU + # "vllm_device": "auto", + # "vllm_gpu_memory_utilization": 0.15, + "num_generations": 4, + "reward_funcs": ["rewards.rand_reward_func"], + } + ), + # Must be evenly divisible by num_generations + "micro_batch_size": 4, + } + ) + return cfg + + +@pytest.fixture(name="ipo_cfg") +def fixture_ipo_cfg(base_cfg): + cfg = base_cfg.copy() + cfg.update( + { + "rl": RLType.IPO, + "dpo_label_smoothing": 0, + "beta": 0.1, + } + 
) + return cfg + + +@pytest.fixture(name="simpo_cfg") +def fixture_simpo_cfg(base_cfg): + cfg = base_cfg.copy() + cfg.update( + { + "rl": RLType.SIMPO, + "rl_beta": 0.2, + "cpo_alpha": 0.9, + "simpo_gamma": 0.4, + } + ) + return cfg + + +@pytest.fixture(name="sft_cfg") +def fixture_sft_cfg(base_cfg): + cfg = base_cfg.copy() + cfg.update( + { + "rl": None, + "sample_packing": False, + "eval_sample_packing": False, + "flash_attention": False, + } + ) + return cfg + + +@pytest.fixture(name="rm_cfg") +def fixture_rm_cfg(sft_cfg): + cfg = sft_cfg.copy() + cfg.update( + DictDefault( + { + "reward_model": True, + "datasets": [ + { + "path": "argilla/distilabel-intel-orca-dpo-pairs", + "type": "bradley_terry.chat_template", + "split": "train[:1%]", + } + ], + } + ) + ) + return cfg + + +@pytest.fixture(name="prm_cfg") +def fixture_prm_cfg(sft_cfg): + cfg = sft_cfg.copy() + cfg.update( + DictDefault( + { + "process_reward_model": True, + "datasets": [ + { + "path": "trl-lib/math_shepherd", + "type": "stepwise_supervised", + "split": "train[:1%]", + } + ], + } + ) + ) + return cfg + + +@pytest.fixture(name="tokenizer") +def fixture_tokenizer(base_cfg): + return load_tokenizer(base_cfg) + + +@pytest.fixture(name="model") +def fixture_model(base_cfg, tokenizer): + model, _ = ModelLoader(base_cfg, tokenizer).load() + return model + + +class TestHFRLTrainerBuilder: + """ + TestCase class for RLHF trainer builders + """ + + def _test_common_training_arguments(self, training_arguments, rl: str): + """Helper to test common arguments across all variants""" + # Basic training settings + if rl == "grpo": + # grpo_cfg's micro_batch_size is diff from others + assert training_arguments.per_device_train_batch_size == 4 + else: + assert training_arguments.per_device_train_batch_size == 2 + assert training_arguments.gradient_accumulation_steps == 1 + assert training_arguments.max_steps == 100 + + # Optimizer settings + assert training_arguments.learning_rate == 0.00005 + assert training_arguments.weight_decay == 0.01 + assert training_arguments.adam_beta1 == 0.998 + assert training_arguments.adam_beta2 == 0.9 + assert training_arguments.adam_epsilon == 0.00001 + assert training_arguments.max_grad_norm == 1.0 + + # LR scheduler settings + assert training_arguments.lr_scheduler_type == "cosine" + assert training_arguments.warmup_steps == 10 + assert training_arguments.cosine_min_lr_ratio == 0.1 + assert training_arguments.cosine_constant_lr_ratio == 0.2 + + # Other settings + assert training_arguments.dataloader_num_workers == 1 + assert training_arguments.dataloader_pin_memory is True + + # TODO(wing): restore once trl releases 0.22.0 + # assert training_arguments.gradient_checkpointing is True + + def test_dpo_training_arguments(self, dpo_cfg, model, tokenizer): + builder = HFRLTrainerBuilder(dpo_cfg, model, tokenizer) + training_arguments, _ = builder._build_training_arguments(100) + + self._test_common_training_arguments(training_arguments, rl=dpo_cfg.rl) + # DPO specific + assert training_arguments.beta == 0.1 + assert hasattr(training_arguments, "use_weighting") + assert training_arguments.use_weighting is True + assert training_arguments.label_smoothing == 0.1 + + def test_orpo_training_arguments(self, orpo_cfg, model, tokenizer): + builder = HFRLTrainerBuilder(orpo_cfg, model, tokenizer) + training_arguments, _ = builder._build_training_arguments(100) + + self._test_common_training_arguments(training_arguments, rl=orpo_cfg.rl) + # ORPO specific + assert training_arguments.beta == 0.1 # maps from orpo_alpha + 
assert training_arguments.max_prompt_length == 512 + + def test_kto_training_arguments(self, kto_cfg, model, tokenizer): + builder = HFRLTrainerBuilder(kto_cfg, model, tokenizer) + training_arguments, _ = builder._build_training_arguments(100) + + self._test_common_training_arguments(training_arguments, rl=kto_cfg.rl) + # KTO specific + assert training_arguments.desirable_weight == 1.0 + assert training_arguments.undesirable_weight == 1.0 + assert training_arguments.max_prompt_length == 512 + + def _write_rewards_file(self, rewards_dir: Path): + """ + Writes reward function to local tmp path to be loaded on trainer building + """ + # Create rewards.py in a directory we can import from + rewards_dir.mkdir() + rewards_file = rewards_dir / "rewards.py" + rewards_file.write_text( + """import random +def rand_reward_func(prompts, completions) -> list[float]: + return [random.uniform(0, 1) for _ in completions] +""" + ) + + def test_grpo_training_arguments(self, grpo_cfg, model, tokenizer, tmp_path): + + rewards_dir = tmp_path / "rewards_test" + self._write_rewards_file(rewards_dir) + + # Add the directory to Python path so we can import the module + sys.path.insert(0, str(rewards_dir)) + + try: + builder = HFRLTrainerBuilder(grpo_cfg, model, tokenizer) + training_arguments, _ = builder._build_training_arguments(100) + + self._test_common_training_arguments(training_arguments, rl=grpo_cfg.rl) + # GRPO specific + assert training_arguments.beta == 0.001 + assert training_arguments.max_completion_length == 256 + assert training_arguments.use_vllm is False + # assert training_arguments.vllm_device == "auto" + # assert training_arguments.vllm_gpu_memory_utilization == 0.15 + assert training_arguments.num_generations == 4 + + # Test trainer creation to verify reward_funcs + trainer = builder.build(100) + + # Verify reward functions are properly loaded + assert len(trainer.reward_funcs) == 1 + assert trainer.reward_funcs[0].__module__ == "rewards" + assert trainer.reward_funcs[0].__name__ == "rand_reward_func" + finally: + # remove imported module from path + if str(rewards_dir) in sys.path: + sys.path.remove(str(rewards_dir)) + + def test_ipo_training_arguments(self, ipo_cfg, model, tokenizer): + builder = HFRLTrainerBuilder(ipo_cfg, model, tokenizer) + training_arguments, _ = builder._build_training_arguments(100) + + self._test_common_training_arguments(training_arguments, rl=ipo_cfg.rl) + # IPO specific + assert training_arguments.beta == 0.1 + assert training_arguments.loss_type == "ipo" + assert training_arguments.label_smoothing == 0 + + def test_simpo_training_arguments(self, simpo_cfg, model, tokenizer): + builder = HFRLTrainerBuilder(simpo_cfg, model, tokenizer) + training_arguments, _ = builder._build_training_arguments(100) + + self._test_common_training_arguments(training_arguments, rl=simpo_cfg.rl) + # SIMPO specific + assert training_arguments.beta == 0.2 + assert training_arguments.cpo_alpha == 0.9 + assert training_arguments.simpo_gamma == 0.4 + + @pytest.mark.parametrize( + ("cfg_string", "dataset_name"), + [ + ( + "dpo_cfg", + "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff", + ), + ( + "ipo_cfg", + "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff", + ), + ( + "grpo_cfg", + "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff", + ), + ("orpo_cfg", None), # don't use fixture for orpo to use smaller split + ("kto_cfg", None), # no fixture for kto + ( + "simpo_cfg", + "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff", + ), + ], + ) + 
def test_custom_optimizer_cls_and_kwargs( + self, + request, + cfg_string, + dataset_name, + tmp_path, + model, + tokenizer, + ): + cfg = request.getfixturevalue(cfg_string) + + builder = HFRLTrainerBuilder(cfg, model, tokenizer) + cfg["optimizer"] = "muon" + + if cfg_string in ["dpo_cfg", "ipo_cfg", "grpo_cfg", "simpo_cfg"]: + cfg["datasets"] = [DictDefault(ALPACA_MESSAGES_CONFIG_REVISION)] + elif cfg_string == "kto_cfg": + cfg["datasets"] = [ + DictDefault( + { + "path": "argilla/ultrafeedback-binarized-preferences-cleaned-kto", + "type": "llama3.ultra", + "split": "train[:1%]", + } + ) + ] + elif cfg_string == "orpo_cfg": + cfg["datasets"] = [ + DictDefault( + { + "path": "argilla/ultrafeedback-binarized-preferences-cleaned", + "type": "chat_template.argilla", + "split": "train[:1%]", + } + ) + ] + else: + raise ValueError(f"Unhandled cfg_string: {cfg_string}") + cfg["dataset_processes"] = 4 + + if cfg_string == "grpo_cfg": + rewards_dir = tmp_path / "rewards_test" + self._write_rewards_file(rewards_dir) + + # Add the directory to Python path so we can import the module + sys.path.insert(0, str(rewards_dir)) + + try: + # Only use mock for the commented out configs + if dataset_name is not None: + with patch( + "axolotl.utils.data.rl.load_dataset_with_config" + ) as mock_load_dataset: + mock_load_dataset.return_value = request.getfixturevalue( + dataset_name + ) + train_dataset, eval_dataset = prepare_preference_datasets( + cfg, tokenizer + ) + else: + # Load actual datasets for orpo_cfg and kto_cfg + train_dataset, eval_dataset = prepare_preference_datasets( + cfg, tokenizer + ) + + builder.train_dataset = train_dataset + builder.eval_dataset = eval_dataset + + trainer = builder.build(100) + + assert trainer.optimizer_cls_and_kwargs is not None + + from axolotl.contribs.mit.muon import ( # pylint: disable=no-name-in-module + Muon, + MuonOptimizerFactory, + ) + + optimizer_cls, optimizer_kwargs = trainer.optimizer_cls_and_kwargs + assert optimizer_cls is MuonOptimizerFactory + assert optimizer_kwargs["lr"] == 0.00005 + assert optimizer_kwargs["weight_decay"] == 0.01 + assert optimizer_kwargs["betas"] == (0.998, 0.9) + assert optimizer_kwargs["eps"] == 0.00001 + + # Ensure optimizer is created with correct class + optim = trainer.create_optimizer() + assert isinstance(optim, Muon) + + finally: + # remove imported module from path + if cfg_string == "grpo_cfg" and str(rewards_dir) in sys.path: + sys.path.remove(str(rewards_dir)) + + +class TestHFCausalTrainerBuilder: + """ + TestCase class for SFT trainer builder + """ + + def test_training_arguments(self, sft_cfg, model, tokenizer): + builder = HFCausalTrainerBuilder(sft_cfg, model, tokenizer) + trainer = builder.build(100) + training_arguments = trainer.args + + # Test common arguments + assert training_arguments.per_device_train_batch_size == 2 + assert training_arguments.gradient_accumulation_steps == 1 + assert training_arguments.max_steps == 100 + + assert training_arguments.learning_rate == 0.00005 + assert training_arguments.weight_decay == 0.01 + assert training_arguments.adam_beta1 == 0.998 + assert training_arguments.adam_beta2 == 0.9 + assert training_arguments.adam_epsilon == 0.00001 + assert training_arguments.max_grad_norm == 1.0 + + assert training_arguments.lr_scheduler_type == "cosine" + assert training_arguments.warmup_steps == 10 + assert training_arguments.cosine_min_lr_ratio == 0.1 + + assert training_arguments.dataloader_num_workers == 1 + assert training_arguments.dataloader_pin_memory is True + assert 
training_arguments.gradient_checkpointing is False + + # SFT specific + assert training_arguments.sample_packing is False + assert training_arguments.eval_sample_packing is False + + @pytest.mark.parametrize( + "cfg_string", + [ + "sft_cfg", + "rm_cfg", + "prm_cfg", + ], + ) + def test_custom_optimizer_cls_and_kwargs( + self, request, cfg_string, model, tokenizer + ): + cfg = request.getfixturevalue(cfg_string) + builder = HFCausalTrainerBuilder(cfg, model, tokenizer) + cfg["optimizer"] = "muon" + + # need to load datasets for reward model and process reward model trainer + if cfg_string in ["rm_cfg", "prm_cfg"]: + dataset_meta = load_datasets(cfg=cfg) + + builder.train_dataset = dataset_meta.train_dataset + builder.eval_dataset = dataset_meta.eval_dataset + + trainer = builder.build(100) + + assert trainer.optimizer_cls_and_kwargs is not None + + from axolotl.contribs.mit.muon import ( # pylint: disable=no-name-in-module + Muon, + MuonOptimizerFactory, + ) + + optimizer_cls, optimizer_kwargs = trainer.optimizer_cls_and_kwargs + assert optimizer_cls is MuonOptimizerFactory + assert optimizer_kwargs["lr"] == 0.00005 + assert optimizer_kwargs["weight_decay"] == 0.01 + assert optimizer_kwargs["betas"] == (0.998, 0.9) + assert optimizer_kwargs["eps"] == 0.00001 + + # Ensure optimizer is created with correct class + optim = trainer.create_optimizer() + assert isinstance(optim, Muon) + + +class TestTrainerClsPlugin: + """ + TestCase class for trainer builder with plugin + """ + + def test_trainer_cls_is_not_none_with_plugin(self, kto_cfg, model, tokenizer): + """ + Test that the trainer cls is not none with plugin + + Fixes #2693 + """ + cfg = kto_cfg.copy() + cfg.plugins = ["axolotl.integrations.liger.LigerPlugin"] + + # Expected AttributeError as we don't pass regular model configs to RL trainer builder + # If it throws `TypeError: None is not a callable object`, trainer_cls could be None + try: + builder = HFRLTrainerBuilder(cfg, model, tokenizer) + + builder.build(100) + except TypeError as e: + # Error raised if trainer_cls is None + assert "'tuple' object has no attribute 'config'" not in str(e) + except Exception: # pylint: disable=broad-exception-caught + # Another error happens, so we passed trainer_cls to builder + pass diff --git a/tests/core/test_trainer_builder.py b/tests/core/test_trainer_builder.py deleted file mode 100644 index 492578c40..000000000 --- a/tests/core/test_trainer_builder.py +++ /dev/null @@ -1,90 +0,0 @@ -"""Unit tests for axolotl.core.trainer_builder""" - -import pytest - -from axolotl.core.trainer_builder import HFRLTrainerBuilder -from axolotl.loaders import ModelLoader, load_tokenizer -from axolotl.utils.config import normalize_config -from axolotl.utils.dict import DictDefault -from axolotl.utils.schemas.enums import RLType - - -@pytest.fixture(name="cfg") -def fixture_cfg(): - cfg = DictDefault( - { - "base_model": "HuggingFaceTB/SmolLM2-135M", - "micro_batch_size": 1, - "gradient_accumulation_steps": 1, - "learning_rate": 0.00005, - "save_steps": 100, - "output_dir": "./model-out", - "warmup_steps": 10, - "gradient_checkpointing": False, - "optimizer": "adamw_torch_fused", - "sequence_len": 2048, - "rl": True, - "adam_beta1": 0.998, - "adam_beta2": 0.9, - "adam_epsilon": 0.00001, - "dataloader_num_workers": 1, - "dataloader_pin_memory": True, - "model_config_type": "llama", - "special_tokens": { - "pad_token": "<|endoftext|>", - }, - } - ) - - normalize_config(cfg) - - return cfg - - -@pytest.fixture(name="tokenizer") -def fixture_tokenizer(cfg): - return 
load_tokenizer(cfg) - - -@pytest.fixture(name="model") -def fixture_model(cfg, tokenizer): - return ModelLoader(cfg, tokenizer).load() - - -class TestHFRLTrainerBuilder: - """ - TestCase class for DPO trainer builder - """ - - def test_build_training_arguments(self, cfg, model, tokenizer): - builder = HFRLTrainerBuilder(cfg, model, tokenizer) - training_arguments = builder.build_training_arguments(100) - assert training_arguments.adam_beta1 == 0.998 - assert training_arguments.adam_beta2 == 0.9 - assert training_arguments.adam_epsilon == 0.00001 - assert training_arguments.dataloader_num_workers == 1 - assert training_arguments.dataloader_pin_memory is True - - -class TestTrainerClsPlugin: - """ - TestCase class for trainer builder with plugin - """ - - def test_trainer_cls_is_not_none_with_plugin(self, cfg, model, tokenizer): - """ - Test that the trainer cls is not none with plugin - - Fixes #2693 - """ - cfg.plugins = ["axolotl.integrations.liger.LigerPlugin"] - cfg.rl = RLType.KTO - - # Expected AttributeError as we don't pass regular model configs to RL trainer builder - # If it throws `TypeError: None is not a callable object`, trainer_cls could be None - with pytest.raises( - AttributeError, match=r".*'tuple' object has no attribute 'config'.*" - ): - builder = HFRLTrainerBuilder(cfg, model, tokenizer) - - builder.build(100) diff --git a/tests/e2e/integrations/test_cut_cross_entropy.py b/tests/e2e/integrations/test_cut_cross_entropy.py index 2ae59a15a..34e6c9644 100644 --- a/tests/e2e/integrations/test_cut_cross_entropy.py +++ b/tests/e2e/integrations/test_cut_cross_entropy.py @@ -4,7 +4,6 @@ Simple end-to-end test for Cut Cross Entropy integration import pytest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils import get_pytorch_version @@ -45,6 +44,7 @@ def min_cfg(temp_dir): "save_safetensors": True, "max_steps": 10, "bf16": "auto", + "save_first_step": False, } @@ -59,8 +59,7 @@ class TestCutCrossEntropyIntegration: cfg = validate_config(cfg) prepare_plugins(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) major, minor, _ = get_pytorch_version() if (major, minor) < (2, 4): @@ -100,13 +99,13 @@ class TestCutCrossEntropyIntegration: "save_safetensors": True, "max_steps": 10, "bf16": "auto", + "save_first_step": False, } ) cfg = validate_config(cfg) prepare_plugins(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) major, minor, _ = get_pytorch_version() if (major, minor) < (2, 4): @@ -134,8 +133,7 @@ class TestCutCrossEntropyIntegration: cfg = validate_config(cfg) prepare_plugins(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) major, minor, _ = get_pytorch_version() if (major, minor) < (2, 4): diff --git a/tests/e2e/integrations/test_fp8.py b/tests/e2e/integrations/test_fp8.py new file mode 100644 index 000000000..0302b7e35 --- /dev/null +++ b/tests/e2e/integrations/test_fp8.py @@ -0,0 +1,62 @@ +""" +Simple end-to-end smoke tests for FP8 mixed precision training +""" + +from axolotl.common.datasets import load_datasets +from axolotl.train import train +from axolotl.utils.config import normalize_config, validate_config +from axolotl.utils.dict import DictDefault + +from 
tests.e2e.utils import check_model_output_exists, require_torch_2_7_0 + + +class FP8IntegrationTestCase: + """ + e2e smoke tests for FP8 mixed precision training with Axolotl + """ + + @require_torch_2_7_0 + def test_fp8_single_gpu_smoke(self, temp_dir): + """Smoke test for single GPU FP8 + torch.compile training""" + # pylint: disable=duplicate-code + cfg = DictDefault( + { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "tokenizer_type": "AutoTokenizer", + "trust_remote_code": True, + "sequence_len": 512, + "val_set_size": 0.05, + "special_tokens": { + "pad_token": "<|endoftext|>", + }, + "datasets": [ + { + "path": "mhenrichsen/alpaca_2k_test", + "type": "alpaca", + }, + ], + "num_epochs": 1, + "max_steps": 3, # Very short smoke test + "micro_batch_size": 1, + "gradient_accumulation_steps": 2, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "sdp_attention": True, + "pad_to_seq_len": True, + "sample_packing": True, + "fp8": True, + "torch_compile": True, + "save_safetensors": True, + "save_first_step": False, + } + ) + + # pylint: disable=duplicate-code + cfg = validate_config(cfg) + normalize_config(cfg) + dataset_meta = load_datasets(cfg=cfg) + + train(cfg=cfg, dataset_meta=dataset_meta) + check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/integrations/test_hooks.py b/tests/e2e/integrations/test_hooks.py index 45d7200fb..8743efb98 100644 --- a/tests/e2e/integrations/test_hooks.py +++ b/tests/e2e/integrations/test_hooks.py @@ -5,7 +5,6 @@ e2e tests to make sure all the hooks are fired on the plugin import os from pathlib import Path -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.integrations.base import BasePlugin from axolotl.train import train @@ -154,14 +153,14 @@ class TestPluginHooks: "max_steps": 5, "flash_attention": True, "bf16": "auto", + "save_first_step": False, } ) cfg = validate_config(cfg) prepare_plugins(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/integrations/test_kd.py b/tests/e2e/integrations/test_kd.py index f36eef953..1ac3b537e 100644 --- a/tests/e2e/integrations/test_kd.py +++ b/tests/e2e/integrations/test_kd.py @@ -5,11 +5,9 @@ e2e tests for kd trainer support in Axolotl from pathlib import Path import pytest +import yaml +from accelerate.test_utils import execute_subprocess_async, get_torch_dist_unique_port -from axolotl.cli.args import TrainerCliArgs -from axolotl.common.datasets import load_datasets -from axolotl.train import train -from axolotl.utils.config import normalize_config, prepare_plugins, validate_config from axolotl.utils.dict import DictDefault from tests.e2e.utils import check_tensorboard, require_torch_2_5_1 @@ -18,8 +16,8 @@ from tests.e2e.utils import check_tensorboard, require_torch_2_5_1 @pytest.fixture(name="kd_min_cfg") def min_cfg(temp_dir): return { - "base_model": "osllmai-community/Llama-3.2-1B", - "tokenizer_config": "axolotl-ai-co/Llama-3.3-70B-Instruct-tokenizer", + "base_model": "Qwen/Qwen3-0.6B", + "tokenizer_config": "winglian/qwen3-14b-math", "plugins": [ "axolotl.integrations.kd.KDPlugin", "axolotl.integrations.liger.LigerPlugin", @@ -32,20 +30,22 @@ def min_cfg(temp_dir): "kd_ce_alpha": 0.1, "kd_alpha": 0.9, "kd_temperature": 1.0, + "kd_beta": 0.0, + "kd_normalize_topk": 
True, "dataloader_prefetch_factor": 8, "dataloader_num_workers": 4, "dataloader_pin_memory": True, "datasets": [ { - "path": "axolotl-ai-co/evolkit-logprobs-pipeline-75k-v2-sample", - "type": "axolotl.integrations.kd.chat_template", - "field_messages": "messages_combined", + "path": "winglian/OpenThoughts-114k-math-correct-qwen3-14b-math-prepared-topk128-normalized", + "type": "chat_template", "split": "train", - "logprobs_field": "llm_text_generation_vllm_logprobs", - "temperature": 1.0, - "preprocess_shards": 2, + "split_thinking": True, + "eot_tokens": ["<|im_end|>"], + "data_files": ["train/batch-000000.parquet"], }, ], + "skip_prepare_dataset": True, "val_set_size": 0.0, "sequence_len": 2048, "sample_packing": True, @@ -67,6 +67,7 @@ def min_cfg(temp_dir): "output_dir": temp_dir, "save_safetensors": True, "use_tensorboard": True, + "save_first_step": False, } @@ -81,18 +82,29 @@ class TestKnowledgeDistillation: def test_llama_kd(self, temp_dir, kd_min_cfg): cfg = DictDefault(kd_min_cfg) # pylint: disable=duplicate-code - cfg = validate_config(cfg) - prepare_plugins(cfg) - normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + # write cfg to yaml file + Path(temp_dir).mkdir(parents=True, exist_ok=True) + with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) - train(cfg=cfg, dataset_meta=dataset_meta) - assert (Path(temp_dir) / "model.safetensors").exists() - check_tensorboard( - temp_dir + "/runs", "train/loss", 1.2, "Train Loss (%s) is too high" + execute_subprocess_async( + [ + "axolotl", + "train", + str(Path(temp_dir) / "config.yaml"), + "--num-processes", + "1", + "--main-process-port", + f"{get_torch_dist_unique_port()}", + ] ) + assert (Path(temp_dir) / "model.safetensors").exists() + check_tensorboard( + temp_dir + "/runs", "train/loss", 1.4, "Train Loss (%s) is too high" + ) + + @pytest.mark.skip(reason="Chunked KD loss doesn't support PEFT/LoRA") @pytest.mark.parametrize( "load_in_8bit", [True, False], @@ -112,13 +124,22 @@ class TestKnowledgeDistillation: | kd_min_cfg ) # pylint: disable=duplicate-code - cfg = validate_config(cfg) - prepare_plugins(cfg) - normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + # write cfg to yaml file + Path(temp_dir).mkdir(parents=True, exist_ok=True) + with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) - train(cfg=cfg, dataset_meta=dataset_meta) + execute_subprocess_async( + [ + "axolotl", + "train", + str(Path(temp_dir) / "config.yaml"), + "--num-processes", + "1", + "--main-process-port", + f"{get_torch_dist_unique_port()}", + ] + ) assert (Path(temp_dir) / "adapter_model.safetensors").exists() check_tensorboard( temp_dir + "/runs", "train/loss", 1.2, "Train Loss (%s) is too high" diff --git a/tests/e2e/integrations/test_liger.py b/tests/e2e/integrations/test_liger.py index 8ecfc4746..b1f5befdd 100644 --- a/tests/e2e/integrations/test_liger.py +++ b/tests/e2e/integrations/test_liger.py @@ -2,7 +2,6 @@ Simple end-to-end test for Liger integration """ -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, prepare_plugins, validate_config @@ -51,14 +50,14 @@ class LigerIntegrationTestCase: "save_safetensors": True, "bf16": "auto", "max_steps": 5, + 
"save_first_step": False, } ) # pylint: disable=duplicate-code cfg = validate_config(cfg) prepare_plugins(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @@ -98,14 +97,14 @@ class LigerIntegrationTestCase: "save_safetensors": True, "bf16": "auto", "max_steps": 5, + "save_first_step": False, } ) # pylint: disable=duplicate-code cfg = validate_config(cfg) prepare_plugins(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/integrations/test_llm_compressor.py b/tests/e2e/integrations/test_llm_compressor.py index 20bf821bf..dceecea9f 100644 --- a/tests/e2e/integrations/test_llm_compressor.py +++ b/tests/e2e/integrations/test_llm_compressor.py @@ -6,7 +6,6 @@ from pathlib import Path import pytest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, prepare_plugins, validate_config @@ -82,14 +81,14 @@ class TestLLMCompressorIntegration: }, "save_compressed": save_compressed, }, + "save_first_step": False, } ) prepare_plugins(cfg) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) try: train(cfg=cfg, dataset_meta=dataset_meta) diff --git a/tests/e2e/kernels/test_geglu.py b/tests/e2e/kernels/test_geglu.py index 005a1935d..4094a8ce7 100644 --- a/tests/e2e/kernels/test_geglu.py +++ b/tests/e2e/kernels/test_geglu.py @@ -1,7 +1,6 @@ """Tests for GEGLU activation function Triton kernels.""" -# pylint: disable=duplicate-code - +import pytest import torch import torch.nn.functional as F @@ -20,8 +19,15 @@ def test_geglu_forward_shape(): assert out.device == gate.device -def test_geglu_forward_values(): +@pytest.mark.flaky(retries=1, delay=5) +@pytest.mark.parametrize( + "torch_seed", + [0, 42], +) +def test_geglu_forward_values(torch_seed): """Test GEGLU forward pass matches PyTorch reference implementation.""" + torch.manual_seed(torch_seed) + gate = torch.randn(2, 3, 64, device="cuda") up = torch.randn(2, 3, 64, device="cuda") @@ -34,8 +40,15 @@ def test_geglu_forward_values(): assert torch.allclose(triton_out, torch_out, rtol=1e-3) -def test_geglu_backward(): +@pytest.mark.flaky(retries=1, delay=5) +@pytest.mark.parametrize( + "torch_seed", + [0, 42], +) +def test_geglu_backward(torch_seed): """Test GEGLU backward pass matches PyTorch autograd.""" + torch.manual_seed(torch_seed) + gate = torch.randn(2, 3, 64, device="cuda", requires_grad=True) up = torch.randn(2, 3, 64, device="cuda", requires_grad=True) grad_output = torch.randn(2, 3, 64, device="cuda") diff --git a/tests/e2e/kernels/test_lora.py b/tests/e2e/kernels/test_lora.py index 5ad186cbf..cd6131ff1 100644 --- a/tests/e2e/kernels/test_lora.py +++ b/tests/e2e/kernels/test_lora.py @@ -64,6 +64,7 @@ def sample_tensors(): batch_size, seq_len, hidden_dim, device="cuda", dtype=torch.float16 ), "W": torch.randn(out_dim, hidden_dim, device="cuda", dtype=torch.float16), + "b": torch.randn(out_dim, device="cuda", dtype=torch.float16), "scale": 0.5, "shapes": { "batch": batch_size, @@ -103,23 +104,24 @@ def 
mock_proj(): def test_get_lora_parameters(mock_proj): """Tests get_lora_parameters function""" # Test with LoRA enabled - W, _, A, B, s = get_lora_parameters(mock_proj) + W, b, _, A, B, s = get_lora_parameters(mock_proj) assert isinstance(W, torch.Tensor) assert W.shape == (128, 64) + assert b.shape == (128,) assert A.shape == (8, 64) assert B.shape == (128, 8) assert s == 0.5 # Test with LoRA disabled mock_proj.disable_adapters = True - W, _, A, B, s = get_lora_parameters(mock_proj) + W, b, _, A, B, s = get_lora_parameters(mock_proj) assert A is None and B is None and s is None # Test with merged state mock_proj.disable_adapters = False mock_proj.merged = True - W, _, A, B, s = get_lora_parameters(mock_proj) + W, b, _, A, B, s = get_lora_parameters(mock_proj) assert A is None and B is None and s is None @@ -127,6 +129,7 @@ def test_matmul_lora(sample_tensors): """Tests matmul_lora function""" X = sample_tensors["X"] W = sample_tensors["W"] + b = sample_tensors["b"] scale = sample_tensors["scale"] shapes = sample_tensors["shapes"] @@ -138,19 +141,20 @@ def test_matmul_lora(sample_tensors): B = torch.randn(out_dim, rank, device="cuda", dtype=torch.float16) # Test base matmul - out1 = matmul_lora(X, W, None, None, None, None) - expected1 = torch.matmul(X, W.t()) + out1 = matmul_lora(X, W, b, None, None, None, None) + matmul = torch.matmul(X, W.t()) + expected1 = matmul + b assert torch.allclose(out1, expected1, rtol=1e-3) # Test with LoRA - out2 = matmul_lora(X, W, None, A, B, scale) + out2 = matmul_lora(X, W, b, None, A, B, scale) lora_term = scale * torch.matmul(torch.matmul(X, A.t()), B.t()) - expected2 = expected1 + lora_term + expected2 = matmul + lora_term + b assert torch.allclose(out2, expected2, rtol=1e-3) # Test 3D input reshaping X_3d = X.clone() - out3 = matmul_lora(X_3d, W, None, A, B, scale) + out3 = matmul_lora(X_3d, W, b, None, A, B, scale) assert out3.shape == (X.shape[0], X.shape[1], W.shape[0]) @@ -175,16 +179,19 @@ def test_lora_mlp_direct(sample_tensors, activation_forward, activation_backward output = LoRA_MLP.apply( X, gate_proj.weight, + gate_proj.bias, None, # gate_quant None, # gate_A None, # gate_B None, # gate_scale up_proj.weight, + up_proj.bias, None, # up_quant None, # up_A None, # up_B None, # up_scale down_proj.weight, + down_proj.bias, None, # down_quant None, # down_A None, # down_B @@ -243,16 +250,19 @@ def test_lora_mlp_with_adapters( output = LoRA_MLP.apply( X, gate_proj.weight, + gate_proj.bias, None, gate_A, gate_B, scale, up_proj.weight, + up_proj.bias, None, up_A, up_B, scale, down_proj.weight, + down_proj.bias, None, down_A, down_B, @@ -323,6 +333,7 @@ def test_lora_qkv(sample_tensors): X.requires_grad = True # Test without LoRA adapters + # pylint: disable=duplicate-code Q1, K1, V1 = LoRA_QKV.apply( X, q_weight, @@ -330,16 +341,19 @@ def test_lora_qkv(sample_tensors): None, None, None, + None, k_weight, None, None, None, None, + None, v_weight, None, None, None, None, + None, True, ) @@ -356,16 +370,19 @@ def test_lora_qkv(sample_tensors): X, q_weight, None, + None, q_A, q_B, scale, k_weight, None, + None, k_A, k_B, scale, v_weight, None, + None, v_A, v_B, scale, @@ -399,6 +416,7 @@ def test_lora_o(sample_tensors): """Tests LoRA output projection""" X = sample_tensors["X"] W = sample_tensors["W"] + b = sample_tensors["b"] scale = sample_tensors["scale"] shapes = sample_tensors["shapes"] @@ -411,7 +429,7 @@ def test_lora_o(sample_tensors): # Test forward pass X.requires_grad = True - output = LoRA_O.apply(X, W, None, A, B, scale) + output = 
LoRA_O.apply(X, W, b, None, A, B, scale) assert output.shape == (X.shape[0], X.shape[1], W.shape[0]) @@ -425,6 +443,7 @@ def test_with_quantization(sample_tensors, mock_quantstate): """Tests LoRA with quantized weights""" X = sample_tensors["X"] # [batch, seq, hidden] W = sample_tensors["W"] # [out, hidden] + b = sample_tensors["b"] # [out] scale = 0.5 shapes = sample_tensors["shapes"] @@ -436,13 +455,13 @@ def test_with_quantization(sample_tensors, mock_quantstate): B = torch.randn(out_dim, rank, device="cuda", dtype=torch.float16) # Test matmul with quantization - out = matmul_lora(X, W, mock_quantstate, A, B, scale) + out = matmul_lora(X, W, b, mock_quantstate, A, B, scale) assert out.shape == (X.shape[0], X.shape[1], W.shape[0]) assert not torch.isnan(out).any() # Test with different batch sizes X2 = torch.randn(4, 6, hidden_dim, device="cuda", dtype=torch.float16) - out2 = matmul_lora(X2, W, mock_quantstate, A, B, scale) + out2 = matmul_lora(X2, W, b, mock_quantstate, A, B, scale) assert out2.shape == (4, 6, W.shape[0]) assert not torch.isnan(out2).any() @@ -459,11 +478,12 @@ def test_shapes_and_dimensions(batch, seq, hidden, rank, out): """Tests various input shapes and dimensions""" X = torch.randn(batch, seq, hidden, device="cuda", dtype=torch.float16) W = torch.randn(out, hidden, device="cuda", dtype=torch.float16) + b = torch.randn(out, device="cuda", dtype=torch.float16) A = torch.randn(rank, hidden, device="cuda", dtype=torch.float16) B = torch.randn(out, rank, device="cuda", dtype=torch.float16) scale = 0.5 - result = matmul_lora(X, W, None, A, B, scale) + result = matmul_lora(X, W, b, None, A, B, scale) assert result.shape == (batch, seq, out) @@ -471,6 +491,7 @@ def test_gradient_flow(sample_tensors): """Tests gradient flow through LoRA layers""" X = sample_tensors["X"].clone() W = sample_tensors["W"].clone() + b = sample_tensors["b"].clone() scale = sample_tensors["scale"] shapes = sample_tensors["shapes"] @@ -486,7 +507,7 @@ def test_gradient_flow(sample_tensors): B.requires_grad = True # Forward pass - out = matmul_lora(X, W, None, A, B, scale) + out = matmul_lora(X, W, b, None, A, B, scale) loss = out.sum() # Backward pass diff --git a/tests/e2e/multigpu/patched/test_sp.py b/tests/e2e/multigpu/patched/test_sp.py index 1170f5eee..a005e6742 100644 --- a/tests/e2e/multigpu/patched/test_sp.py +++ b/tests/e2e/multigpu/patched/test_sp.py @@ -1,6 +1,5 @@ """E2E tests for sequence parallelism""" -import os from pathlib import Path import pytest @@ -12,8 +11,6 @@ from axolotl.utils.dict import DictDefault from ...utils import check_tensorboard -os.environ["WANDB_DISABLED"] = "true" - class TestSequenceParallelism: """Test case for training with sequence parallelism enabled""" @@ -57,6 +54,7 @@ class TestSequenceParallelism: "micro_batch_size": micro_batch_size, "gradient_accumulation_steps": 2, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", @@ -69,8 +67,9 @@ class TestSequenceParallelism: "logging_steps": 1, "weight_decay": 0.0, "use_tensorboard": True, - "sequence_parallel_degree": 2, + "context_parallel_size": 2, "ring_attn_func": ring_attn_func, + "save_first_step": False, } ) @@ -94,7 +93,10 @@ class TestSequenceParallelism: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", threshold, "Train Loss is too high" + temp_dir + "/runs", + "train/train_loss", + threshold, + "Train Loss (%s) is too high", ) @pytest.mark.parametrize( @@ -103,13 +105,13 @@ 
class TestSequenceParallelism: (True, 1, True, None, 2.5), # defaults to varlen_llama3 ring_attn_func (False, 2, True, None, 2.5), # defaults to batch_ring ring_attn_func # (False, 2, True, "batch_zigzag", 2.5), - (False, 2, False, None, 2.5), # defaults to batch_ring ring_attn_func + # (False, 2, False, None, 2.65), # defaults to batch_ring ring_attn_func ], ids=[ "sample_packing, varlen_llama3 ring_attn_func", "no sample_packing, pad_to_sequence_len, batch_ring ring_attn_func", # "no sample_packing, no pad_to_sequence_len, batch_zigzag ring_attn_func", - "no sample_packing, no pad_to_sequence_len, batch_ring ring_attn_func", + # "no sample_packing, no pad_to_sequence_len, batch_ring ring_attn_func", ], ) def test_sequence_parallel_training( diff --git a/tests/e2e/multigpu/solo/test_flex.py b/tests/e2e/multigpu/solo/test_flex.py index 471b112c1..cbdf8de96 100644 --- a/tests/e2e/multigpu/solo/test_flex.py +++ b/tests/e2e/multigpu/solo/test_flex.py @@ -2,8 +2,6 @@ E2E tests for multigpu lora tinyllama """ -import logging -import os from pathlib import Path import pytest @@ -17,9 +15,6 @@ from axolotl.utils.dict import DictDefault from tests.e2e.utils import check_tensorboard, require_torch_2_6_0 -LOG = logging.getLogger("axolotl.tests.e2e.multigpu") -os.environ["WANDB_DISABLED"] = "true" - AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent @@ -59,12 +54,14 @@ class TestPackedFlex: "gradient_accumulation_steps": 2, "gradient_checkpointing": True, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "max_steps": 2, "use_tensorboard": True, "save_strategy": "no", + "save_first_step": False, } ) if is_torch_bf16_gpu_available(): @@ -90,5 +87,5 @@ class TestPackedFlex: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/multigpu/solo/test_grpo.py b/tests/e2e/multigpu/solo/test_grpo.py index 575b7a620..92e0f7040 100644 --- a/tests/e2e/multigpu/solo/test_grpo.py +++ b/tests/e2e/multigpu/solo/test_grpo.py @@ -4,7 +4,6 @@ GRPO test suite import os import random -import shutil import subprocess # nosec B404 import sys import tempfile @@ -106,7 +105,7 @@ def start_vllm( print(f"{i}: VLLM server failed to start: {str(exc)}") # also check if the process.pid is still running - if not process.poll() is None: + if process.poll() is not None: break time.sleep(period_seconds) @@ -118,7 +117,10 @@ def start_vllm( recursive_kill(process) with open("/tmp/vllm.log", "r", encoding="utf-8") as log_file: print(log_file.read()) - shutil.rmtree("/tmp/vllm.log") + try: + os.remove("/tmp/vllm.log") + except FileNotFoundError: + pass raise RuntimeError(f"VLLM server process did not start within {wait} seconds.") # return the process @@ -139,6 +141,7 @@ def recursive_kill(process: subprocess.Popen): os.kill(process.pid, 9) +@pytest.mark.skip(reason="flaky vllm tests in modal") class TestGRPO: """ Test case for GRPO training using multilpe GPUs @@ -220,6 +223,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs): "save_safetensors": True, "bf16": "auto", "use_tensorboard": True, + "save_first_step": False, } ) @@ -260,6 +264,101 @@ def oai_gsm8k_transform(cfg, *args, **kwargs): **current_env, }, ) + finally: + (recursive_kill(vllm_process)) + + @require_vllm + def test_llama_lora_sp(self, temp_dir): + rnd_reward_suffix = str(random.randint(1000, 9999)) + cfg = 
DictDefault( + { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "chat_template": "llama3", + "rl": "grpo", + "trl": { + "beta": 0.001, + "max_completion_length": 256, + "use_vllm": True, + "num_generations": 4, + "reward_funcs": [f"rewards_{rnd_reward_suffix}.rand_reward_func"], + }, + "vllm": { + "max_model_len": 800, + "enable_prefix_caching": True, + }, + "datasets": [ + { + "path": "openai/gsm8k", + "name": "main", + "type": f"rewards_{rnd_reward_suffix}.oai_gsm8k_transform", + }, + ], + "adapter": "lora", + "lora_r": 8, + "lora_alpha": 16, + "lora_dropout": 0.05, + "lora_target_linear": True, + "context_parallel_size": 2, + "flash_attention": True, + "sequence_len": 1024, + "special_tokens": { + "pad_token": "<|endoftext|>", + }, + "max_steps": 3, + "num_epochs": 1, + "micro_batch_size": 4, + "gradient_accumulation_steps": 2, + "warmup_steps": 10, + "val_set_size": 0.0, + "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", + "learning_rate": 0.0001, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "save_safetensors": True, + "bf16": "auto", + "use_tensorboard": True, + "save_first_step": False, + } + ) + + self._utils_write_yaml_and_rewards(cfg, temp_dir, suffix=rnd_reward_suffix) + + current_env = os.environ.copy() + env = { + "NCCL_P2P_LEVEL": "LOC", + **current_env, + "CUDA_VISIBLE_DEVICES": "1", + } + vllm_process = start_vllm( + cfg.base_model, + env=env, + quiet=True, + wait=300, + gpu_memory_utilization=0.15, + max_model_len=cfg.vllm.max_model_len, + enable_prefix_caching=cfg.vllm.enable_prefix_caching, + host="0.0.0.0", + port=8000, + ) + + try: + execute_subprocess_async( + [ + "axolotl", + "train", + str(Path(temp_dir) / "config.yaml"), + "--num-processes", + str(2), + "--main-process-port", + f"{get_torch_dist_unique_port()}", + ], + env={ + "NCCL_P2P_LEVEL": "LOC", + "NCCL_DEBUG": "INFO", + **current_env, + }, + ) finally: recursive_kill(vllm_process) @@ -305,12 +404,14 @@ def oai_gsm8k_transform(cfg, *args, **kwargs): "warmup_steps": 10, "val_set_size": 0.0, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.0001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "save_safetensors": True, "bf16": "auto", "use_tensorboard": True, + "save_first_step": False, } ) diff --git a/tests/e2e/multigpu/test_eval.py b/tests/e2e/multigpu/test_eval.py index 4989b81df..4f86278ff 100644 --- a/tests/e2e/multigpu/test_eval.py +++ b/tests/e2e/multigpu/test_eval.py @@ -2,8 +2,6 @@ E2E tests for multigpu eval """ -import logging -import os from pathlib import Path import yaml @@ -14,9 +12,6 @@ from axolotl.utils.dict import DictDefault from ..utils import check_tensorboard -LOG = logging.getLogger("axolotl.tests.e2e.multigpu") -os.environ["WANDB_DISABLED"] = "true" - AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent @@ -43,12 +38,13 @@ class TestMultiGPUEval: "lora_dropout": 0.05, "lora_target_linear": True, "lora_modules_to_save": ["embed_tokens", "lm_head"], - "val_set_size": 0.004, + "val_set_size": 0.05, "special_tokens": {"pad_token": "<|endoftext|>"}, "datasets": [ { "path": "teknium/GPT4-LLM-Cleaned", "type": "alpaca", + "split": "train[:5%]", }, ], "num_epochs": 1, @@ -56,6 +52,7 @@ class TestMultiGPUEval: "micro_batch_size": 2, "gradient_accumulation_steps": 2, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", @@ -70,6 +67,7 @@ class TestMultiGPUEval: 
"logging_steps": 1, "weight_decay": 0.0, "use_tensorboard": True, + "save_first_step": False, } ) @@ -112,12 +110,13 @@ class TestMultiGPUEval: "lora_dropout": 0.05, "lora_target_linear": True, "lora_modules_to_save": ["embed_tokens", "lm_head"], - "val_set_size": 0.0004, + "val_set_size": 0.01, "special_tokens": {"pad_token": "<|endoftext|>"}, "datasets": [ { "path": "teknium/GPT4-LLM-Cleaned", "type": "alpaca", + "split": "train[:5%]", }, ], "num_epochs": 1, @@ -125,6 +124,7 @@ class TestMultiGPUEval: "micro_batch_size": 2, "gradient_accumulation_steps": 2, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", @@ -139,6 +139,7 @@ class TestMultiGPUEval: "logging_steps": 1, "weight_decay": 0.0, "use_tensorboard": True, + "save_first_step": False, } ) diff --git a/tests/e2e/multigpu/test_fp8_fsdp2.py b/tests/e2e/multigpu/test_fp8_fsdp2.py new file mode 100644 index 000000000..f7fa29a31 --- /dev/null +++ b/tests/e2e/multigpu/test_fp8_fsdp2.py @@ -0,0 +1,121 @@ +"""Test module for FP8 mixed precision with FSDP2 multi-GPU functionality.""" + +# pylint: disable=duplicate-code + +import os +from pathlib import Path + +import torch +import yaml +from accelerate.test_utils import execute_subprocess_async +from tbparse import SummaryReader +from transformers.testing_utils import get_torch_dist_unique_port + +from axolotl.utils.dict import DictDefault + +from tests.e2e.utils import most_recent_subdir, require_hopper, require_torch_2_7_0 + +AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent + + +def verify_fp8_training_success(temp_dir): + """Verify that FP8 training completed successfully by checking artifacts and loss.""" + output_path = Path(temp_dir) + + model_files = list(output_path.glob("*.bin")) + list( + output_path.glob("*.safetensors") + ) + assert len(model_files) > 0, "No model files found - training may have failed" + + checkpoint_files = list(output_path.glob("checkpoint-*")) + assert ( + len(checkpoint_files) > 0 + ), "No checkpoint files found - training may have failed" + + tb_log_path = most_recent_subdir(temp_dir + "/runs") + if tb_log_path: + event_files = sorted(os.listdir(tb_log_path)) + if event_files: + event_file = os.path.join(tb_log_path, event_files[0]) + reader = SummaryReader(event_file) + df = reader.scalars + train_loss_df = df[df.tag == "train/train_loss"] + if len(train_loss_df) > 0: + final_loss = train_loss_df.value.values[-1] + assert not torch.isnan( + torch.tensor(final_loss) + ), f"Training loss is NaN: {final_loss}" + + +class TestFP8FSDP2: + """Test class for FP8 mixed precision with FSDP2 functionality.""" + + @require_torch_2_7_0 + @require_hopper + def test_fp8_fsdp2_smoke(self, temp_dir): + """Smoke test for 2-GPU FP8 + torch.compile + FSDP2 training""" + cfg = DictDefault( + { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "tokenizer_type": "AutoTokenizer", + "trust_remote_code": True, + "sequence_len": 512, + "val_set_size": 0.05, + "special_tokens": { + "pad_token": "<|endoftext|>", + }, + "datasets": [ + { + "path": "mhenrichsen/alpaca_2k_test", + "type": "alpaca", + }, + ], + "num_epochs": 1, + "max_steps": 3, # Very short smoke test + "micro_batch_size": 1, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch_fused", # Use standard optimizer for stability + "lr_scheduler": "cosine", + "sdp_attention": True, + "pad_to_seq_len": True, + "sample_packing": True, + # FP8 
configuration + "fp8": True, + "fp8_enable_fsdp_float8_all_gather": True, + "torch_compile": True, + # FSDP2 configuration + "fsdp_version": 2, + "fsdp_config": { + "offload_params": False, + "cpu_ram_efficient_loading": False, + "transformer_layer_cls_to_wrap": "LlamaDecoderLayer", + "state_dict_type": "FULL_STATE_DICT", + "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", + "reshard_after_forward": True, + }, + "use_tensorboard": True, + "save_safetensors": True, + "save_first_step": False, + } + ) + + # write cfg to yaml file + Path(temp_dir).mkdir(parents=True, exist_ok=True) + with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) + + execute_subprocess_async( + [ + "axolotl", + "train", + str(Path(temp_dir) / "config.yaml"), + "--num-processes", + "2", + "--main-process-port", + f"{get_torch_dist_unique_port()}", + ] + ) + + verify_fp8_training_success(temp_dir) diff --git a/tests/e2e/multigpu/test_fsdp1.py b/tests/e2e/multigpu/test_fsdp1.py new file mode 100644 index 000000000..fe0badbe2 --- /dev/null +++ b/tests/e2e/multigpu/test_fsdp1.py @@ -0,0 +1,326 @@ +"""Test module for FSDP1 multi-GPU functionality.""" + +# pylint: disable=duplicate-code + +import os +from pathlib import Path + +import pytest +import torch +import yaml +from accelerate.test_utils import execute_subprocess_async +from tbparse import SummaryReader +from transformers.testing_utils import get_torch_dist_unique_port + +from axolotl.utils.dict import DictDefault + +from tests.e2e.utils import most_recent_subdir + +AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent + + +def verify_training_success(temp_dir): + """Verify that training completed successfully by checking artifacts and loss.""" + output_path = Path(temp_dir) + + model_files = list(output_path.glob("*.bin")) + list( + output_path.glob("*.safetensors") + ) + assert len(model_files) > 0, "No model files found - training may have failed" + + checkpoint_files = list(output_path.glob("checkpoint-*")) + assert ( + len(checkpoint_files) > 0 + ), "No checkpoint files found - training may have failed" + + tb_log_path = most_recent_subdir(temp_dir + "/runs") + if tb_log_path: + event_files = sorted(os.listdir(tb_log_path)) + if event_files: + event_file = os.path.join(tb_log_path, event_files[0]) + reader = SummaryReader(event_file) + df = reader.scalars + train_loss_df = df[df.tag == "train/train_loss"] + if len(train_loss_df) > 0: + final_loss = train_loss_df.value.values[-1] + assert not torch.isnan( + torch.tensor(final_loss) + ), f"Training loss is NaN: {final_loss}" + + +class TestFSDP1: + """Test class for FSDP1 functionality.""" + + @pytest.mark.parametrize( + "fsdp_cpu_ram_efficient_loading", + [True, False], + ) + def test_fft_sft(self, temp_dir, fsdp_cpu_ram_efficient_loading): + cfg = DictDefault( + { + "base_model": "Qwen/Qwen2.5-0.5B", + "sequence_len": 2048, + "val_set_size": 0.01, + "datasets": [ + { + "path": "tatsu-lab/alpaca", + "type": "alpaca", + "split": "train[:10%]", + }, + ], + "num_epochs": 1, + "max_steps": 2, + "micro_batch_size": 2, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "flash_attention": True, + "fsdp_version": "1", + "fsdp_config": { + "fsdp_offload_params": False, + "fsdp_cpu_ram_efficient_loading": fsdp_cpu_ram_efficient_loading, + "fsdp_transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", + "fsdp_state_dict_type": "FULL_STATE_DICT", + 
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", + "fsdp_sharding_strategy": "FULL_SHARD", + "fsdp_sync_module_states": True, + "fsdp_use_orig_params": False, + }, + "use_tensorboard": True, + "bf16": True, + } + ) + + # write cfg to yaml file + Path(temp_dir).mkdir(parents=True, exist_ok=True) + with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) + + execute_subprocess_async( + [ + "axolotl", + "train", + str(Path(temp_dir) / "config.yaml"), + "--num-processes", + "2", + "--main-process-port", + f"{get_torch_dist_unique_port()}", + ] + ) + + verify_training_success(temp_dir) + + @pytest.mark.parametrize( + "adapter_config", + [ + { + "adapter": "lora", + "load_in_4bit": False, + }, + { + "adapter": "qlora", + "load_in_4bit": True, + }, + ], + ) + def test_lora_sft(self, temp_dir, adapter_config): + cfg = DictDefault( + { + "base_model": "Qwen/Qwen2.5-0.5B", + "sequence_len": 2048, + "val_set_size": 0.01, + "datasets": [ + { + "path": "tatsu-lab/alpaca", + "type": "alpaca", + "split": "train[:10%]", + }, + ], + "adapter": adapter_config["adapter"], + "load_in_4bit": adapter_config["load_in_4bit"], + "lora_r": 8, + "lora_alpha": 16, + "lora_dropout": 0.05, + "lora_target_linear": True, + "num_epochs": 1, + "max_steps": 2, + "micro_batch_size": 2, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "flash_attention": True, + "fsdp_version": "1", + "fsdp_config": { + "fsdp_offload_params": False, + "fsdp_cpu_ram_efficient_loading": True, + "fsdp_transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", + "fsdp_state_dict_type": "FULL_STATE_DICT", + "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", + "fsdp_sharding_strategy": "FULL_SHARD", + "fsdp_sync_module_states": True, + "fsdp_use_orig_params": False, + }, + "use_tensorboard": True, + "bf16": True, + } + ) + + # write cfg to yaml file + Path(temp_dir).mkdir(parents=True, exist_ok=True) + with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) + + execute_subprocess_async( + [ + "axolotl", + "train", + str(Path(temp_dir) / "config.yaml"), + "--num-processes", + "2", + "--main-process-port", + f"{get_torch_dist_unique_port()}", + ] + ) + + verify_training_success(temp_dir) + + def test_dpo_fft(self, temp_dir): + cfg = DictDefault( + { + "base_model": "Qwen/Qwen2.5-0.5B", + "sequence_len": 2048, + "val_set_size": 0.01, + "rl": "dpo", + "chat_template": "chatml", + "datasets": [ + { + "path": "Intel/orca_dpo_pairs", + "split": "train", + "type": "chatml.intel", + }, + ], + "num_epochs": 1, + "max_steps": 2, + "micro_batch_size": 2, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "flash_attention": True, + "fsdp_version": "1", + "fsdp_config": { + "fsdp_offload_params": False, + "fsdp_cpu_ram_efficient_loading": True, + "fsdp_transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", + "fsdp_state_dict_type": "FULL_STATE_DICT", + "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", + "fsdp_sharding_strategy": "FULL_SHARD", + "fsdp_sync_module_states": True, + "fsdp_use_orig_params": False, + }, + "use_tensorboard": True, + } + ) + + # write cfg to yaml file + Path(temp_dir).mkdir(parents=True, exist_ok=True) + with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: + 
fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) + + execute_subprocess_async( + [ + "axolotl", + "train", + str(Path(temp_dir) / "config.yaml"), + "--num-processes", + "2", + "--main-process-port", + f"{get_torch_dist_unique_port()}", + ] + ) + + verify_training_success(temp_dir) + + @pytest.mark.parametrize( + "adapter_config", + [ + { + "adapter": "lora", + "load_in_4bit": False, + }, + { + "adapter": "qlora", + "load_in_4bit": True, + }, + ], + ) + def test_dpo_lora(self, temp_dir, adapter_config): + cfg = DictDefault( + { + "base_model": "Qwen/Qwen2.5-0.5B", + "load_in_4bit": adapter_config["load_in_4bit"], + "rl": "dpo", + "chat_template": "chatml", + "sequence_len": 2048, + "adapter": adapter_config["adapter"], + "lora_r": 8, + "lora_alpha": 16, + "lora_dropout": 0.05, + "lora_target_linear": True, + "val_set_size": 0.01, + "datasets": [ + { + "path": "Intel/orca_dpo_pairs", + "split": "train", + "type": "chatml.intel", + }, + ], + "num_epochs": 1, + "max_steps": 2, + "micro_batch_size": 2, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "flash_attention": True, + "fsdp_version": "1", + "fsdp_config": { + "fsdp_offload_params": False, + "fsdp_cpu_ram_efficient_loading": True, + "fsdp_transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", + "fsdp_state_dict_type": "FULL_STATE_DICT", + "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", + "fsdp_sharding_strategy": "FULL_SHARD", + "fsdp_sync_module_states": True, + "fsdp_use_orig_params": False, + }, + "use_tensorboard": True, + "bf16": "auto", + "tf32": True, + } + ) + + # write cfg to yaml file + Path(temp_dir).mkdir(parents=True, exist_ok=True) + with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) + + execute_subprocess_async( + [ + "axolotl", + "train", + str(Path(temp_dir) / "config.yaml"), + "--num-processes", + "2", + "--main-process-port", + f"{get_torch_dist_unique_port()}", + ] + ) + + verify_training_success(temp_dir) diff --git a/tests/e2e/multigpu/test_fsdp2.py b/tests/e2e/multigpu/test_fsdp2.py new file mode 100644 index 000000000..0bb255266 --- /dev/null +++ b/tests/e2e/multigpu/test_fsdp2.py @@ -0,0 +1,482 @@ +"""Test module for FSDP2 multi-GPU functionality.""" + +# pylint: disable=duplicate-code + +import os +from pathlib import Path + +import pytest +import torch +import yaml +from accelerate.test_utils import execute_subprocess_async +from tbparse import SummaryReader +from transformers.testing_utils import get_torch_dist_unique_port + +from axolotl.utils.dict import DictDefault + +from tests.e2e.utils import most_recent_subdir, require_torch_2_7_0 + +AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent + + +def verify_training_success(temp_dir): + """Verify that training completed successfully by checking artifacts and loss.""" + output_path = Path(temp_dir) + + model_files = list(output_path.glob("*.bin")) + list( + output_path.glob("*.safetensors") + ) + assert len(model_files) > 0, "No model files found - training may have failed" + + checkpoint_files = list(output_path.glob("checkpoint-*")) + assert ( + len(checkpoint_files) > 0 + ), "No checkpoint files found - training may have failed" + + tb_log_path = most_recent_subdir(temp_dir + "/runs") + if tb_log_path: + event_files = sorted(os.listdir(tb_log_path)) + if event_files: + event_file = os.path.join(tb_log_path, event_files[0]) + reader = 
SummaryReader(event_file) + df = reader.scalars + train_loss_df = df[df.tag == "train/train_loss"] + if len(train_loss_df) > 0: + final_loss = train_loss_df.value.values[-1] + assert not torch.isnan( + torch.tensor(final_loss) + ), f"Training loss is NaN: {final_loss}" + + +class TestFSDP2: + """Test class for FSDP2 functionality.""" + + @require_torch_2_7_0 + @pytest.mark.parametrize( + "fsdp_cpu_ram_efficient_loading", + [True, False], + ) + def test_fft_sft(self, temp_dir, fsdp_cpu_ram_efficient_loading): + cfg = DictDefault( + { + "base_model": "Qwen/Qwen2.5-0.5B", + "sequence_len": 2048, + "val_set_size": 0.01, + "datasets": [ + { + "path": "tatsu-lab/alpaca", + "type": "alpaca", + "split": "train[:10%]", + }, + ], + "num_epochs": 1, + "max_steps": 2, + "micro_batch_size": 2, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "flash_attention": True, + "fsdp_version": 2, + "fsdp_config": { + "offload_params": False, + "cpu_ram_efficient_loading": fsdp_cpu_ram_efficient_loading, + "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", + "state_dict_type": "FULL_STATE_DICT", + "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", + "reshard_after_forward": True, + }, + "use_tensorboard": True, + "bf16": True, + } + ) + + # write cfg to yaml file + Path(temp_dir).mkdir(parents=True, exist_ok=True) + with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) + + execute_subprocess_async( + [ + "axolotl", + "train", + str(Path(temp_dir) / "config.yaml"), + "--num-processes", + "2", + "--main-process-port", + f"{get_torch_dist_unique_port()}", + ] + ) + + verify_training_success(temp_dir) + + @require_torch_2_7_0 + @pytest.mark.parametrize("peft_use_dora", [True, False]) + def test_lora_sft(self, temp_dir, peft_use_dora): + cfg = DictDefault( + { + "base_model": "Qwen/Qwen2.5-0.5B", + "sequence_len": 2048, + "val_set_size": 0.01, + "datasets": [ + { + "path": "tatsu-lab/alpaca", + "type": "alpaca", + "split": "train[:10%]", + }, + ], + "peft_use_dora": peft_use_dora, + "adapter": "lora", + "lora_r": 8, + "lora_alpha": 16, + "lora_dropout": 0.05, + "lora_target_linear": True, + "num_epochs": 1, + "max_steps": 2, + "micro_batch_size": 2, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "flash_attention": True, + "fsdp_version": 2, + "fsdp_config": { + "offload_params": False, + "cpu_ram_efficient_loading": False, + "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", + "state_dict_type": "FULL_STATE_DICT", + "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", + "reshard_after_forward": True, + }, + "use_tensorboard": True, + "bf16": True, + } + ) + + # write cfg to yaml file + Path(temp_dir).mkdir(parents=True, exist_ok=True) + with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) + + execute_subprocess_async( + [ + "axolotl", + "train", + str(Path(temp_dir) / "config.yaml"), + "--num-processes", + "2", + "--main-process-port", + f"{get_torch_dist_unique_port()}", + ] + ) + + verify_training_success(temp_dir) + + @require_torch_2_7_0 + def test_lora_sft_kernels(self, temp_dir): + cfg = DictDefault( + { + "base_model": "Qwen/Qwen2.5-0.5B", + "sequence_len": 2048, + "val_set_size": 0.01, + "datasets": [ + { + "path": "tatsu-lab/alpaca", + "type": 
"alpaca", + "split": "train[:10%]", + }, + ], + "adapter": "lora", + "lora_r": 8, + "lora_alpha": 16, + "lora_target_linear": True, + "num_epochs": 1, + "max_steps": 2, + "micro_batch_size": 2, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "flash_attention": True, + "fsdp_version": 2, + "fsdp_config": { + "offload_params": False, + "cpu_ram_efficient_loading": False, + "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", + "state_dict_type": "FULL_STATE_DICT", + "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", + "reshard_after_forward": True, + }, + "use_tensorboard": True, + "bf16": True, + "lora_mlp_kernel": True, + "lora_qkv_kernel": True, + "lora_o_kernel": True, + } + ) + + # write cfg to yaml file + Path(temp_dir).mkdir(parents=True, exist_ok=True) + with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) + + execute_subprocess_async( + [ + "axolotl", + "train", + str(Path(temp_dir) / "config.yaml"), + "--num-processes", + "2", + "--main-process-port", + f"{get_torch_dist_unique_port()}", + ] + ) + + verify_training_success(temp_dir) + + @require_torch_2_7_0 + def test_qlora_sft(self, temp_dir): + cfg = DictDefault( + { + "base_model": "Qwen/Qwen2.5-0.5B", + "sequence_len": 2048, + "val_set_size": 0.01, + "datasets": [ + { + "path": "tatsu-lab/alpaca", + "type": "alpaca", + "split": "train[:10%]", + }, + ], + "load_in_4bit": True, + "adapter": "qlora", + "lora_r": 8, + "lora_alpha": 16, + "lora_dropout": 0.05, + "lora_target_linear": True, + "num_epochs": 1, + "max_steps": 2, + "micro_batch_size": 2, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "flash_attention": True, + "fsdp_version": 2, + "fsdp_config": { + "offload_params": False, + "cpu_ram_efficient_loading": False, + "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", + "state_dict_type": "FULL_STATE_DICT", + "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", + "reshard_after_forward": True, + }, + "use_tensorboard": True, + "bf16": True, + } + ) + + # write cfg to yaml file + Path(temp_dir).mkdir(parents=True, exist_ok=True) + with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) + + execute_subprocess_async( + [ + "axolotl", + "train", + str(Path(temp_dir) / "config.yaml"), + "--num-processes", + "2", + "--main-process-port", + f"{get_torch_dist_unique_port()}", + ] + ) + + verify_training_success(temp_dir) + + @require_torch_2_7_0 + def test_qlora_sft_kernels(self, temp_dir): + cfg = DictDefault( + { + "base_model": "Qwen/Qwen2.5-0.5B", + "sequence_len": 2048, + "val_set_size": 0.01, + "datasets": [ + { + "path": "tatsu-lab/alpaca", + "type": "alpaca", + "split": "train[:10%]", + }, + ], + "load_in_4bit": True, + "adapter": "qlora", + "lora_r": 8, + "lora_alpha": 16, + "lora_target_linear": True, + "num_epochs": 1, + "max_steps": 2, + "micro_batch_size": 2, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "flash_attention": True, + "fsdp_version": 2, + "fsdp_config": { + "offload_params": False, + "cpu_ram_efficient_loading": False, + "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", + "state_dict_type": "FULL_STATE_DICT", + "auto_wrap_policy": 
"TRANSFORMER_BASED_WRAP", + "reshard_after_forward": True, + }, + "use_tensorboard": True, + "bf16": True, + "lora_mlp_kernel": True, + "lora_qkv_kernel": True, + "lora_o_kernel": True, + } + ) + + # write cfg to yaml file + Path(temp_dir).mkdir(parents=True, exist_ok=True) + with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) + + execute_subprocess_async( + [ + "axolotl", + "train", + str(Path(temp_dir) / "config.yaml"), + "--num-processes", + "2", + "--main-process-port", + f"{get_torch_dist_unique_port()}", + ] + ) + + verify_training_success(temp_dir) + + @require_torch_2_7_0 + def test_dpo_fft(self, temp_dir): + cfg = DictDefault( + { + "base_model": "Qwen/Qwen2.5-0.5B", + "sequence_len": 2048, + "val_set_size": 0.01, + "rl": "dpo", + "chat_template": "chatml", + "datasets": [ + { + "path": "Intel/orca_dpo_pairs", + "split": "train", + "type": "chatml.intel", + }, + ], + "num_epochs": 1, + "max_steps": 2, + "micro_batch_size": 2, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "flash_attention": True, + "fsdp_version": 2, + "fsdp_config": { + "offload_params": False, + "cpu_ram_efficient_loading": False, + "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", + "state_dict_type": "FULL_STATE_DICT", + "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", + "reshard_after_forward": True, + }, + "use_tensorboard": True, + } + ) + + # write cfg to yaml file + Path(temp_dir).mkdir(parents=True, exist_ok=True) + with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) + + execute_subprocess_async( + [ + "axolotl", + "train", + str(Path(temp_dir) / "config.yaml"), + "--num-processes", + "2", + "--main-process-port", + f"{get_torch_dist_unique_port()}", + ] + ) + + verify_training_success(temp_dir) + + @require_torch_2_7_0 + def test_dpo_lora(self, temp_dir): + cfg = DictDefault( + { + "base_model": "Qwen/Qwen2.5-0.5B", + "sequence_len": 2048, + "rl": "dpo", + "chat_template": "chatml", + "datasets": [ + { + "path": "Intel/orca_dpo_pairs", + "split": "train", + "type": "chatml.intel", + }, + ], + "adapter": "lora", + "lora_r": 8, + "lora_alpha": 16, + "lora_dropout": 0.05, + "lora_target_linear": True, + "num_epochs": 1, + "max_steps": 2, + "micro_batch_size": 2, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "flash_attention": True, + "fsdp_version": 2, + "fsdp_config": { + "offload_params": False, + "cpu_ram_efficient_loading": False, + "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", + "state_dict_type": "FULL_STATE_DICT", + "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", + "reshard_after_forward": True, + }, + "use_tensorboard": True, + } + ) + + # write cfg to yaml file + Path(temp_dir).mkdir(parents=True, exist_ok=True) + with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) + + execute_subprocess_async( + [ + "axolotl", + "train", + str(Path(temp_dir) / "config.yaml"), + "--num-processes", + "2", + "--main-process-port", + f"{get_torch_dist_unique_port()}", + ] + ) + + verify_training_success(temp_dir) diff --git a/tests/e2e/multigpu/test_gemma3.py b/tests/e2e/multigpu/test_gemma3.py index 9de3ed82f..4a7b101a8 100644 --- 
a/tests/e2e/multigpu/test_gemma3.py +++ b/tests/e2e/multigpu/test_gemma3.py @@ -2,8 +2,6 @@ E2E tests for multigpu lora tinyllama """ -import logging -import os from pathlib import Path import pytest @@ -16,9 +14,6 @@ from axolotl.utils.dict import DictDefault from tests.e2e.utils import check_tensorboard -LOG = logging.getLogger("axolotl.tests.e2e.multigpu") -os.environ["WANDB_DISABLED"] = "true" - AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent @@ -69,12 +64,14 @@ class TestMultiGPUGemma3: }, "gradient_accumulation_steps": 2, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.0001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", "flash_attention": True, "use_tensorboard": True, "bf16": True, + "save_first_step": False, } ) @@ -96,5 +93,5 @@ class TestMultiGPUGemma3: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py index 38e6e741a..aab14dcc4 100644 --- a/tests/e2e/multigpu/test_llama.py +++ b/tests/e2e/multigpu/test_llama.py @@ -2,8 +2,6 @@ E2E tests for multigpu lora tinyllama """ -import logging -import os from pathlib import Path import pytest @@ -18,9 +16,6 @@ from axolotl.utils.dict import DictDefault from tests.e2e.utils import check_tensorboard, require_torch_2_6_0 -LOG = logging.getLogger("axolotl.tests.e2e.multigpu") -os.environ["WANDB_DISABLED"] = "true" - AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent @@ -67,12 +62,14 @@ class TestMultiGPULlama: "gradient_accumulation_steps": 2, # "gradient_checkpointing": True, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", "flash_attention": True, "use_tensorboard": True, "bf16": True, + "save_first_step": False, } ) @@ -94,7 +91,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.8, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( @@ -132,12 +129,14 @@ class TestMultiGPULlama: "gradient_accumulation_steps": gradient_accumulation_steps, # "gradient_checkpointing": True, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", "flash_attention": True, "use_tensorboard": True, "bf16": True, + "save_first_step": False, } ) @@ -159,7 +158,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) def test_dpo_lora_ddp(self, temp_dir): @@ -205,6 +204,7 @@ class TestMultiGPULlama: "gradient_accumulation_steps": 2, # "gradient_checkpointing": True, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "warmup_steps": 0, "learning_rate": 0.00001, "optimizer": "adamw_8bit", @@ -212,6 +212,7 @@ class TestMultiGPULlama: "flash_attention": True, "use_tensorboard": True, "bf16": True, + "save_first_step": False, } ) @@ -237,7 +238,7 @@ class TestMultiGPULlama: temp_dir + "/runs", "train/train_loss", loss_threshold, - "Train Loss is too high", + "Train Loss (%s) is too high", ) def test_dpo_qlora_ddp(self, temp_dir): @@ -283,6 +284,7 @@ class TestMultiGPULlama: 
"gradient_accumulation_steps": 2, # "gradient_checkpointing": True, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "warmup_steps": 0, "learning_rate": 0.00001, "optimizer": "adamw_8bit", @@ -290,6 +292,7 @@ class TestMultiGPULlama: "flash_attention": True, "use_tensorboard": True, "bf16": True, + "save_first_step": False, } ) @@ -315,7 +318,7 @@ class TestMultiGPULlama: temp_dir + "/runs", "train/train_loss", loss_threshold, - "Train Loss is too high", + "Train Loss (%s) is too high", ) @pytest.mark.parametrize( @@ -345,6 +348,7 @@ class TestMultiGPULlama: "gradient_accumulation_steps": gradient_accumulation_steps, # "gradient_checkpointing": True, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", @@ -364,6 +368,8 @@ class TestMultiGPULlama: "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", }, "use_tensorboard": True, + "seed": 42, + "save_first_step": False, } ) @@ -385,12 +391,15 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( "fsdp_state_dict_type", - ["FULL_STATE_DICT", "SHARDED_STATE_DICT"], + [ + "FULL_STATE_DICT", + # "SHARDED_STATE_DICT", # not supported since intermediate checkpoints fail with fsdp1 + ], ) def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type): # pylint: disable=duplicate-code @@ -412,11 +421,13 @@ class TestMultiGPULlama: }, ], "num_epochs": 1, - "max_steps": 2, + "max_steps": 3, + "save_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 2, # "gradient_checkpointing": True, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", @@ -436,6 +447,7 @@ class TestMultiGPULlama: "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", }, "use_tensorboard": True, + "save_first_step": False, } ) @@ -457,7 +469,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @require_torch_2_6_0 @@ -496,6 +508,7 @@ class TestMultiGPULlama: "gradient_accumulation_steps": 2, "gradient_checkpointing": True, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_8bit", "lr_scheduler": "cosine", @@ -513,6 +526,7 @@ class TestMultiGPULlama: "fsdp_reshard_after_forward": fsdp_reshard_after_forward, }, "use_tensorboard": True, + "save_first_step": False, } ) if attention_backend == "flash": @@ -538,7 +552,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high" ) def test_fsdp_qlora_prequant_packed(self, temp_dir): @@ -578,6 +592,7 @@ class TestMultiGPULlama: "gradient_accumulation_steps": 2, # "gradient_checkpointing": True, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", @@ -593,10 +608,11 @@ class TestMultiGPULlama: "fsdp_use_orig_params": False, "fsdp_cpu_ram_efficient_loading": True, "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer", - 
"fsdp_state_dict_type": "SHARDED_STATE_DICT", + "fsdp_state_dict_type": "FULL_STATE_DICT", "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", }, "use_tensorboard": True, + "save_first_step": False, } ) @@ -618,7 +634,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( @@ -674,12 +690,14 @@ class TestMultiGPULlama: "micro_batch_size": 1, "gradient_accumulation_steps": gradient_accumulation_steps, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "deepspeed": str(AXOLOTL_ROOT / deepspeed), "use_tensorboard": True, + "save_first_step": False, **adapter, } ) @@ -702,7 +720,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.45, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( @@ -748,12 +766,15 @@ class TestMultiGPULlama: "micro_batch_size": 1, "gradient_accumulation_steps": gradient_accumulation_steps, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"), "use_tensorboard": True, + "seed": 42, + "save_first_step": False, **adapter, } ) @@ -776,7 +797,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( @@ -822,12 +843,14 @@ class TestMultiGPULlama: "micro_batch_size": 1, "gradient_accumulation_steps": gradient_accumulation_steps, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "flash_attention": True, "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"), "use_tensorboard": True, + "save_first_step": False, **adapter, } ) @@ -850,7 +873,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high" ) @pytest.mark.skip( @@ -896,6 +919,7 @@ class TestMultiGPULlama: "save_safetensors": True, # "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"), "use_tensorboard": True, + "save_first_step": False, } ) @@ -917,5 +941,5 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/multigpu/test_locking.py b/tests/e2e/multigpu/test_locking.py new file mode 100644 index 000000000..42502dfa3 --- /dev/null +++ b/tests/e2e/multigpu/test_locking.py @@ -0,0 +1,192 @@ +"""Tests for FileLockLoader class.""" + +import tempfile +import threading +import time +from pathlib import Path +from unittest.mock import MagicMock, Mock, patch + +import pytest + +from axolotl.utils.data.lock import FileLockLoader +from axolotl.utils.dict import DictDefault + + +class TestFileLockLoader: + """Class with tests for FileLockLoader.""" + + @pytest.fixture + def temp_dir(self): + 
"""Create a temporary directory for testing.""" + with tempfile.TemporaryDirectory() as tmp_dir: + yield Path(tmp_dir) + + @pytest.fixture + def cfg(self, temp_dir): + """Create a test configuration.""" + return DictDefault({"dataset_prepared_path": str(temp_dir)}) + + @pytest.fixture + def loader(self, cfg): + """Create a FileLockLoader instance for testing.""" + return FileLockLoader(cfg) + + def test_load_first_process(self, loader): + """Test load() when no ready flag exists (first process).""" + mock_load_fn = Mock(return_value="test_data") + + result = loader.load(mock_load_fn) + + # Should call the load function + mock_load_fn.assert_called_once() + assert result == "test_data" + + # Should create the ready flag + assert loader.ready_flag_path.exists() + + def test_load_subsequent_process(self, loader): + """Test load() when ready flag already exists (subsequent process).""" + # Create ready flag first + loader.ready_flag_path.touch() + + mock_load_fn = Mock(return_value="loaded_data") + + result = loader.load(mock_load_fn) + + # Should still call load function (to load the prepared data) + mock_load_fn.assert_called_once() + assert result == "loaded_data" + + def test_load_concurrent_processes(self, cfg): + """Test that concurrent processes coordinate correctly.""" + results = [] + call_count = 0 + + def slow_load_fn(): + nonlocal call_count + call_count += 1 + time.sleep(0.1) # Simulate slow loading + return f"data_{call_count}" + + def worker(): + loader = FileLockLoader(cfg) + result = loader.load(slow_load_fn) + results.append(result) + + # Start multiple threads simultaneously + threads = [threading.Thread(target=worker) for _ in range(3)] + for t in threads: + t.start() + for t in threads: + t.join() + + # Only one thread should have done the initial loading + # All should return data, but the load function should be called + # once by the first process and once by each subsequent process + assert len(results) == 3 + assert all(result.startswith("data_") for result in results) + + @patch("time.sleep") + def test_load_waiting_for_ready_flag(self, mock_sleep, loader): + """Test that processes wait for the ready flag to appear.""" + mock_load_fn = Mock(return_value="waiting_data") + mock_ready_flag_path = Mock() + exists_call_count = 0 + + def mock_exists(): + nonlocal exists_call_count + exists_call_count += 1 + + if exists_call_count == 1: + # First check: ready flag exists (not first process) + return True + if exists_call_count <= 3: + # While loop checks: flag doesn't exist yet + return False + return True + + mock_ready_flag_path.exists.side_effect = mock_exists + + # Replace the ready_flag_path with our mock + original_path = loader.ready_flag_path + loader.ready_flag_path = mock_ready_flag_path + + try: + result = loader.load(mock_load_fn) + finally: + # Restore original path + loader.ready_flag_path = original_path + + # Should have slept twice while waiting + assert mock_sleep.call_count == 2 + mock_sleep.assert_called_with(1) + + # Should eventually call load function + mock_load_fn.assert_called_once() + assert result == "waiting_data" + + def test_complete_workflow_with_cleanup(self, loader): + """Test the complete load -> cleanup workflow.""" + mock_load_fn = Mock(return_value="test_data") + + # First process calls load (this should set up counter) + result = loader.load(mock_load_fn) + assert result == "test_data" + assert loader.ready_flag_path.exists() + assert loader.counter_path.exists() + + # Cleanup should remove everything since there's only one process + 
loader.cleanup() + assert not loader.ready_flag_path.exists() + assert not loader.counter_path.exists() + + def test_multiple_processes_workflow(self, loader): + """Test workflow with multiple processes.""" + # Simulate multiple processes by manually setting up counter + loader.ready_flag_path.touch() + loader.counter_path.write_text("3") # 3 processes + + # First process cleanup + loader.cleanup() + assert loader.ready_flag_path.exists() + assert loader.counter_path.read_text().strip() == "2" + + # Second process cleanup + loader.cleanup() + assert loader.ready_flag_path.exists() + assert loader.counter_path.read_text().strip() == "1" + + # Last process cleanup + loader.cleanup() + assert not loader.ready_flag_path.exists() + assert not loader.counter_path.exists() + + def test_load_exception_handling(self, loader): + """Test behavior when load_fn raises an exception.""" + + def failing_load_fn(): + raise ValueError("Load failed") + + with pytest.raises(ValueError, match="Load failed"): + loader.load(failing_load_fn) + + # Ready flag should not be created on failure + assert not loader.ready_flag_path.exists() + + def test_file_lock_called(self, loader): + """Test that FileLock is properly used.""" + mock_load_fn = Mock(return_value="locked_data") + + with patch("axolotl.utils.data.lock.FileLock") as mock_filelock: + mock_context = MagicMock() + mock_filelock.return_value.__enter__ = Mock(return_value=mock_context) + mock_filelock.return_value.__exit__ = Mock(return_value=None) + + loader.load(mock_load_fn) + + # Verify FileLock was called with correct path + mock_filelock.assert_called_once_with(str(loader.lock_file_path)) + + # Verify context manager was used + mock_filelock.return_value.__enter__.assert_called_once() + mock_filelock.return_value.__exit__.assert_called_once() diff --git a/tests/e2e/multigpu/test_qwen2.py b/tests/e2e/multigpu/test_qwen2.py deleted file mode 100644 index 9599c3abf..000000000 --- a/tests/e2e/multigpu/test_qwen2.py +++ /dev/null @@ -1,97 +0,0 @@ -""" -E2E tests for multigpu qwen2 -""" - -import logging -import os -from pathlib import Path - -import pytest -import yaml -from accelerate.test_utils import execute_subprocess_async -from transformers.testing_utils import get_torch_dist_unique_port - -from axolotl.utils.dict import DictDefault - -LOG = logging.getLogger("axolotl.tests.e2e.multigpu") -os.environ["WANDB_DISABLED"] = "true" - - -class TestMultiGPUQwen2: - """ - Test case for Llama models using LoRA - """ - - @pytest.mark.parametrize("base_model", ["Qwen/Qwen2-0.5B", "Qwen/Qwen2.5-0.5B"]) - def test_qlora_fsdp_dpo(self, base_model, temp_dir): - # pylint: disable=duplicate-code - cfg = DictDefault( - { - "base_model": base_model, - "load_in_4bit": True, - "rl": "dpo", - "chat_template": "chatml", - "sequence_len": 2048, - "adapter": "qlora", - "lora_r": 8, - "lora_alpha": 16, - "lora_dropout": 0.05, - "lora_target_linear": True, - "val_set_size": 0.01, - "datasets": [ - { - "path": "Intel/orca_dpo_pairs", - "split": "train", - "type": "chatml.intel", - }, - ], - "num_epochs": 1, - "max_steps": 2, - "warmup_steps": 20, - "micro_batch_size": 2, - "gradient_accumulation_steps": 2, - "output_dir": temp_dir, - "learning_rate": 0.00001, - "optimizer": "adamw_torch_fused", - "lr_scheduler": "cosine", - "flash_attention": True, - "bf16": "auto", - "tf32": True, - # "gradient_checkpointing": True, - "gradient_checkpointing_kwargs": { - "use_reentrant": False, - }, - "fsdp": [ - "full_shard", - "auto_wrap", - ], - "fsdp_config": { - "fsdp_limit_all_gathers": 
True, - "fsdp_offload_params": False, - "fsdp_sync_module_states": True, - "fsdp_use_orig_params": False, - "fsdp_cpu_ram_efficient_loading": False, - "fsdp_transformer_layer_cls_to_wrap": "Qwen2DecoderLayer", - "fsdp_state_dict_type": "FULL_STATE_DICT", - "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", - "fsdp_sharding_strategy": "FULL_SHARD", - }, - } - ) - - # write cfg to yaml file - Path(temp_dir).mkdir(parents=True, exist_ok=True) - with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: - fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) - - execute_subprocess_async( - [ - "axolotl", - "train", - str(Path(temp_dir) / "config.yaml"), - "--num-processes", - "2", - "--main-process-port", - f"{get_torch_dist_unique_port()}", - ] - ) diff --git a/tests/e2e/multigpu/test_ray.py b/tests/e2e/multigpu/test_ray.py index 843adac91..7f1278abf 100644 --- a/tests/e2e/multigpu/test_ray.py +++ b/tests/e2e/multigpu/test_ray.py @@ -2,8 +2,6 @@ E2E tests for multigpu post-training use Ray Train """ -import logging -import os from pathlib import Path import pytest @@ -12,10 +10,11 @@ from accelerate.test_utils import execute_subprocess_async from axolotl.utils.dict import DictDefault -from tests.e2e.utils import check_tensorboard, require_torch_lt_2_6_0 - -LOG = logging.getLogger(__name__) -os.environ["WANDB_DISABLED"] = "true" +from tests.e2e.utils import ( + check_tensorboard, + require_torch_2_7_0, + require_torch_lt_2_6_0, +) AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent @@ -53,6 +52,7 @@ class TestMultiGPURay: "micro_batch_size": 4, "gradient_accumulation_steps": 2, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_8bit", "lr_scheduler": "cosine", @@ -60,6 +60,7 @@ class TestMultiGPURay: "use_tensorboard": True, "use_ray": True, "ray_num_workers": 2, + "save_first_step": False, } ) @@ -80,7 +81,7 @@ class TestMultiGPURay: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @require_torch_lt_2_6_0 @@ -112,12 +113,14 @@ class TestMultiGPURay: "micro_batch_size": 1, "gradient_accumulation_steps": gradient_accumulation_steps, "output_dir": temp_dir, + "dataset_prepared_path": temp_dir + "/last_run_prepared", "learning_rate": 0.00001, "optimizer": "adamw_torch", "lr_scheduler": "cosine", "flash_attention": True, "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"), "use_tensorboard": True, + "save_first_step": False, } ) @@ -138,5 +141,73 @@ class TestMultiGPURay: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" + ) + + @require_torch_2_7_0 + @pytest.mark.parametrize( + "gradient_accumulation_steps", + [1, 2], + ) + def test_sft_fsdp2_packed(self, temp_dir, gradient_accumulation_steps): + # pylint: disable=duplicate-code + cfg = DictDefault( + { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "sample_packing": True, + "pad_to_sequence_len": True, + "sequence_len": 1024, + "val_set_size": 0.01, + "special_tokens": { + "pad_token": "<|endoftext|>", + }, + "datasets": [ + { + "path": "tatsu-lab/alpaca", + "type": "alpaca", + "split": "train[:10%]", + }, + ], + "num_epochs": 1, + "max_steps": 2, + "micro_batch_size": 1, + "gradient_accumulation_steps": gradient_accumulation_steps, + "output_dir": temp_dir, + "dataset_prepared_path": temp_dir 
+ "/last_run_prepared", + "learning_rate": 0.00001, + "optimizer": "adamw_torch", + "lr_scheduler": "cosine", + "flash_attention": True, + "fsdp_version": 2, + "fsdp_config": { + "offload_params": False, + "cpu_ram_efficient_loading": False, + "transformer_layer_cls_to_wrap": "LlamaDecoderLayer", + "state_dict_type": "FULL_STATE_DICT", + "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", + "reshard_after_forward": True, + }, + "use_tensorboard": True, + "save_first_step": False, + } + ) + + # write cfg to yaml file + Path(temp_dir).mkdir(parents=True, exist_ok=True) + with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) + + execute_subprocess_async( + [ + "axolotl", + "train", + str(Path(temp_dir) / "config.yaml"), + "--use-ray", + "--ray-num-workers", + "2", + ] + ) + + check_tensorboard( + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/multigpu/test_tp.py b/tests/e2e/multigpu/test_tp.py new file mode 100644 index 000000000..87a1c6339 --- /dev/null +++ b/tests/e2e/multigpu/test_tp.py @@ -0,0 +1,69 @@ +"""multigpu e2e test for tensor parallelism.""" + +from pathlib import Path + +import pytest +import yaml +from accelerate.test_utils import execute_subprocess_async, get_torch_dist_unique_port + +from axolotl.utils.dict import DictDefault + +from tests.e2e.utils import check_tensorboard, require_torch_2_7_0 + + +class TestTensorParallel: + """Test class for Tensor Parallel functionality.""" + + @pytest.mark.skip( + reason="TP doesn't work with models with tied weights (embeddings)" + ) + @require_torch_2_7_0 + def test_fft_sft(self, temp_dir): + # pylint: disable=duplicate-code + cfg = DictDefault( + { + "base_model": "Qwen/Qwen2.5-0.5B", + "sequence_len": 2048, + "val_set_size": 0.01, + "datasets": [ + { + "path": "tatsu-lab/alpaca", + "type": "alpaca", + "split": "train[:10%]", + }, + ], + "num_epochs": 1, + "max_steps": 2, + "micro_batch_size": 2, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch", + "tensor_parallel_size": 2, + "lr_scheduler": "cosine", + "flash_attention": True, + "use_tensorboard": True, + "bf16": True, + } + ) + + # write cfg to yaml file + Path(temp_dir).mkdir(parents=True, exist_ok=True) + with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) + + execute_subprocess_async( + [ + "axolotl", + "train", + str(Path(temp_dir) / "config.yaml"), + "--num-processes", + "2", + "--main-process-port", + f"{get_torch_dist_unique_port()}", + ] + ) + + check_tensorboard( + temp_dir + "/runs", "train/train_loss", 1.0, "Train Loss (%s) is too high" + ) diff --git a/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py b/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py index f6b7ee9b9..b4dc5de54 100644 --- a/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py +++ b/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py @@ -21,8 +21,13 @@ from axolotl.kernels.lora import ( apply_lora_o, apply_lora_qkv, ) +from axolotl.loaders.model import ModelLoader +from axolotl.loaders.tokenizer import load_tokenizer from axolotl.monkeypatch.lora_kernels import ( apply_lora_kernel_patches, + find_self_attn_in_layer, + get_attention_cls_from_config, + get_layers, patch_self_attn_lora, ) from axolotl.utils.dict import DictDefault @@ -80,7 +85,7 @@ def small_llama_model(): ) def 
test_attention_patching_integration(model_name, attention_cls): """Test attention patching in integration context.""" - cfg = {"base_model": model_name} + cfg = DictDefault({"base_model": model_name}) # Store the original implementation original_forward = getattr(attention_cls, "forward") @@ -391,7 +396,7 @@ def test_model_architecture(model_config): # pylint: disable=duplicate-code -def test_kernel_training_integration(): +def test_kernel_training_integration(temp_dir): """Test model loading with kernel patches enabled.""" from axolotl.cli.utils import load_model_and_tokenizer @@ -421,6 +426,14 @@ def test_kernel_training_integration(): } ) + # Write cfg to yaml file + path = Path(temp_dir) / "config.yaml" + with open(path, "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) + + # Load config + cfg = load_cfg(str(path)) + # Load model model, _, _ = load_model_and_tokenizer(cfg=cfg) @@ -466,3 +479,103 @@ def test_kernel_training_integration_auto_enable(temp_dir): assert cfg.lora_mlp_kernel is True assert cfg.lora_qkv_kernel is True assert cfg.lora_o_kernel is True + + # Get the attention class before patching to check for side effects + attention_cls = get_attention_cls_from_config(cfg) + + # Store original state before patching + original_forward_method = attention_cls.forward + + # Load the model (this should trigger the patches) + tokenizer = load_tokenizer(cfg) + model, _ = ModelLoader(cfg, tokenizer).load() + + # Test side effects of patch_self_attn_lora + assert hasattr(attention_cls, "_original_forward") + assert attention_cls.forward != original_forward_method + + # Find at least one self-attention module and verify it has the patched methods + found_patched_attn = False + for layer in model.model.model.layers: + if hasattr(layer, "self_attn"): + self_attn = layer.self_attn + if all( + hasattr(self_attn, proj) + for proj in ["q_proj", "k_proj", "v_proj", "o_proj"] + ): + # These methods should be added by apply_lora_kernel_patches + assert hasattr(self_attn, "apply_qkv") and callable(self_attn.apply_qkv) + assert hasattr(self_attn, "apply_o") and callable(self_attn.apply_o) + + found_patched_attn = True + break + + assert found_patched_attn + + +def test_kernel_training_integration_dropout_non_zero(temp_dir): + """Test model loading with dropout non-zero should not patch.""" + + from axolotl.cli.utils import load_model_and_tokenizer + + # Create minimal config + cfg = DictDefault( + { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "tokenizer_config": "HuggingFaceTB/SmolLM2-135M", + "learning_rate": 0.000001, + "datasets": [ + { + "path": "mhenrichsen/alpaca_2k_test", + "type": "alpaca", + } + ], + "micro_batch_size": 1, + "gradient_accumulation_steps": 1, + "adapter": "lora", + "lora_r": 8, + "lora_alpha": 16, + "lora_dropout": 0.1, + "lora_target_linear": True, + "sequence_len": 1024, + } + ) + + # Write cfg to yaml file + path = Path(temp_dir) / "config.yaml" + with open(path, "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) + + # Load config + cfg = load_cfg(str(path)) + + # Get original attention class + attention_cls = get_attention_cls_from_config(cfg) + + # Store original state before patching + original_forward_method = attention_cls.forward + + # Load model + model, tokenizer, _ = load_model_and_tokenizer(cfg=cfg) + + # We call modelloader as that's where the patches are applied + # despite the fact that we're not using it to load the model + model_loader = ModelLoader(cfg, tokenizer) + + # 
Apply patch + model_loader.patch_manager._apply_self_attention_lora_patch() # pylint: disable=protected-access + + # Verify patch was not applied + assert attention_cls.forward == original_forward_method + + # Apply apply_lora_kernel_patches + model_loader.patch_manager._apply_lora_kernel_patch( # pylint: disable=protected-access + model + ) + + # Verify patch was not applied + layers = get_layers(model) + for layer in layers: + for self_attn in find_self_attn_in_layer(layer): + assert not hasattr(self_attn, "apply_qkv") + assert not hasattr(self_attn, "apply_o") diff --git a/tests/e2e/patched/test_4d_multipack_llama.py b/tests/e2e/patched/test_4d_multipack_llama.py index 12dd51c13..1824443e7 100644 --- a/tests/e2e/patched/test_4d_multipack_llama.py +++ b/tests/e2e/patched/test_4d_multipack_llama.py @@ -2,11 +2,8 @@ E2E tests for multipack fft llama using 4d attention masks """ -import logging -import os import unittest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -14,9 +11,6 @@ from axolotl.utils.dict import DictDefault from ..utils import check_model_output_exists, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class Test4dMultipackLlama(unittest.TestCase): """ @@ -61,12 +55,12 @@ class Test4dMultipackLlama(unittest.TestCase): "save_steps": 3, "eval_steps": 4, "fp16": True, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @@ -109,12 +103,12 @@ class Test4dMultipackLlama(unittest.TestCase): "save_steps": 3, "eval_steps": 4, "fp16": True, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/patched/test_activation_checkpointing.py b/tests/e2e/patched/test_activation_checkpointing.py index 45107b871..06e3de274 100644 --- a/tests/e2e/patched/test_activation_checkpointing.py +++ b/tests/e2e/patched/test_activation_checkpointing.py @@ -6,7 +6,6 @@ import pytest import transformers from torch.utils.checkpoint import checkpoint -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -70,13 +69,14 @@ class TestActivationCheckpointing: "bf16": True, "save_safetensors": True, "gradient_checkpointing": gradient_checkpointing, + "save_first_step": False, + "dataset_processes": 4, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/patched/test_fa_xentropy.py b/tests/e2e/patched/test_fa_xentropy.py index f71e4fb4a..38099b220 100644 --- a/tests/e2e/patched/test_fa_xentropy.py +++ b/tests/e2e/patched/test_fa_xentropy.py @@ -2,13 +2,9 @@ E2E tests for lora llama """ -import logging -import os - import pytest from transformers.utils 
import is_torch_bf16_gpu_available -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -16,9 +12,6 @@ from axolotl.utils.dict import DictDefault from ..utils import check_model_output_exists, check_tensorboard -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestFAXentropyLlama: """ @@ -69,6 +62,7 @@ class TestFAXentropyLlama: "optimizer": "adamw_8bit", "lr_scheduler": "cosine", "use_tensorboard": True, + "save_first_step": False, } ) if is_torch_bf16_gpu_available(): @@ -79,12 +73,11 @@ class TestFAXentropyLlama: cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/patched/test_falcon_samplepack.py b/tests/e2e/patched/test_falcon_samplepack.py index 667b62ffb..ef31b11c7 100644 --- a/tests/e2e/patched/test_falcon_samplepack.py +++ b/tests/e2e/patched/test_falcon_samplepack.py @@ -2,13 +2,10 @@ E2E tests for falcon """ -import logging -import os import unittest import pytest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -16,9 +13,6 @@ from axolotl.utils.dict import DictDefault from ..utils import check_model_output_exists, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestFalconPatched(unittest.TestCase): """ @@ -64,12 +58,12 @@ class TestFalconPatched(unittest.TestCase): "save_steps": 10, "eval_steps": 10, "bf16": "auto", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @@ -106,12 +100,12 @@ class TestFalconPatched(unittest.TestCase): "save_steps": 10, "eval_steps": 10, "bf16": "auto", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/patched/test_flattening.py b/tests/e2e/patched/test_flattening.py new file mode 100644 index 000000000..fdaab558d --- /dev/null +++ b/tests/e2e/patched/test_flattening.py @@ -0,0 +1,82 @@ +""" +E2E tests for flattening batches +""" + +import pytest +from transformers.utils import is_torch_bf16_gpu_available + +from axolotl.common.datasets import load_datasets +from axolotl.train import train +from axolotl.utils.config import normalize_config, validate_config +from axolotl.utils.dict import DictDefault + +from ..utils import check_model_output_exists, check_tensorboard + + +class TestFAFlattening: + """ + Test case for Llama models using LoRA w batch flattening + """ + + @pytest.mark.parametrize( + "gradient_accumulation_steps", + [1, 4], + ) + def 
test_lora_packing_flattening(self, temp_dir, gradient_accumulation_steps): + # pylint: disable=duplicate-code + cfg = DictDefault( + { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "sequence_len": 1024, + "batch_flattening": True, + "flash_attention": True, + "load_in_8bit": True, + "adapter": "lora", + "lora_r": 8, + "lora_alpha": 16, + "lora_dropout": 0.05, + "lora_target_linear": True, + "val_set_size": 0.05, + "special_tokens": { + "pad_token": "<|endoftext|>", + }, + "chat_template": "chatml", + "datasets": [ + { + "path": "mlabonne/FineTome-100k", + "field_messages": "conversations", + "message_field_content": "value", + "message_field_role": "from", + "type": "chat_template", + "split": "train[:2%]", + }, + ], + "num_epochs": 1, + "max_steps": 5, + "save_steps": 5, + "micro_batch_size": 2, + "gradient_accumulation_steps": gradient_accumulation_steps, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_8bit", + "lr_scheduler": "cosine", + "use_tensorboard": True, + "save_first_step": False, + } + ) + if is_torch_bf16_gpu_available(): + cfg.bf16 = True + else: + cfg.fp16 = True + + cfg = validate_config(cfg) + normalize_config(cfg) + + dataset_meta = load_datasets(cfg=cfg) + + train(cfg=cfg, dataset_meta=dataset_meta) + check_model_output_exists(temp_dir, cfg) + + check_tensorboard( + temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high" + ) diff --git a/tests/e2e/patched/test_fsdp2_qlora.py b/tests/e2e/patched/test_fsdp2_qlora.py new file mode 100644 index 000000000..9dd053ad8 --- /dev/null +++ b/tests/e2e/patched/test_fsdp2_qlora.py @@ -0,0 +1,131 @@ +"""Integration tests for FSDP Params4bit patches.""" + +from unittest.mock import Mock, patch + +import bitsandbytes as bnb +import pytest +import torch +from torch.distributed.fsdp._fully_shard._fsdp_param import FSDPParam + +from axolotl.monkeypatch.fsdp2_qlora import ( + apply_bnb_torch_function_patch, + patched_torch_function, +) + + +@pytest.fixture +def mock_params4bit(): + """Create a mock Params4bit instance with test attributes.""" + mock_instance = Mock() + mock_instance.requires_grad = True + mock_instance.quant_state = "test_state" + mock_instance.blocksize = 128 + mock_instance.compress_statistics = True + mock_instance.quant_type = "fp4" + mock_instance.quant_storage = "test_storage" + mock_instance.module = "test_module" + mock_instance.bnb_quantized = True + return mock_instance + + +class TestBnbTorchFunctionPatch: + """Test the Params4bit.__torch_function__ patch.""" + + def test_apply_patch(self): + """Test that the patch can be applied.""" + with patch("bitsandbytes.nn.modules.Params4bit") as mock_cls: + apply_bnb_torch_function_patch() + assert hasattr(mock_cls, "__torch_function__") + assert isinstance(mock_cls.__torch_function__, classmethod) + + # pylint: disable=redefined-outer-name + def test_torch_chunk_preserves_attributes(self, mock_params4bit): + """Test that torch.chunk preserves Params4bit attributes.""" + mock_cls = Mock() + chunks = (torch.tensor([1, 2]), torch.tensor([3, 4])) + + with patch("torch.nn.Parameter.__torch_function__", return_value=chunks): + result = patched_torch_function( + mock_cls, + torch.chunk, + (type(mock_params4bit),), + args=(mock_params4bit, 2), + ) + + assert isinstance(result, tuple) + assert len(result) == 2 + + # Check that Params4bit constructor was called with preserved attributes + assert mock_cls.call_count == 2 + for call in mock_cls.call_args_list: + kwargs = call[1] + assert kwargs["requires_grad"] == 
mock_params4bit.requires_grad + assert kwargs["quant_state"] == mock_params4bit.quant_state + assert kwargs["blocksize"] == mock_params4bit.blocksize + + # pylint: disable=redefined-outer-name + def test_other_functions_fallback(self, mock_params4bit): + """Test that non-chunk/split functions use Parameter fallback.""" + mock_cls = Mock() + fallback_result = torch.tensor([5, 6, 7]) + + with patch( + "torch.nn.Parameter.__torch_function__", return_value=fallback_result + ) as mock_fallback: + result = patched_torch_function( + mock_cls, torch.add, (type(mock_params4bit),), args=(mock_params4bit, 1) + ) + + # Should call Parameter.__torch_function__ and return its result + mock_fallback.assert_called_once() + assert result is fallback_result + mock_cls.assert_not_called() + + +class TestFSDPPatchIntegration: + """Test FSDP patch integration.""" + + @pytest.mark.integration + def test_all_patches_together(self): + """Test that all patches can be applied together.""" + from axolotl.monkeypatch.fsdp2_qlora import ( + apply_init_sharded_param_patch, + apply_init_unsharded_param_patch, + ) + + # Store original methods before patching + original_torch_function = getattr( + bnb.nn.modules.Params4bit, "__torch_function__", None + ) + + # pylint: disable=protected-access + original_init_sharded = FSDPParam._init_sharded_param + original_init_unsharded = FSDPParam.init_unsharded_param + + # Apply patches + apply_bnb_torch_function_patch() + apply_init_sharded_param_patch() + apply_init_unsharded_param_patch() + + # Verify patches were applied + current_torch_function = getattr( + bnb.nn.modules.Params4bit, "__torch_function__", None + ) + if original_torch_function is not None: + assert ( + current_torch_function != original_torch_function + ), "Params4bit.__torch_function__ was not patched" + else: + assert ( + current_torch_function is not None + ), "Params4bit.__torch_function__ was not added" + + # Check that FSDP methods were patched + assert ( + # pylint: disable=protected-access + FSDPParam._init_sharded_param + != original_init_sharded + ), "_init_sharded_param was not patched" + assert ( + FSDPParam.init_unsharded_param != original_init_unsharded + ), "init_unsharded_param was not patched" diff --git a/tests/e2e/patched/test_fused_llama.py b/tests/e2e/patched/test_fused_llama.py index 7725e095d..f0c4f155f 100644 --- a/tests/e2e/patched/test_fused_llama.py +++ b/tests/e2e/patched/test_fused_llama.py @@ -2,14 +2,11 @@ E2E tests for lora llama """ -import logging -import os import unittest import pytest from transformers.utils import is_torch_bf16_gpu_available -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -17,9 +14,6 @@ from axolotl.utils.dict import DictDefault from ..utils import check_model_output_exists, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - @pytest.mark.skip("FIXME, mostly underused functionality") class TestFusedLlama(unittest.TestCase): @@ -35,7 +29,6 @@ class TestFusedLlama(unittest.TestCase): "base_model": "HuggingFaceTB/SmolLM2-135M", "flash_attention": True, "pad_to_sequence_len": True, - "flash_attn_fuse_qkv": True, "flash_attn_fuse_mlp": True, "sample_packing": True, "sequence_len": 1024, @@ -59,6 +52,7 @@ class TestFusedLlama(unittest.TestCase): "max_steps": 10, "save_steps": 5, "eval_steps": 5, + "save_first_step": False, } ) if is_torch_bf16_gpu_available(): @@ -67,8 
+61,7 @@ class TestFusedLlama(unittest.TestCase): cfg.fp16 = True cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/patched/test_llama_s2_attention.py b/tests/e2e/patched/test_llama_s2_attention.py index 3cf43ba9d..ba5556a59 100644 --- a/tests/e2e/patched/test_llama_s2_attention.py +++ b/tests/e2e/patched/test_llama_s2_attention.py @@ -2,13 +2,10 @@ E2E tests for llama w/ S2 attn """ -import logging -import os import unittest import pytest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -16,9 +13,6 @@ from axolotl.utils.dict import DictDefault from ..utils import check_model_output_exists, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - @pytest.mark.skip(reason="FIXME?") class TestLlamaShiftedSparseAttention(unittest.TestCase): @@ -64,13 +58,13 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase): "save_steps": 5, "eval_steps": 5, "bf16": "auto", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @@ -107,13 +101,13 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase): "save_steps": 5, "eval_steps": 5, "bf16": "auto", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/patched/test_lora_llama_multipack.py b/tests/e2e/patched/test_lora_llama_multipack.py index ca989f241..fdf6adbc6 100644 --- a/tests/e2e/patched/test_lora_llama_multipack.py +++ b/tests/e2e/patched/test_lora_llama_multipack.py @@ -2,14 +2,11 @@ E2E tests for lora llama """ -import logging -import os import unittest import pytest from transformers.utils import is_auto_gptq_available, is_torch_bf16_gpu_available -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -17,9 +14,6 @@ from axolotl.utils.dict import DictDefault from ..utils import check_model_output_exists, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestLoraLlama(unittest.TestCase): """ @@ -61,6 +55,7 @@ class TestLoraLlama(unittest.TestCase): "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", + "save_first_step": False, } ) if is_torch_bf16_gpu_available(): @@ -70,8 +65,7 @@ class TestLoraLlama(unittest.TestCase): cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @@ -115,12 +109,12 @@ class TestLoraLlama(unittest.TestCase): "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", 
"lr_scheduler": "cosine", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/patched/test_mistral_samplepack.py b/tests/e2e/patched/test_mistral_samplepack.py index fe8fafb19..bea0f9c68 100644 --- a/tests/e2e/patched/test_mistral_samplepack.py +++ b/tests/e2e/patched/test_mistral_samplepack.py @@ -2,20 +2,14 @@ E2E tests for lora llama """ -import logging -import os import unittest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -from ..utils import check_model_output_exists, with_temp_dir - -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" +from ..utils import check_model_output_exists, require_torch_2_6_0, with_temp_dir class TestMistral(unittest.TestCase): @@ -23,6 +17,7 @@ class TestMistral(unittest.TestCase): Test case for Llama models using LoRA """ + @require_torch_2_6_0 @with_temp_dir def test_lora_packing(self, temp_dir): # pylint: disable=duplicate-code @@ -61,12 +56,12 @@ class TestMistral(unittest.TestCase): "save_steps": 3, "eval_steps": 4, "bf16": "auto", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @@ -103,12 +98,12 @@ class TestMistral(unittest.TestCase): "save_steps": 3, "eval_steps": 4, "bf16": "auto", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/patched/test_mixtral_samplepack.py b/tests/e2e/patched/test_mixtral_samplepack.py index ebc2ba092..09e427abd 100644 --- a/tests/e2e/patched/test_mixtral_samplepack.py +++ b/tests/e2e/patched/test_mixtral_samplepack.py @@ -2,11 +2,8 @@ E2E tests for mixtral """ -import logging -import os import unittest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -14,9 +11,6 @@ from axolotl.utils.dict import DictDefault from ..utils import check_model_output_exists, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestMixtral(unittest.TestCase): """ @@ -58,12 +52,12 @@ class TestMixtral(unittest.TestCase): "save_steps": 3, "eval_steps": 4, "bf16": "auto", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @@ -97,12 +91,12 @@ class TestMixtral(unittest.TestCase): "save_steps": 3, "eval_steps": 4, "bf16": "auto", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = 
TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/patched/test_model_patches.py b/tests/e2e/patched/test_model_patches.py index 5ea88b001..b90be23e4 100644 --- a/tests/e2e/patched/test_model_patches.py +++ b/tests/e2e/patched/test_model_patches.py @@ -45,6 +45,7 @@ class TestModelPatches(unittest.TestCase): "max_steps": 20, "save_steps": 10, "eval_steps": 10, + "save_first_step": False, } ) cfg = validate_config(cfg) @@ -78,6 +79,7 @@ class TestModelPatches(unittest.TestCase): "max_steps": 20, "save_steps": 10, "eval_steps": 10, + "save_first_step": False, } ) cfg = validate_config(cfg) diff --git a/tests/e2e/patched/test_peft_embeddings.py b/tests/e2e/patched/test_peft_embeddings.py index d4f59a128..4769319ae 100644 --- a/tests/e2e/patched/test_peft_embeddings.py +++ b/tests/e2e/patched/test_peft_embeddings.py @@ -49,6 +49,7 @@ class TestLlamaPeftEmbeddings: "bf16": "auto", "save_safetensors": True, "embeddings_skip_upcast": True, + "save_first_step": False, } ) diff --git a/tests/e2e/patched/test_phi_multipack.py b/tests/e2e/patched/test_phi_multipack.py index d8130d119..1f0ddd630 100644 --- a/tests/e2e/patched/test_phi_multipack.py +++ b/tests/e2e/patched/test_phi_multipack.py @@ -2,11 +2,8 @@ E2E tests for lora llama """ -import logging -import os import unittest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -14,9 +11,6 @@ from axolotl.utils.dict import DictDefault from ..utils import check_model_output_exists, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestPhiMultipack(unittest.TestCase): """ @@ -60,13 +54,13 @@ class TestPhiMultipack(unittest.TestCase): "eval_steps": 3, "save_steps": 4, "bf16": "auto", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @@ -112,13 +106,13 @@ class TestPhiMultipack(unittest.TestCase): "eval_steps": 3, "save_steps": 4, "bf16": "auto", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/patched/test_resume.py b/tests/e2e/patched/test_resume.py index 61e4a0e03..54b8245ee 100644 --- a/tests/e2e/patched/test_resume.py +++ b/tests/e2e/patched/test_resume.py @@ -2,14 +2,11 @@ E2E tests for resuming training """ -import logging -import os import re import subprocess from transformers.utils import is_torch_bf16_gpu_available -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -17,9 +14,6 @@ from axolotl.utils.dict import DictDefault from ..utils import check_model_output_exists, most_recent_subdir, require_torch_2_6_0 -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestResumeLlama: """ @@ -64,6 
+58,7 @@ class TestResumeLlama: "max_steps": 15, "use_tensorboard": True, "save_safetensors": True, + "save_first_step": False, } ) if is_torch_bf16_gpu_available(): @@ -72,8 +67,7 @@ class TestResumeLlama: cfg.fp16 = True cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) @@ -83,7 +77,6 @@ class TestResumeLlama: } ) normalize_config(resume_cfg) - cli_args = TrainerCliArgs() train(cfg=resume_cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/patched/test_sp.py b/tests/e2e/patched/test_sp.py deleted file mode 100644 index 2b4d11b30..000000000 --- a/tests/e2e/patched/test_sp.py +++ /dev/null @@ -1,480 +0,0 @@ -"""Tests for sequence parallelism functionality.""" - -# pylint: disable=redefined-outer-name,unused-argument - -import functools -import sys -from unittest.mock import MagicMock, patch - -import pytest -import torch -from accelerate.state import PartialState - -from axolotl.monkeypatch.ring_attn import ( - get_ring_attn_group, - register_ring_attn, - set_ring_attn_group, -) -from axolotl.utils.ctx_managers.sequence_parallel import apply_sequence_parallelism -from axolotl.utils.dict import DictDefault -from axolotl.utils.schemas.enums import RingAttnFunc -from axolotl.utils.schemas.trl import TRLConfig - - -@pytest.fixture -def partial_state(): - """Create a real PartialState instance for testing.""" - state = PartialState() - return state - - -@pytest.fixture(name="cfg") -def fixture_cfg(): - cfg = DictDefault( - { - "base_model": "HuggingFaceTB/SmolLM2-135M", - "datasets": [ - { - "path": "mhenrichsen/alpaca_2k_test", - "type": "alpaca", - }, - ], - "micro_batch_size": 1, - "gradient_accumulation_steps": 1, - "learning_rate": 1e-3, - "output_dir": "./model-out", - "sequence_len": 512, - "special_tokens": { - "pad_token": "<|endoftext|>", - }, - } - ) - - return cfg - - -@pytest.fixture -def sequence_parallel_batch(): - """Create a test batch for sequence parallelism tests.""" - batch_size = 1 - seq_len = 8 - - # Create test tensors - input_ids = torch.arange(batch_size * seq_len).reshape(batch_size, seq_len) - attention_mask = torch.ones(batch_size, seq_len) - position_ids = torch.arange(seq_len).expand(batch_size, seq_len) - labels = input_ids.clone() - - # Create test batch - batch = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "position_ids": position_ids, - "labels": labels, - } - - return batch - - -class TestRingAttention: - """Tests for the ring attention functionality.""" - - @patch("torch.distributed.get_rank") - @patch("torch.distributed.get_world_size") - def test_get_ring_attn_group_no_registration( - self, mock_world_size, mock_rank, partial_state - ): - """Test that get_ring_attn_group raises RuntimeError when no group has been registered.""" - # Setup mocks - mock_world_size.return_value = 4 - mock_rank.return_value = 0 - - # Verify that RuntimeError is raised when no group is registered - with pytest.raises( - RuntimeError, match="register_ring_attn\\(\\) not yet called" - ): - get_ring_attn_group() - - @patch("torch.distributed.new_group") - @patch("torch.distributed.get_rank") - @patch("torch.distributed.get_world_size") - def test_register_ring_attn( - self, mock_world_size, mock_rank, mock_new_group, partial_state - ): - """Test that ring attention groups are created correctly.""" - # Setup mocks - mock_world_size.return_value = 8 # 
8 GPUs total - mock_rank.return_value = 3 # GPU #3 - mock_group = MagicMock() - mock_new_group.return_value = mock_group - - # Call register_ring_attn with size 4 - register_ring_attn( - sequence_parallel_degree=4, - heads_k_stride=1, - ring_attn_func=RingAttnFunc.VARLEN_LLAMA3, - ) - - # Verify the number of calls without examining the arguments - assert mock_new_group.call_count == 2 - - # Verify that new_group was called - mock_new_group.assert_called() - - # Clean up - set_ring_attn_group(None) - - -class TestConfigValidation: - """Tests for validating sequence parallelism configurations.""" - - @pytest.fixture(autouse=True) - def setup_mocks(self, monkeypatch): - """Set up mocks for all tests in this class.""" - # Mock the ring_flash_attn module - monkeypatch.setitem(sys.modules, "ring_flash_attn", MagicMock()) - - @pytest.fixture - def base_cfg(self): - """Create a base configuration for testing.""" - return DictDefault( - { - "base_model": "HuggingFaceTB/SmolLM2-135M", - "datasets": [{"path": "mhenrichsen/alpaca_2k_test", "type": "alpaca"}], - "micro_batch_size": 1, - "gradient_accumulation_steps": 1, - "learning_rate": 1e-3, - "output_dir": "./model-out", - "sequence_len": 512, - "special_tokens": {"pad_token": "<|endoftext|>"}, - } - ) - - @pytest.mark.parametrize( - "config_updates, expected_values, should_pass, error_msg", - [ - # Valid configuration - ( - {"sequence_parallel_degree": 2, "flash_attention": True}, - {"sequence_parallel_degree": 2, "flash_attention": True}, - True, - None, - ), - # Default sequence_parallel_degree - ({}, {"sequence_parallel_degree": 1}, True, None), - # Invalid: sequence_parallel_degree > 1 without flash_attention - ( - {"sequence_parallel_degree": 2, "flash_attention": False}, - None, - False, - "flash_attention: true must be set", - ), - # Invalid: sequence_parallel_degree > 1 with sample_packing and micro_batch_size > 1 - ( - { - "sequence_parallel_degree": 2, - "flash_attention": True, - "sample_packing": True, - "micro_batch_size": 2, - "pad_to_sequence_len": True, - }, - None, - False, - "micro_batch_size must be set to 1", - ), - # Valid: Basic GRPO config - ( - { - "sequence_parallel_degree": 2, - "flash_attention": True, - "micro_batch_size": 2, - "trl": {"use_liger_loss": True}, - }, - { - "sequence_parallel_degree": 2, - "flash_attention": True, - "micro_batch_size": 2, - "trl": TRLConfig(use_liger_loss=True), - }, - True, - "GRPO + SP + Liger not currently supported", - ), - # Invalid: GRPO config with Liger loss - ( - { - "rl": "grpo", - "sequence_parallel_degree": 2, - "flash_attention": True, - "micro_batch_size": 2, - "trl": {"use_liger_loss": True}, - }, - None, - False, - "GRPO + SP + Liger not currently supported", - ), - ], - ids=[ - "valid_config", - "default_sp_degree", - "without_flash_attention", - "sample_packing_with_large_batch", - "valid_grpo", - "grpo_with_liger_loss", - ], - ) - def test_sequence_parallel_config_validation( - self, base_cfg, config_updates, expected_values, should_pass, error_msg - ): - """Test various sequence parallelism configuration scenarios.""" - from axolotl.utils.schemas.config import AxolotlInputConfig - - # Apply updates to base config - cfg = base_cfg - cfg.update(config_updates) - - if should_pass: - # Should validate without errors - config = AxolotlInputConfig(**cfg) - - # Check expected values - for key, value in expected_values.items(): - assert getattr(config, key) == value - else: - # Should raise exception - with pytest.raises(ValueError) as excinfo: - AxolotlInputConfig(**cfg) - 
assert error_msg in str(excinfo.value) - - @pytest.mark.parametrize( - "ring_attn_func, sample_packing, expected_func", - [ - (None, True, RingAttnFunc.VARLEN_LLAMA3), - (None, False, RingAttnFunc.BATCH_RING), - ], - ids=["default_with_sample_packing", "default_without_sample_packing"], - ) - def test_ring_attn_func_validation( - self, base_cfg, ring_attn_func, sample_packing, expected_func - ): - """Test ring_attn_func validation and defaults.""" - from axolotl.utils.schemas.config import AxolotlInputConfig - - # Apply updates to base config - cfg = base_cfg | { - "sequence_parallel_degree": 2, - "flash_attention": True, - "sample_packing": sample_packing, - } - - if ring_attn_func is not None: - cfg["ring_attn_func"] = ring_attn_func - - # Should validate without errors - config = AxolotlInputConfig(**cfg) - - # Check ring_attn_func value - assert config.ring_attn_func.value == expected_func - - def test_invalid_ring_attn_func(self, base_cfg): - """Test that an invalid ring_attn_func is rejected.""" - from axolotl.utils.schemas.config import AxolotlInputConfig - - # Invalid configuration with invalid ring_attn_func - cfg = base_cfg | { - "sequence_parallel_degree": 2, - "flash_attention": True, - "ring_attn_func": "INVALID_FUNC", - } - - # Should raise ValidationError - with pytest.raises(ValueError) as excinfo: - AxolotlInputConfig(**cfg) - - # Verify error message - assert "Input should be 'varlen_llama3' or 'batch_ring'" in str(excinfo.value) - - -class TestApplySequenceParallelism: - """Tests for the apply_sequence_parallelism function.""" - - @pytest.fixture(autouse=True) - def mock_distributed(self, monkeypatch): - """Mock torch.distributed functions for testing.""" - # Mock is_initialized to return True - monkeypatch.setattr(torch.distributed, "is_initialized", lambda: True) - - # Mock get_rank to return 0 by default - monkeypatch.setattr(torch.distributed, "get_rank", lambda *args, **kwargs: 0) - - # Mock get_world_size to return 2 by default - monkeypatch.setattr( - torch.distributed, "get_world_size", lambda *args, **kwargs: 2 - ) - - # Mock the process group - monkeypatch.setattr( - "axolotl.monkeypatch.ring_attn.get_ring_attn_group", - MagicMock, - ) - - # Mock update_ring_attn_params - monkeypatch.setattr( - "axolotl.monkeypatch.ring_attn.update_ring_attn_params", - lambda **kwargs: None, - ) - - @patch("axolotl.monkeypatch.ring_attn.patch.get_ring_attn_group") - def test_world_size_one(self, mock_get_ring_attn_group, sequence_parallel_batch): - """Test that function returns original batch when world size is 1.""" - mock_get_ring_attn_group.return_value = 0 - - result, _, _ = apply_sequence_parallelism( - batch=sequence_parallel_batch, - local_rank=0, - local_world_size=1, - gradient_accumulation_steps=1, - ring_attn_func=RingAttnFunc.BATCH_RING, - ) - - # Should return the original batch unchanged - assert result == sequence_parallel_batch - - @patch("axolotl.monkeypatch.ring_attn.patch.get_ring_attn_group") - def test_batch_ring_rank0(self, mock_get_ring_attn_group, sequence_parallel_batch): - """Test BATCH_RING sharding for rank 0 in a 2-process group.""" - mock_get_ring_attn_group.return_value = 0 - - batch = sequence_parallel_batch - seq_len = batch["input_ids"].size(1) - - result, _, _ = apply_sequence_parallelism( - batch=batch, - local_rank=0, - local_world_size=2, - gradient_accumulation_steps=1, - ring_attn_func=RingAttnFunc.BATCH_RING, - ) - - # Check that sequence dimension was sharded correctly - assert result["input_ids"].shape[1] == seq_len // 2 - assert 
result["attention_mask"].shape[1] == seq_len // 2 - - # Verify content: rank 0 should get the first half of the sequence - assert torch.equal(result["input_ids"], batch["input_ids"][:, : seq_len // 2]) - assert torch.equal( - result["position_ids"], batch["position_ids"][:, : seq_len // 2] - ) - - @patch("axolotl.monkeypatch.ring_attn.patch.get_ring_attn_group") - def test_batch_ring_rank1(self, mock_get_ring_attn_group, sequence_parallel_batch): - """Test BATCH_RING sharding for rank 1 in a 2-process group.""" - mock_get_ring_attn_group.return_value = 0 - - batch = sequence_parallel_batch - seq_len = batch["input_ids"].size(1) - original_input_ids = batch["input_ids"].clone() - - result, _, _ = apply_sequence_parallelism( - batch=batch, - local_rank=1, - local_world_size=2, - gradient_accumulation_steps=1, - ring_attn_func=RingAttnFunc.BATCH_RING, - ) - - # Verify content: rank 1 should get the second half of the sequence - assert torch.equal(result["input_ids"], original_input_ids[:, seq_len // 2 :]) - - # TODO(djsaunde): add back once implemented. - # def test_batch_zigzag(self, sequence_parallel_batch): - # """Test BATCH_ZIGZAG sharding pattern.""" - # batch = sequence_parallel_batch - # original_input_ids = batch["input_ids"].clone() - # seq_len = batch["input_ids"].size(1) - - # # Test rank 0 - # result_rank0 = apply_sequence_parallelism( - # batch={k: v.clone() for k, v in batch.items()}, - # local_rank=0, - # local_world_size=2, - # ring_attn_func=RingAttnFunc.BATCH_ZIGZAG, - # ) - - # # Test rank 1 - # result_rank1 = apply_sequence_parallelism( - # batch={k: v.clone() for k, v in batch.items()}, - # local_rank=1, - # local_world_size=2, - # ring_attn_func=RingAttnFunc.BATCH_ZIGZAG, - # ) - - # # Checks for both ranks - # assert result_rank0["input_ids"].shape[1] == seq_len // 2 - # assert result_rank1["input_ids"].shape[1] == seq_len // 2 - - # # For a 2-rank system with 8 tokens, check specific zigzag pattern - # # Rank 0 should get chunks [0, 1] and [6, 7] - # # Rank 1 should get chunks [2, 3] and [4, 5] - # if seq_len == 8: - # # Create expected tensors for comparison - # rank0_expected = torch.cat( - # [original_input_ids[:, :2], original_input_ids[:, 6:8]], dim=1 - # ) - - # rank1_expected = torch.cat( - # [original_input_ids[:, 2:4], original_input_ids[:, 4:6]], dim=1 - # ) - - # assert torch.equal(result_rank0["input_ids"], rank0_expected) - # assert torch.equal(result_rank1["input_ids"], rank1_expected) - - @patch("axolotl.monkeypatch.ring_attn.patch.get_ring_attn_group") - def test_partial_application( - self, mock_get_ring_attn_group, sequence_parallel_batch - ): - """Test that we can create a partially applied version of the function.""" - mock_get_ring_attn_group.return_value = 0 - - batch = sequence_parallel_batch - original_input_ids = batch["input_ids"].clone() - - # Create a partially applied function - rank0_ring_parallel = functools.partial( - apply_sequence_parallelism, - local_rank=0, - local_world_size=2, - gradient_accumulation_steps=1, - ring_attn_func=RingAttnFunc.BATCH_RING, - ) - - # Use the partially applied function - result, _, _ = rank0_ring_parallel(batch=batch) - - # Verify it works as expected - assert result["input_ids"].shape[1] == original_input_ids.shape[1] // 2 - assert torch.equal( - result["input_ids"], - original_input_ids[:, : original_input_ids.shape[1] // 2], - ) - - def test_missing_position_ids(self, sequence_parallel_batch): - """Test handling of batch without position_ids.""" - # Create a batch without position_ids - batch = { - k: v 
for k, v in sequence_parallel_batch.items() if k != "position_ids" - } - original_input_ids = batch["input_ids"].clone() - - # This should run without error even though position_ids is missing - result, _, _ = apply_sequence_parallelism( - batch=batch, - local_rank=0, - local_world_size=2, - gradient_accumulation_steps=1, - ring_attn_func=RingAttnFunc.BATCH_RING, - ) - - # Verification should pass - assert "position_ids" in result - assert result["input_ids"].shape[1] == result["position_ids"].shape[1] - assert result["input_ids"].shape[1] == original_input_ids.shape[1] // 2 diff --git a/tests/e2e/patched/test_unsloth_qlora.py b/tests/e2e/patched/test_unsloth_qlora.py index 5f8fde6b4..2c8ee4eb0 100644 --- a/tests/e2e/patched/test_unsloth_qlora.py +++ b/tests/e2e/patched/test_unsloth_qlora.py @@ -2,12 +2,8 @@ e2e tests for unsloth qlora """ -import logging -import os - import pytest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -15,9 +11,6 @@ from axolotl.utils.dict import DictDefault from ..utils import check_model_output_exists, check_tensorboard -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - # pylint: disable=duplicate-code @pytest.mark.skip( @@ -69,19 +62,19 @@ class TestUnslothQLoRA: "lr_scheduler": "cosine", "use_tensorboard": True, "bf16": "auto", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high" ) def test_unsloth_llama_qlora_unpacked(self, temp_dir): @@ -120,19 +113,19 @@ class TestUnslothQLoRA: "lr_scheduler": "cosine", "use_tensorboard": True, "bf16": "auto", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( @@ -176,17 +169,17 @@ class TestUnslothQLoRA: "lr_scheduler": "cosine", "use_tensorboard": True, "fp16": True, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/solo/test_flex.py b/tests/e2e/solo/test_flex.py index 71da795f8..76364fc0e 100644 --- a/tests/e2e/solo/test_flex.py +++ b/tests/e2e/solo/test_flex.py @@ -2,13 +2,10 @@ E2E tests for packed training w/ flex attention """ -import logging -import os import unittest from transformers.utils import is_torch_bf16_gpu_available -from axolotl.cli.args import TrainerCliArgs from 
axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -16,9 +13,6 @@ from axolotl.utils.dict import DictDefault from ..utils import check_tensorboard, require_torch_2_6_0, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestPackedFlex(unittest.TestCase): """ @@ -55,6 +49,7 @@ class TestPackedFlex(unittest.TestCase): "lr_scheduler": "cosine", "max_steps": 5, "use_tensorboard": True, + "save_first_step": False, } ) if is_torch_bf16_gpu_available(): @@ -64,11 +59,10 @@ class TestPackedFlex(unittest.TestCase): cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/solo/test_relora_llama.py b/tests/e2e/solo/test_relora_llama.py index 504466b90..b399b4680 100644 --- a/tests/e2e/solo/test_relora_llama.py +++ b/tests/e2e/solo/test_relora_llama.py @@ -2,12 +2,9 @@ E2E tests for relora llama """ -import logging -import os import unittest from pathlib import Path -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -15,9 +12,6 @@ from axolotl.utils.dict import DictDefault from ..utils import check_model_output_exists, check_tensorboard, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestReLoraLlama(unittest.TestCase): """ @@ -40,9 +34,10 @@ class TestReLoraLlama(unittest.TestCase): "lora_alpha": 16, "lora_dropout": 0.05, "lora_target_modules": ["q_proj", "v_proj"], - "relora_steps": 50, - "relora_warmup_steps": 10, - "relora_anneal_steps": 10, + "relora": True, + "jagged_restart_steps": 50, + "jagged_restart_warmup_steps": 10, + "jagged_restart_anneal_steps": 10, "relora_prune_ratio": 0.9, "relora_cpu_offload": True, "val_set_size": 0.0, @@ -71,13 +66,13 @@ class TestReLoraLlama(unittest.TestCase): "lr_scheduler": "cosine", "save_safetensors": True, "use_tensorboard": True, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(Path(temp_dir) / "checkpoint-100/adapter", cfg) diff --git a/tests/e2e/test_activation_offloading.py b/tests/e2e/test_activation_offloading.py new file mode 100644 index 000000000..06c5c0656 --- /dev/null +++ b/tests/e2e/test_activation_offloading.py @@ -0,0 +1,83 @@ +""" +E2E tests for activation offloading +""" + +import pytest + +from axolotl.common.datasets import load_datasets +from axolotl.train import train +from axolotl.utils.config import normalize_config, validate_config +from axolotl.utils.dict import DictDefault + +from .utils import check_model_output_exists + +# pylint: disable=duplicate-code + + +class TestActivationOffloading: + """ + E2E test cases for activation offloading + """ + + @pytest.mark.parametrize( + "adapter", + ["lora", "qlora", None], + ) + def test_activation_offloading( + self, + temp_dir, + adapter, + ): + # pylint: 
disable=duplicate-code + cfg = DictDefault( + { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "sequence_len": 1024, + "val_set_size": 0.0, + "special_tokens": { + "pad_token": "<|endoftext|>", + "eos_token": "<|im_end|>", + }, + "datasets": [ + { + "chat_template": "chatml", + "path": "mlabonne/FineTome-100k", + "type": "chat_template", + "split": "train[:10%]", + "field_messages": "conversations", + "message_field_role": "from", + "message_field_content": "value", + }, + ], + "num_epochs": 1, + "max_steps": 2, + "micro_batch_size": 1, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch_8bit", + "lr_scheduler": "cosine", + "flash_attention": True, + "sample_packing": True, + "bf16": "auto", + "save_safetensors": True, + "gradient_checkpointing": True, + "activation_offloading": True, + "save_first_step": False, + "lora_r": 8, + "lora_alpha": 16, + "lora_target_linear": True, + } + ) + if adapter == "lora": + cfg["adapter"] = "lora" + if adapter == "qlora": + cfg["adapter"] = "qlora" + cfg["load_in_4bit"] = True + + cfg = validate_config(cfg) + normalize_config(cfg) + dataset_meta = load_datasets(cfg=cfg) + + train(cfg=cfg, dataset_meta=dataset_meta) + check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/test_deepseekv3.py b/tests/e2e/test_deepseekv3.py index 2afda640f..e4a47fb0a 100644 --- a/tests/e2e/test_deepseekv3.py +++ b/tests/e2e/test_deepseekv3.py @@ -2,13 +2,10 @@ E2E tests for deepseekv3 """ -import logging -import os from pathlib import Path import pytest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -16,9 +13,6 @@ from axolotl.utils.dict import DictDefault from tests.hf_offline_utils import enable_hf_offline -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestDeepseekV3: """ @@ -73,12 +67,12 @@ class TestDeepseekV3: "max_steps": 5, "save_safetensors": True, "bf16": True, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) assert (Path(temp_dir) / "adapter_model.safetensors").exists() @@ -123,12 +117,12 @@ class TestDeepseekV3: "max_steps": 5, "save_safetensors": True, "bf16": True, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) assert (Path(temp_dir) / "model.safetensors").exists() diff --git a/tests/e2e/test_dpo.py b/tests/e2e/test_dpo.py index 84d723ec0..a1df69535 100644 --- a/tests/e2e/test_dpo.py +++ b/tests/e2e/test_dpo.py @@ -1,9 +1,5 @@ -""" -E2E tests for lora llama -""" +"""E2E tests for lora llama""" -import logging -import os import unittest from pathlib import Path @@ -17,9 +13,6 @@ from axolotl.utils.dict import DictDefault from .utils import check_model_output_exists, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestDPOLlamaLora(unittest.TestCase): """ @@ -63,6 +56,7 @@ class TestDPOLlamaLora(unittest.TestCase): "warmup_steps": 5, "gradient_checkpointing": True, "gradient_checkpointing_kwargs": {"use_reentrant": True}, + 
"save_first_step": False, } ) @@ -112,6 +106,7 @@ class TestDPOLlamaLora(unittest.TestCase): "warmup_steps": 5, "gradient_checkpointing": True, "gradient_checkpointing_kwargs": {"use_reentrant": True}, + "save_first_step": False, } ) @@ -161,6 +156,7 @@ class TestDPOLlamaLora(unittest.TestCase): "warmup_steps": 5, "gradient_checkpointing": True, "gradient_checkpointing_kwargs": {"use_reentrant": True}, + "save_first_step": False, } ) @@ -210,6 +206,7 @@ class TestDPOLlamaLora(unittest.TestCase): "warmup_steps": 5, "gradient_checkpointing": True, "gradient_checkpointing_kwargs": {"use_reentrant": True}, + "save_first_step": False, } ) @@ -258,6 +255,7 @@ class TestDPOLlamaLora(unittest.TestCase): "warmup_steps": 5, "gradient_checkpointing": True, "gradient_checkpointing_kwargs": {"use_reentrant": True}, + "save_first_step": False, } ) @@ -309,6 +307,7 @@ class TestDPOLlamaLora(unittest.TestCase): "warmup_steps": 5, "gradient_checkpointing": True, "gradient_checkpointing_kwargs": {"use_reentrant": True}, + "save_first_step": False, } ) @@ -377,6 +376,7 @@ class TestDPOLlamaLora(unittest.TestCase): "warmup_steps": 5, "gradient_checkpointing": True, "gradient_checkpointing_kwargs": {"use_reentrant": True}, + "save_first_step": False, } ) diff --git a/tests/e2e/test_embeddings_lr.py b/tests/e2e/test_embeddings_lr.py index 82b822ad6..e4a06ad14 100644 --- a/tests/e2e/test_embeddings_lr.py +++ b/tests/e2e/test_embeddings_lr.py @@ -2,11 +2,8 @@ E2E tests for llama pretrain """ -import logging -import os import unittest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -14,9 +11,6 @@ from axolotl.utils.dict import DictDefault from .utils import check_model_output_exists, check_tensorboard, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestEmbeddingsLrScale(unittest.TestCase): """ @@ -54,13 +48,13 @@ class TestEmbeddingsLrScale(unittest.TestCase): "save_safetensors": True, "bf16": "auto", "use_tensorboard": True, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @@ -100,12 +94,12 @@ class TestEmbeddingsLrScale(unittest.TestCase): "save_safetensors": True, "bf16": "auto", "use_tensorboard": True, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/test_evaluate.py b/tests/e2e/test_evaluate.py index 0278113b7..977497e5e 100644 --- a/tests/e2e/test_evaluate.py +++ b/tests/e2e/test_evaluate.py @@ -1,6 +1,5 @@ """E2E smoke test for evaluate CLI command""" -import os from pathlib import Path import yaml @@ -9,8 +8,6 @@ from transformers.testing_utils import get_torch_dist_unique_port from axolotl.utils.dict import DictDefault -os.environ["WANDB_DISABLED"] = "true" - class TestE2eEvaluate: """Test cases for evaluate CLI""" @@ -39,6 +36,7 @@ class TestE2eEvaluate: "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "max_steps": 20, + "save_first_step": False, } ) diff --git 
a/tests/e2e/test_falcon.py b/tests/e2e/test_falcon.py index 24afab0b3..5be6efcf6 100644 --- a/tests/e2e/test_falcon.py +++ b/tests/e2e/test_falcon.py @@ -2,13 +2,10 @@ E2E tests for falcon """ -import logging -import os import unittest import pytest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -16,9 +13,6 @@ from axolotl.utils.dict import DictDefault from .utils import check_model_output_exists, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestFalcon(unittest.TestCase): """ @@ -66,13 +60,13 @@ class TestFalcon(unittest.TestCase): "save_steps": 10, "eval_steps": 10, "bf16": "auto", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @@ -122,13 +116,13 @@ class TestFalcon(unittest.TestCase): "save_steps": 10, "eval_steps": 10, "bf16": "auto", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @@ -164,13 +158,13 @@ class TestFalcon(unittest.TestCase): "save_steps": 10, "eval_steps": 10, "bf16": "auto", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/test_gemma2.py b/tests/e2e/test_gemma2.py index 68dc4855d..c0eba72a7 100644 --- a/tests/e2e/test_gemma2.py +++ b/tests/e2e/test_gemma2.py @@ -2,21 +2,15 @@ E2E tests for gemma2 """ -import logging -import os from pathlib import Path import pytest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestGemma2: """ @@ -74,8 +68,7 @@ class TestGemma2: ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) assert (Path(temp_dir) / "adapter_model.safetensors").exists() @@ -126,8 +119,7 @@ class TestGemma2: ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) assert (Path(temp_dir) / "model.safetensors").exists() diff --git a/tests/e2e/test_gemma3_text.py b/tests/e2e/test_gemma3_text.py index 5cbde04d1..ef38d028d 100644 --- a/tests/e2e/test_gemma3_text.py +++ b/tests/e2e/test_gemma3_text.py @@ -2,21 +2,15 @@ E2E tests for gemma3_text """ -import logging -import os from pathlib import Path import pytest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from 
axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestGemma3Text: """ @@ -69,12 +63,12 @@ class TestGemma3Text: "max_steps": 5, "save_safetensors": True, "bf16": True, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) assert (Path(temp_dir) / "adapter_model.safetensors").exists() @@ -120,12 +114,12 @@ class TestGemma3Text: "max_steps": 5, "save_safetensors": True, "bf16": True, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) assert (Path(temp_dir) / "model.safetensors").exists() diff --git a/tests/e2e/test_imports.py b/tests/e2e/test_imports.py index fc0843479..050e4dfb3 100644 --- a/tests/e2e/test_imports.py +++ b/tests/e2e/test_imports.py @@ -11,11 +11,11 @@ class TestImports(unittest.TestCase): """ def test_import_causal_trainer(self): - from axolotl.core.trainer_builder import ( # pylint: disable=unused-import # noqa: F401 + from axolotl.core.builders import ( # pylint: disable=unused-import # noqa: F401 HFCausalTrainerBuilder, ) def test_import_rl_trainer(self): - from axolotl.core.trainer_builder import ( # pylint: disable=unused-import # noqa: F401 + from axolotl.core.builders import ( # pylint: disable=unused-import # noqa: F401 HFRLTrainerBuilder, ) diff --git a/tests/e2e/test_llama.py b/tests/e2e/test_llama.py index d3e37fb3f..1e6df0be9 100644 --- a/tests/e2e/test_llama.py +++ b/tests/e2e/test_llama.py @@ -2,10 +2,6 @@ E2E tests for llama """ -import logging -import os - -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -13,9 +9,6 @@ from axolotl.utils.dict import DictDefault from tests.e2e.utils import check_model_output_exists -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestLlama: """ @@ -52,13 +45,13 @@ class TestLlama: "sample_packing": True, "bf16": True, "save_safetensors": True, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @@ -100,13 +93,13 @@ class TestLlama: "sample_packing": True, "bf16": True, "save_safetensors": True, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @@ -145,13 +138,13 @@ class TestLlama: "sample_packing": True, "bf16": True, "save_safetensors": True, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) 
check_model_output_exists(temp_dir, cfg) @@ -186,13 +179,13 @@ class TestLlama: "batch_flattening": True, "bf16": True, "save_safetensors": True, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/test_llama_pretrain.py b/tests/e2e/test_llama_pretrain.py index 647285e46..bd5502300 100644 --- a/tests/e2e/test_llama_pretrain.py +++ b/tests/e2e/test_llama_pretrain.py @@ -1,13 +1,7 @@ -""" -E2E tests for llama pretrain -""" - -import logging -import os +"""E2E tests for llama pretrain""" import pytest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -15,27 +9,19 @@ from axolotl.utils.dict import DictDefault from .utils import check_model_output_exists, check_tensorboard -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestPretrainLlama: - """ - Test case for Llama models w pretraining - """ + """Test case for Llama models w pretraining""" @pytest.mark.parametrize( - "sample_packing", - [True, False], - ) - @pytest.mark.parametrize( - "pretrain_multipack_attn", - [True, False], + ("sample_packing", "pretrain_multipack_attn"), + [ + (False, False), + (True, True), + (True, False), + ], ) def test_pretrain(self, temp_dir, sample_packing, pretrain_multipack_attn): - if not sample_packing and pretrain_multipack_attn: - return - # pylint: disable=duplicate-code cfg = DictDefault( { @@ -67,22 +53,22 @@ class TestPretrainLlama: "save_safetensors": True, "bf16": "auto", "use_tensorboard": True, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) - loss_threshold = 3.5 + loss_threshold = 3.6 if sample_packing and not pretrain_multipack_attn: loss_threshold = 6.5 check_tensorboard( temp_dir + "/runs", "train/train_loss", loss_threshold, - "Train Loss is too high", + "Train Loss (%s) is too high", ) diff --git a/tests/e2e/test_llama_vision.py b/tests/e2e/test_llama_vision.py index e1e496ccf..760759bca 100644 --- a/tests/e2e/test_llama_vision.py +++ b/tests/e2e/test_llama_vision.py @@ -2,11 +2,8 @@ E2E tests for lora llama """ -import logging -import os import unittest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -14,9 +11,6 @@ from axolotl.utils.dict import DictDefault from .utils import check_model_output_exists, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestLlamaVision(unittest.TestCase): """ @@ -38,7 +32,7 @@ class TestLlamaVision(unittest.TestCase): "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, - "lora_target_modules": r"language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj", + "lora_target_modules": r"model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj", "val_set_size": 0, "chat_template": "llama3_2_vision", "datasets": [ @@ 
-60,13 +54,13 @@ class TestLlamaVision(unittest.TestCase): "max_steps": 5, "save_safetensors": True, "bf16": True, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @@ -86,7 +80,7 @@ class TestLlamaVision(unittest.TestCase): "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, - "lora_target_modules": r"language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj", + "lora_target_modules": r"model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj", "val_set_size": 0, "chat_template": "llama3_2_vision", "datasets": [ @@ -107,12 +101,12 @@ class TestLlamaVision(unittest.TestCase): "max_steps": 5, "save_safetensors": True, "bf16": True, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/test_load_model.py b/tests/e2e/test_load_model.py index 5061945b4..8fcffeb11 100644 --- a/tests/e2e/test_load_model.py +++ b/tests/e2e/test_load_model.py @@ -52,6 +52,8 @@ class TestLoadModelUtils: "learning_rate": 0.00001, "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", + "tensor_parallel_size": 1, + "context_parallel_size": 1, } ) self.model_loader = ( # pylint: disable=attribute-defined-outside-init diff --git a/tests/e2e/test_lora_llama.py b/tests/e2e/test_lora_llama.py index b02fe3d44..7e0ff46cf 100644 --- a/tests/e2e/test_lora_llama.py +++ b/tests/e2e/test_lora_llama.py @@ -2,11 +2,8 @@ E2E tests for lora llama """ -import logging -import os import unittest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -14,9 +11,6 @@ from axolotl.utils.dict import DictDefault from .utils import check_model_output_exists, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestLoraLlama(unittest.TestCase): """ @@ -55,13 +49,13 @@ class TestLoraLlama(unittest.TestCase): "optimizer": "adamw_torch_fused", "lr_scheduler": "cosine", "max_steps": 5, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/test_mamba.py b/tests/e2e/test_mamba.py index f49b53987..73d3bdc26 100644 --- a/tests/e2e/test_mamba.py +++ b/tests/e2e/test_mamba.py @@ -2,13 +2,10 @@ E2E tests for lora llama """ -import logging -import os import unittest import pytest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -16,9 +13,6 @@ from axolotl.utils.dict import DictDefault from .utils import check_model_output_exists, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - @pytest.mark.skip(reason="skipping until upstreamed into 
transformers") class TestMamba(unittest.TestCase): @@ -57,13 +51,13 @@ class TestMamba(unittest.TestCase): "save_steps": 10, "eval_steps": None, "save_safetensors": False, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/test_mistral.py b/tests/e2e/test_mistral.py index ba8cf2896..f47f794e0 100644 --- a/tests/e2e/test_mistral.py +++ b/tests/e2e/test_mistral.py @@ -2,13 +2,10 @@ E2E tests for lora llama """ -import logging -import os import unittest from transformers.utils import is_torch_bf16_gpu_available -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -16,9 +13,6 @@ from axolotl.utils.dict import DictDefault from .utils import check_model_output_exists, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestMistral(unittest.TestCase): """ @@ -61,13 +55,13 @@ class TestMistral(unittest.TestCase): "max_steps": 20, "save_steps": 10, "eval_steps": 10, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @@ -102,6 +96,7 @@ class TestMistral(unittest.TestCase): "max_steps": 20, "save_steps": 10, "eval_steps": 10, + "save_first_step": False, } ) if is_torch_bf16_gpu_available(): @@ -111,8 +106,7 @@ class TestMistral(unittest.TestCase): cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/test_mixtral.py b/tests/e2e/test_mixtral.py index 4e0693b94..3fe2bf70f 100644 --- a/tests/e2e/test_mixtral.py +++ b/tests/e2e/test_mixtral.py @@ -2,14 +2,11 @@ E2E tests for mixtral """ -import logging -import os import unittest import torch from transformers.utils import is_torch_bf16_gpu_available -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -17,9 +14,6 @@ from axolotl.utils.dict import DictDefault from .utils import check_model_output_exists, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestMixtral(unittest.TestCase): """ @@ -67,13 +61,13 @@ class TestMixtral(unittest.TestCase): "max_steps": 20, "save_steps": 10, "eval_steps": 10, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) model, _, _ = train(cfg=cfg, dataset_meta=dataset_meta) assert ( @@ -123,13 +117,13 @@ class TestMixtral(unittest.TestCase): "max_steps": 20, "save_steps": 10, "eval_steps": 10, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, 
cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) model, _, _ = train(cfg=cfg, dataset_meta=dataset_meta) assert ( @@ -178,6 +172,7 @@ class TestMixtral(unittest.TestCase): "max_steps": 20, "save_steps": 10, "eval_steps": 10, + "save_first_step": False, } ) if is_torch_bf16_gpu_available(): @@ -187,8 +182,7 @@ class TestMixtral(unittest.TestCase): cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) model, _, _ = train(cfg=cfg, dataset_meta=dataset_meta) assert ( @@ -237,6 +231,7 @@ class TestMixtral(unittest.TestCase): "max_steps": 20, "save_steps": 10, "eval_steps": 10, + "save_first_step": False, } ) @@ -246,8 +241,7 @@ class TestMixtral(unittest.TestCase): cfg.bf16 = True else: cfg.fp16 = True - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) model, _, _ = train(cfg=cfg, dataset_meta=dataset_meta) assert ( @@ -283,6 +277,7 @@ class TestMixtral(unittest.TestCase): "max_steps": 20, "save_steps": 10, "eval_steps": 10, + "save_first_step": False, } ) if is_torch_bf16_gpu_available(): @@ -292,8 +287,7 @@ class TestMixtral(unittest.TestCase): cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/test_optimizers.py b/tests/e2e/test_optimizers.py index 91f45b762..987d86041 100644 --- a/tests/e2e/test_optimizers.py +++ b/tests/e2e/test_optimizers.py @@ -2,20 +2,20 @@ E2E tests for custom optimizers using Llama """ -import logging -import os import unittest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -from .utils import check_model_output_exists, require_torch_2_5_1, with_temp_dir - -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" +from .utils import ( + check_model_output_exists, + require_torch_2_5_1, + require_torch_2_6_0, + require_torch_2_7_0, + with_temp_dir, +) class TestCustomOptimizers(unittest.TestCase): @@ -56,13 +56,13 @@ class TestCustomOptimizers(unittest.TestCase): "optimizer": "optimi_adamw", "max_steps": 5, "lr_scheduler": "cosine", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) _, _, trainer = train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @@ -102,13 +102,13 @@ class TestCustomOptimizers(unittest.TestCase): "learning_rate": 0.00001, "optimizer": "adopt_adamw", "lr_scheduler": "cosine", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) _, _, trainer = train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @@ -149,18 +149,61 @@ class TestCustomOptimizers(unittest.TestCase): "optimizer": "muon", "lr_scheduler": "cosine", "weight_decay": 0.01, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = 
TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) _, _, trainer = train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) assert "Muon" in trainer.optimizer.optimizer.__class__.__name__ + @with_temp_dir + @require_torch_2_7_0 + def test_dion(self, temp_dir): + # pylint: disable=duplicate-code + cfg = DictDefault( + { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "model_type": "AutoModelForCausalLM", + "tokenizer_type": "AutoTokenizer", + "sequence_len": 1024, + "val_set_size": 0.0, + "special_tokens": { + "pad_token": "<|endoftext|>", + }, + "datasets": [ + { + "path": "mhenrichsen/alpaca_2k_test", + "type": "alpaca", + }, + ], + "num_epochs": 1, + "max_steps": 5, + "micro_batch_size": 8, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "dion", + "dion_lr": 0.01, + "dion_momentum": 0.95, + "lr_scheduler": "cosine", + "weight_decay": 0.01, + "save_first_step": False, + } + ) + + cfg = validate_config(cfg) + normalize_config(cfg) + dataset_meta = load_datasets(cfg=cfg) + + _, _, trainer = train(cfg=cfg, dataset_meta=dataset_meta) + check_model_output_exists(temp_dir, cfg) + assert "Dion" in trainer.optimizer.optimizer.__class__.__name__ + @with_temp_dir def test_fft_schedule_free_adamw(self, temp_dir): # pylint: disable=duplicate-code @@ -188,19 +231,20 @@ class TestCustomOptimizers(unittest.TestCase): "lr_scheduler": "constant", "save_safetensors": True, "max_steps": 10, + "save_first_step": False, } ) # pylint: disable=duplicate-code cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @with_temp_dir + @require_torch_2_6_0 def test_came_pytorch(self, temp_dir): # pylint: disable=duplicate-code cfg = DictDefault( @@ -236,13 +280,13 @@ class TestCustomOptimizers(unittest.TestCase): "adam_epsilon2": 1e-16, "max_steps": 5, "lr_scheduler": "cosine", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/test_packing_loss.py b/tests/e2e/test_packing_loss.py index 73716f44b..aec9d95f8 100644 --- a/tests/e2e/test_packing_loss.py +++ b/tests/e2e/test_packing_loss.py @@ -2,13 +2,10 @@ E2E tests for packed training """ -import logging -import os import unittest from transformers.utils import is_torch_bf16_gpu_available -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -16,9 +13,6 @@ from axolotl.utils.dict import DictDefault from .utils import check_tensorboard, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestPackedLlama(unittest.TestCase): """ @@ -54,6 +48,7 @@ class TestPackedLlama(unittest.TestCase): "lr_scheduler": "cosine", "max_steps": 5, "use_tensorboard": True, + "save_first_step": False, } ) if is_torch_bf16_gpu_available(): @@ -63,11 +58,10 @@ class TestPackedLlama(unittest.TestCase): cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta 
= load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/test_phi.py b/tests/e2e/test_phi.py index f531a17c5..ab3a63674 100644 --- a/tests/e2e/test_phi.py +++ b/tests/e2e/test_phi.py @@ -2,11 +2,8 @@ E2E tests for lora llama """ -import logging -import os import unittest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -14,9 +11,6 @@ from axolotl.utils.dict import DictDefault from .utils import check_model_output_exists, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestPhi(unittest.TestCase): """ @@ -59,12 +53,12 @@ class TestPhi(unittest.TestCase): "save_steps": 10, "eval_steps": 10, "bf16": "auto", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) @@ -109,12 +103,12 @@ class TestPhi(unittest.TestCase): "save_steps": 10, "eval_steps": 10, "bf16": "auto", + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/test_preprocess.py b/tests/e2e/test_preprocess.py new file mode 100644 index 000000000..25f42e832 --- /dev/null +++ b/tests/e2e/test_preprocess.py @@ -0,0 +1,58 @@ +"""E2E Test the preprocess cli""" + +from pathlib import Path + +import yaml +from accelerate.test_utils import execute_subprocess_async + +from axolotl.utils.dict import DictDefault + +AXOLOTL_ROOT = Path(__file__).parent.parent.parent + + +class TestPreprocess: + """test cases for preprocess""" + + def test_w_deepspeed(self, temp_dir): + """make sure preproces doesn't choke when using deepspeed in the config""" + # pylint: disable=duplicate-code + cfg = DictDefault( + { + "base_model": "Qwen/Qwen2.5-0.5B", + "sequence_len": 2048, + "val_set_size": 0.01, + "datasets": [ + { + "path": "tatsu-lab/alpaca", + "type": "alpaca", + "split": "train[:10%]", + }, + ], + "num_epochs": 1, + "micro_batch_size": 2, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "flash_attention": True, + "bf16": "auto", + "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"), + "dataset_prepared_path": temp_dir + "/last_run_prepared", + } + ) + + # write cfg to yaml file + Path(temp_dir).mkdir(parents=True, exist_ok=True) + with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: + fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper)) + + execute_subprocess_async( + [ + "axolotl", + "preprocess", + str(Path(temp_dir) / "config.yaml"), + ] + ) + + assert (Path(temp_dir) / "last_run_prepared").exists() diff --git a/tests/e2e/test_process_reward_model_smollm2.py b/tests/e2e/test_process_reward_model_smollm2.py index 446facdb0..bd9eec48b 100644 --- 
a/tests/e2e/test_process_reward_model_smollm2.py +++ b/tests/e2e/test_process_reward_model_smollm2.py @@ -2,11 +2,8 @@ E2E tests for process reward model w/ lora llama """ -import logging -import os import unittest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -14,9 +11,6 @@ from axolotl.utils.dict import DictDefault from .utils import check_model_output_exists, check_tensorboard, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestProcessRewardSmolLM2(unittest.TestCase): """ @@ -55,12 +49,12 @@ class TestProcessRewardSmolLM2(unittest.TestCase): "use_tensorboard": True, "special_tokens": {"pad_token": "<|endoftext|>"}, "seed": 42, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_tensorboard( diff --git a/tests/e2e/test_profiler.py b/tests/e2e/test_profiler.py new file mode 100644 index 000000000..ab273b981 --- /dev/null +++ b/tests/e2e/test_profiler.py @@ -0,0 +1,113 @@ +""" +e2e gpu test for the pytorch profiler callback +""" + +from pathlib import Path + +import pytest + +from axolotl.common.datasets import load_datasets +from axolotl.train import train +from axolotl.utils.config import normalize_config, validate_config +from axolotl.utils.dict import DictDefault + + +@pytest.fixture(name="profiler_base_cfg") +def fixture_profiler_base_cfg(): + cfg = DictDefault( + base_model="HuggingFaceTB/SmolLM2-135M", + tokenizer_type="AutoTokenizer", + sequence_len=1024, + load_in_8bit=True, + adapter="lora", + lora_r=8, + lora_alpha=16, + lora_dropout=0.05, + lora_target_linear=True, + val_set_size=0.02, + special_tokens={"pad_token": "<|endoftext|>"}, + datasets=[ + { + "path": "mhenrichsen/alpaca_2k_test", + "type": "alpaca", + }, + ], + num_epochs=1, + micro_batch_size=2, + gradient_accumulation_steps=1, + learning_rate=0.00001, + optimizer="adamw_torch_fused", + lr_scheduler="cosine", + ) + return cfg + + +class TestProfiler: + """ + test cases for the pytorch profiler callback + """ + + def test_profiler_saves(self, profiler_base_cfg, temp_dir): + cfg = profiler_base_cfg | DictDefault( + output_dir=temp_dir, + max_steps=5, + profiler_steps=3, + ) + + cfg = validate_config(cfg) + normalize_config(cfg) + dataset_meta = load_datasets(cfg=cfg) + + train(cfg=cfg, dataset_meta=dataset_meta) + assert (Path(temp_dir) / "snapshot.pickle").exists() + + def test_profiler_saves_w_start(self, profiler_base_cfg, temp_dir): + cfg = profiler_base_cfg | DictDefault( + output_dir=temp_dir, + max_steps=5, + profiler_steps=3, + profiler_steps_start=1, + ) + + cfg = validate_config(cfg) + normalize_config(cfg) + dataset_meta = load_datasets(cfg=cfg) + + train(cfg=cfg, dataset_meta=dataset_meta) + assert (Path(temp_dir) / "snapshot.pickle").exists() + + @pytest.mark.parametrize( + "profiler_steps_start", + [3, 5], + ) + def test_profiler_saves_past_end( + self, profiler_base_cfg, temp_dir, profiler_steps_start + ): + cfg = profiler_base_cfg | DictDefault( + output_dir=temp_dir, + max_steps=5, + profiler_steps=3, + profiler_steps_start=profiler_steps_start, + ) + + cfg = validate_config(cfg) + normalize_config(cfg) + dataset_meta = load_datasets(cfg=cfg) + + train(cfg=cfg, dataset_meta=dataset_meta) + 
assert (Path(temp_dir) / "snapshot.pickle").exists() + + def test_profiler_never_started(self, profiler_base_cfg, temp_dir): + cfg = profiler_base_cfg | DictDefault( + output_dir=temp_dir, + max_steps=5, + profiler_steps=3, + profiler_steps_start=6, + ) + + cfg = validate_config(cfg) + normalize_config(cfg) + dataset_meta = load_datasets(cfg=cfg) + + train(cfg=cfg, dataset_meta=dataset_meta) + assert not (Path(temp_dir) / "snapshot.pickle").exists() diff --git a/tests/e2e/test_qat.py b/tests/e2e/test_qat.py new file mode 100644 index 000000000..139ae155a --- /dev/null +++ b/tests/e2e/test_qat.py @@ -0,0 +1,135 @@ +""" +E2E tests for QAT +""" + +from pathlib import Path + +from axolotl.common.datasets import load_datasets, load_preference_datasets +from axolotl.train import train +from axolotl.utils.config import normalize_config, validate_config +from axolotl.utils.dict import DictDefault + +from .utils import check_model_output_exists, check_tensorboard + + +class TestQATLlama: + """ + Test case for QAT Llama models + """ + + def test_qat(self, temp_dir): + # pylint: disable=duplicate-code + cfg = DictDefault( + { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "tokenizer_type": "AutoTokenizer", + "sequence_len": 1024, + "special_tokens": { + "pad_token": "<|endoftext|>", + }, + "datasets": [ + { + "path": "mlabonne/FineTome-100k", + "type": "chat_template", + "field_messages": "conversations", + "message_property_mappings": { + "role": "from", + "content": "value", + }, + "drop_system_message": True, + "split": "train[:1%]", + }, + ], + "chat_template": "chatml", + "qat": { + "quantize_embedding": True, + "activation_dtype": "int8", + "weight_dtype": "int8", + "group_size": 8, + }, + "num_epochs": 1, + "micro_batch_size": 1, + "gradient_accumulation_steps": 2, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_bnb_8bit", + "lr_scheduler": "cosine", + "max_steps": 5, + "save_safetensors": True, + "bf16": True, + "save_first_step": False, + } + ) + cfg = validate_config(cfg) + normalize_config(cfg) + dataset_meta = load_datasets(cfg=cfg) + + train(cfg=cfg, dataset_meta=dataset_meta) + check_model_output_exists(Path(temp_dir) / "checkpoint-5", cfg) + + def test_qat_dpo(self, temp_dir): + # pylint: disable=duplicate-code + cfg = DictDefault( + { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "sequence_len": 2048, + "sample_packing": False, + "eval_sample_packing": False, + "pad_to_sequence_len": True, + "val_set_size": 0.01, + "special_tokens": { + "pad_token": "<|endoftext|>", + }, + "rl": "dpo", + "chat_template": "chatml", + "datasets": [ + { + "path": "fozziethebeat/alpaca_messages_2k_dpo_test", + "type": "chat_template.default", + "field_messages": "conversation", + "field_chosen": "chosen", + "field_rejected": "rejected", + "message_field_role": "role", + "message_field_content": "content", + "roles": { + "system": ["system"], + "user": ["user"], + "assistant": ["assistant"], + }, + }, + ], + "num_epochs": 1, + "max_steps": 5, + "micro_batch_size": 2, + "gradient_accumulation_steps": 2, + "output_dir": temp_dir, + "warmup_steps": 0, + "learning_rate": 0.00001, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "flash_attention": True, + "use_tensorboard": True, + "bf16": True, + "qat": { + "quantize_embedding": True, + "activation_dtype": "int8", + "weight_dtype": "int8", + "group_size": 8, + }, + "save_first_step": False, + } + ) + cfg = validate_config(cfg) + normalize_config(cfg) + dataset_meta = load_preference_datasets(cfg=cfg) + + 
train(cfg=cfg, dataset_meta=dataset_meta) + check_model_output_exists(Path(temp_dir) / "checkpoint-5", cfg) + + loss_threshold = 2.3 + check_tensorboard( + temp_dir + "/runs", + "train/train_loss", + loss_threshold, + "Train Loss (%s) is too high", + ) diff --git a/tests/e2e/test_quantization.py b/tests/e2e/test_quantization.py new file mode 100644 index 000000000..500b7e556 --- /dev/null +++ b/tests/e2e/test_quantization.py @@ -0,0 +1,350 @@ +""" +Tests for axolotl.utils.quantization +""" + +import pytest +import torch +from torch import nn +from torchao.dtypes.affine_quantized_tensor import AffineQuantizedTensor +from torchao.quantization.granularity import PerAxis, PerGroup +from torchao.quantization.linear_activation_quantized_tensor import ( + LinearActivationQuantizedTensor, +) +from torchao.quantization.qat.embedding import FakeQuantizedEmbedding +from torchao.quantization.qat.linear import FakeQuantizedLinear +from torchao.quantization.quant_api import ( + Int4DynamicActivationInt4WeightConfig, + Int4WeightOnlyConfig, + Int8DynamicActivationInt8WeightConfig, + Int8WeightOnlyConfig, + UIntXWeightOnlyConfig, +) +from transformers import AutoModelForCausalLM +from transformers.trainer_callback import TrainerState + +from axolotl.utils.callbacks.qat import QATCallback +from axolotl.utils.quantization import ( + convert_qat_model_for_ptq, + get_ptq_config, + prepare_model_for_qat, + quantize_model_for_ptq, +) +from axolotl.utils.schemas.enums import TorchIntDType +from axolotl.utils.schemas.quantization import QATConfig + +from tests.e2e.utils import require_torch_2_6_0 + + +@pytest.fixture() +def model(): + dummy_model = AutoModelForCausalLM.from_pretrained( + "HuggingFaceTB/SmolLM2-135M", + device_map="cuda", + torch_dtype=torch.bfloat16, + ) + with torch.device(dummy_model.device): + dummy_model.model.embed_tokens = torch.nn.Embedding( + dummy_model.model.embed_tokens.weight.shape[0], + dummy_model.model.embed_tokens.weight.shape[1], + dtype=dummy_model.model.embed_tokens.weight.dtype, + ) + return dummy_model + + +ptq_config_test_cases = [ + # weight_dtype, activation_dtype, group_size, expected_type, expected_params + ( + TorchIntDType.uint4, + None, + None, + UIntXWeightOnlyConfig, + {"dtype": torch.uint4, "group_size": None}, + ), + (TorchIntDType.int8, None, 32, Int8WeightOnlyConfig, {"group_size": 32}), + (TorchIntDType.int4, None, 4, Int4WeightOnlyConfig, {"group_size": 4}), + ( + TorchIntDType.int4, + TorchIntDType.int4, + None, + Int4DynamicActivationInt4WeightConfig, + {}, + ), + ( + TorchIntDType.int8, + TorchIntDType.int8, + None, + Int8DynamicActivationInt8WeightConfig, + {}, + ), +] + +ptq_test_cases = [ + # weight_dtype, activation_dtype, group_size, quantize_embedding, expected_exception + (TorchIntDType.int8, None, 8, False, None), + (TorchIntDType.int4, None, 4, True, None), + (TorchIntDType.uint4, None, 8, False, None), + (TorchIntDType.int4, TorchIntDType.int4, 8, False, None), + (TorchIntDType.int8, TorchIntDType.int8, 8, True, None), + (TorchIntDType.int8, None, None, False, ValueError), + (TorchIntDType.int4, None, None, False, ValueError), +] + + +class TestQuantization: + """ + Test quantization utilities + """ + + @pytest.mark.parametrize( + "weight_dtype,activation_dtype,group_size,expected_type,expected_params", + ptq_config_test_cases, + ) + @require_torch_2_6_0 + def test_get_ptq_config( + self, weight_dtype, activation_dtype, group_size, expected_type, expected_params + ): + config = get_ptq_config(weight_dtype, activation_dtype, group_size) + + assert 
isinstance(config, expected_type) + + for param_name, param_value in expected_params.items(): + if isinstance(param_value, (PerAxis, PerGroup)): + if isinstance(param_value, PerAxis): + assert isinstance(getattr(config, param_name), PerAxis) + assert getattr(config, param_name).axis == param_value.axis + else: + assert isinstance(getattr(config, param_name), PerGroup) + assert ( + getattr(config, param_name).group_size == param_value.group_size + ) + else: + assert getattr(config, param_name) == param_value + + @pytest.mark.parametrize( + "weight_dtype", [TorchIntDType.int8, TorchIntDType.int4, TorchIntDType.uint4] + ) + @pytest.mark.parametrize( + "activation_dtype", [None, TorchIntDType.int4, TorchIntDType.int8] + ) + @pytest.mark.parametrize("group_size", [4, 8]) + @pytest.mark.parametrize("quantize_embedding", [False, True]) + @require_torch_2_6_0 + def test_prepare_model_for_qat( + self, model, weight_dtype, activation_dtype, group_size, quantize_embedding + ): # pylint: disable=redefined-outer-name + prepare_model_for_qat( + model, weight_dtype, group_size, activation_dtype, quantize_embedding + ) + if quantize_embedding: + assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding) + assert hasattr(model.model.embed_tokens, "weight_fake_quantizer") + assert ( + model.model.embed_tokens.weight_fake_quantizer.config.dtype + == weight_dtype.value + ) + assert ( + model.model.embed_tokens.weight_fake_quantizer.config.group_size + == group_size + ) + + for child in list(model.children()): + if isinstance(child, torch.nn.Linear): + assert isinstance(child, FakeQuantizedLinear) + assert hasattr(child, "weight_fake_quantizer") + assert child.weight_fake_quantizer.config.dtype == weight_dtype.value + assert child.weight_fake_quantizer.config.group_size == group_size + if activation_dtype: + assert hasattr(child, "activation_fake_quantizer") + assert ( + child.activation_fake_quantizer.config.dtype + == activation_dtype.value + ) + else: + assert child.activation_fake_quantizer is None + + @pytest.mark.parametrize( + "weight_dtype,activation_dtype,group_size,quantize_embedding,expected_exception", + ptq_test_cases, + ) + @require_torch_2_6_0 + def test_quantize_model_for_ptq( + self, + model, + weight_dtype, + activation_dtype, + group_size, + quantize_embedding, + expected_exception, + ): # pylint: disable=redefined-outer-name + if expected_exception: + with pytest.raises(expected_exception): + quantize_model_for_ptq( + model, + weight_dtype, + group_size, + activation_dtype, + quantize_embedding, + ) + else: + quantize_model_for_ptq( + model, weight_dtype, group_size, activation_dtype, quantize_embedding + ) + if quantize_embedding: + assert isinstance( + model.model.embed_tokens.weight, AffineQuantizedTensor + ), "Embedding weight should be quantized" + for child in list(model.children()): + if isinstance(child, torch.nn.Linear): + if activation_dtype: + assert isinstance( + child.weight, LinearActivationQuantizedTensor + ), "Linear weight should be quantized with activation quantization" + else: + assert isinstance( + child.weight, AffineQuantizedTensor + ), "Linear weight should be quantized without activation quantization" + + +class TestQuantizationCallback: + """ + Test QATCallback + """ + + @pytest.fixture() + def trainer_state(self): + return TrainerState( + global_step=0, + ) + + @require_torch_2_6_0 + def test_qat_callback_fake_quant_after_n_steps( + self, model, trainer_state + ): # pylint: disable=redefined-outer-name + cfg = QATConfig( + weight_dtype="int8", + 
activation_dtype="int8", + group_size=8, + quantize_embedding=True, + fake_quant_after_n_steps=100, + ) + + prepare_model_for_qat( + model, + cfg.weight_dtype, + cfg.group_size, + cfg.activation_dtype, + cfg.quantize_embedding, + ) + + # ensure model has been quantized + assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding) + assert model.model.embed_tokens.weight_fake_quantizer.enabled + assert isinstance(model.lm_head, FakeQuantizedLinear) + assert model.lm_head.weight_fake_quantizer.enabled + + qat_callback = QATCallback(cfg) + + # simulate first training step + qat_callback.on_step_begin( + args=None, + state=trainer_state, + control=None, + model=model, + ) + + # quantization should have been disabled + assert not model.model.embed_tokens.weight_fake_quantizer.enabled + assert not model.lm_head.weight_fake_quantizer.enabled + + trainer_state.global_step = 100 + qat_callback.on_step_begin( + args=None, + state=trainer_state, + control=None, + model=model, + ) + + # quantization should have been enabled + assert model.model.embed_tokens.weight_fake_quantizer.enabled + assert model.lm_head.weight_fake_quantizer.enabled + + @require_torch_2_6_0 + def test_qat_callback_fake_quant_after_n_steps_is_none( + self, model, trainer_state + ): # pylint: disable=redefined-outer-name + cfg = QATConfig( + weight_dtype="int8", + activation_dtype="int8", + group_size=8, + quantize_embedding=True, + fake_quant_after_n_steps=None, + ) + + prepare_model_for_qat( + model, + cfg.weight_dtype, + cfg.group_size, + cfg.activation_dtype, + cfg.quantize_embedding, + ) + + # ensure model has been quantized + assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding) + assert model.model.embed_tokens.weight_fake_quantizer.enabled + assert isinstance(model.lm_head, FakeQuantizedLinear) + assert model.lm_head.weight_fake_quantizer.enabled + + qat_callback = QATCallback(cfg) + # simulate first training step + qat_callback.on_step_begin( + args=None, + state=trainer_state, + control=None, + model=model, + ) + + # quantization should be enabled from the get-go + assert model.model.embed_tokens.weight_fake_quantizer.enabled + assert model.lm_head.weight_fake_quantizer.enabled + + +class TestConvertQATModelForPTQ: + """ + Test convert_qat_model_for_ptq + """ + + @require_torch_2_6_0 + def test_convert_qat_model_for_ptq( + self, model + ): # pylint: disable=redefined-outer-name + config = QATConfig( + weight_dtype="int8", + activation_dtype="int8", + group_size=8, + quantize_embedding=True, + ) + + # quantize model for qat + prepare_model_for_qat( + model, + config.weight_dtype, + config.group_size, + config.activation_dtype, + config.quantize_embedding, + ) + + assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding) + assert isinstance(model.lm_head, FakeQuantizedLinear) + + # apply conversion + convert_qat_model_for_ptq( + model, + quantize_embedding=config.quantize_embedding, + ) + # ensure modules have been swapped out + assert not isinstance(model.model.embed_tokens, FakeQuantizedEmbedding) + assert not isinstance(model.lm_head, FakeQuantizedLinear) + + # ensure weights have been quantized + assert isinstance(model.model.embed_tokens.weight, nn.Parameter) + assert isinstance(model.lm_head.weight, nn.Parameter) diff --git a/tests/e2e/test_qwen.py b/tests/e2e/test_qwen.py index 39d55603f..59267d14d 100644 --- a/tests/e2e/test_qwen.py +++ b/tests/e2e/test_qwen.py @@ -2,8 +2,6 @@ E2E tests for qwen """ -import logging -import os from pathlib import Path import pytest @@ -13,9 +11,6 @@ 
from transformers.testing_utils import get_torch_dist_unique_port from axolotl.utils.dict import DictDefault -LOG = logging.getLogger("axolotl.tests.qwen") -os.environ["WANDB_DISABLED"] = "true" - class TestE2eQwen: """ @@ -64,6 +59,7 @@ class TestE2eQwen: "bf16": "auto", "tf32": True, "gradient_checkpointing": True, + "save_first_step": False, } ) diff --git a/tests/e2e/test_reward_model_smollm2.py b/tests/e2e/test_reward_model_smollm2.py index 240c4b392..82513f99f 100644 --- a/tests/e2e/test_reward_model_smollm2.py +++ b/tests/e2e/test_reward_model_smollm2.py @@ -2,11 +2,8 @@ E2E tests for reward model lora llama """ -import logging -import os import unittest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -14,9 +11,6 @@ from axolotl.utils.dict import DictDefault from .utils import check_model_output_exists, check_tensorboard, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestRewardModelLoraSmolLM2(unittest.TestCase): """ @@ -64,15 +58,15 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase): "gradient_checkpointing": True, "warmup_ratio": 0.1, "use_tensorboard": True, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high" ) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/test_save_first_step.py b/tests/e2e/test_save_first_step.py new file mode 100644 index 000000000..5bbd2302b --- /dev/null +++ b/tests/e2e/test_save_first_step.py @@ -0,0 +1,102 @@ +""" +E2E tests for the save_first_step callback +""" + +import unittest +from pathlib import Path + +import pytest + +from axolotl.common.datasets import load_datasets +from axolotl.train import train +from axolotl.utils.config import normalize_config, validate_config +from axolotl.utils.dict import DictDefault + +from .utils import check_model_output_exists, with_temp_dir + + +class TestSaveFirstStepCallback(unittest.TestCase): + """Test cases for save_first_step callback config.""" + + @with_temp_dir + def test_save_first_step(self, temp_dir): + # pylint: disable=duplicate-code + cfg = DictDefault( + { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "tokenizer_type": "AutoTokenizer", + "sequence_len": 512, + "val_set_size": 0.02, + "special_tokens": { + "pad_token": "<|endoftext|>", + }, + "datasets": [ + { + "path": "mhenrichsen/alpaca_2k_test", + "type": "alpaca", + }, + ], + "num_epochs": 1, + "max_steps": 3, + "micro_batch_size": 2, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_bnb_8bit", + "lr_scheduler": "cosine", + "flash_attention": True, + "sample_packing": True, + "bf16": True, + "save_safetensors": True, + "save_first_step": True, + } + ) + + cfg = validate_config(cfg) + normalize_config(cfg) + dataset_meta = load_datasets(cfg=cfg) + + train(cfg=cfg, dataset_meta=dataset_meta) + check_model_output_exists(str(Path(temp_dir) / "checkpoint-1"), cfg) + + @with_temp_dir + def test_no_save_first_step(self, temp_dir): + # pylint: disable=duplicate-code + cfg = DictDefault( + { + "base_model": 
"HuggingFaceTB/SmolLM2-135M", + "tokenizer_type": "AutoTokenizer", + "sequence_len": 512, + "val_set_size": 0.02, + "special_tokens": { + "pad_token": "<|endoftext|>", + }, + "datasets": [ + { + "path": "mhenrichsen/alpaca_2k_test", + "type": "alpaca", + }, + ], + "num_epochs": 1, + "max_steps": 3, + "micro_batch_size": 2, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_bnb_8bit", + "lr_scheduler": "cosine", + "flash_attention": True, + "sample_packing": True, + "bf16": True, + "save_safetensors": True, + "save_first_step": False, + } + ) + + cfg = validate_config(cfg) + normalize_config(cfg) + dataset_meta = load_datasets(cfg=cfg) + + train(cfg=cfg, dataset_meta=dataset_meta) + with pytest.raises(AssertionError): + check_model_output_exists(str(Path(temp_dir) / "checkpoint-1"), cfg) diff --git a/tests/e2e/test_schedulers.py b/tests/e2e/test_schedulers.py index 694bb21e8..8f7a13aee 100644 --- a/tests/e2e/test_schedulers.py +++ b/tests/e2e/test_schedulers.py @@ -2,11 +2,8 @@ E2E tests for custom schedulers using Llama """ -import logging -import os import unittest -from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, validate_config @@ -14,9 +11,6 @@ from axolotl.utils.dict import DictDefault from .utils import check_model_output_exists, with_temp_dir -LOG = logging.getLogger("axolotl.tests.e2e") -os.environ["WANDB_DISABLED"] = "true" - class TestCustomSchedulers(unittest.TestCase): """ @@ -57,13 +51,13 @@ class TestCustomSchedulers(unittest.TestCase): "lr_scheduler": "rex", "warmup_steps": 5, "cosine_min_lr_ratio": 0.05, + "save_first_step": False, } ) cfg = validate_config(cfg) normalize_config(cfg) - cli_args = TrainerCliArgs() - dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + dataset_meta = load_datasets(cfg=cfg) train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/utils.py b/tests/e2e/utils.py index 61df1d8fe..5931fe148 100644 --- a/tests/e2e/utils.py +++ b/tests/e2e/utils.py @@ -10,8 +10,6 @@ from functools import wraps from pathlib import Path import torch - -# from importlib.metadata import version from packaging import version from tbparse import SummaryReader @@ -79,6 +77,18 @@ def require_torch_2_6_0(test_case): return unittest.skipUnless(is_min_2_6_0(), "test requires torch>=2.6.0")(test_case) +def require_torch_2_7_0(test_case): + """ + Decorator marking a test that requires torch >= 2.7.0 + """ + + def is_min_2_7_0(): + torch_version = version.parse(torch.__version__) + return torch_version >= version.parse("2.7.0") + + return unittest.skipUnless(is_min_2_7_0(), "test requires torch>=2.7.0")(test_case) + + def require_torch_lt_2_6_0(test_case): """ Decorator marking a test that requires torch < 2.6.0 @@ -132,6 +142,10 @@ def is_hopper(): return compute_capability == (9, 0) +def require_hopper(test_case): + return unittest.skipUnless(is_hopper(), "test requires h100/hopper GPU")(test_case) + + def check_tensorboard( temp_run_dir: str, tag: str, lt_val: float, assertion_err: str ) -> None: diff --git a/tests/integrations/test_liger.py b/tests/integrations/test_liger.py index cbe1408b8..5c4bd1028 100644 --- a/tests/integrations/test_liger.py +++ b/tests/integrations/test_liger.py @@ -2,8 +2,6 @@ config validation tests for swiglu args """ -# pylint: disable=duplicate-code -import logging from typing import Optional import pytest @@ 
-12,6 +10,7 @@ from axolotl.utils.config import prepare_plugins, validate_config from axolotl.utils.dict import DictDefault +# pylint: disable=duplicate-code @pytest.fixture(name="minimal_liger_cfg") def fixture_cfg(): return DictDefault( @@ -41,7 +40,7 @@ class TestValidation: @pytest.fixture(autouse=True) def inject_fixtures(self, caplog): - caplog.set_level(logging.WARNING) + caplog.set_level("WARNING") self._caplog = caplog def test_deprecated_swiglu(self, minimal_liger_cfg): @@ -52,9 +51,7 @@ class TestValidation: | minimal_liger_cfg ) - with self._caplog.at_level( - logging.WARNING, logger="axolotl.integrations.liger.args" - ): + with self._caplog.at_level("WARNING", logger="axolotl.integrations.liger.args"): prepare_plugins(test_cfg) updated_cfg = validate_config(test_cfg) # TODO this test is brittle in CI diff --git a/tests/monkeypatch/test_trainer_accelerator_args.py b/tests/monkeypatch/test_trainer_accelerator_args.py new file mode 100644 index 000000000..fab2597f0 --- /dev/null +++ b/tests/monkeypatch/test_trainer_accelerator_args.py @@ -0,0 +1,26 @@ +""" +Unit tests for trainer accelerator args monkeypatch +""" + +import unittest + +from axolotl.monkeypatch.trainer_accelerator_args import ( + check_create_accelerate_code_is_patchable, +) + + +class TestTrainerAcceleratorArgs(unittest.TestCase): + """ + Unit test class for trainer accelerator args monkeypatch + """ + + def test_check_create_accelerate_code_is_patchable(self): + """ + Test that the upstream transformers code is still patchable. + This will fail if the patched code changes upstream. + """ + assert check_create_accelerate_code_is_patchable() + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/monkeypatch/test_trainer_loss_calc.py b/tests/monkeypatch/test_trainer_loss_calc.py new file mode 100644 index 000000000..de3e92621 --- /dev/null +++ b/tests/monkeypatch/test_trainer_loss_calc.py @@ -0,0 +1,28 @@ +"""Unit tests for trainer loss calc monkeypatch.""" + +import unittest + +from axolotl.monkeypatch.transformers.trainer_loss_calc import ( + check_evaluation_loop_is_fsdp2_patchable, + check_evaluation_loop_is_patchable, + check_maybe_log_save_evaluate_is_patchable, +) + + +class TestTrainerLossCalc(unittest.TestCase): + """ + Unit test class for trainer loss calc monkeypatch + """ + + def test_trainer_loss_calc_is_patchable(self): + """ + Test that the upstream transformers code is still patchable. This will fail if + the patched code changes upstream. 
+ """ + assert check_evaluation_loop_is_patchable() + assert check_evaluation_loop_is_fsdp2_patchable() + assert check_maybe_log_save_evaluate_is_patchable() + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/patched/test_validation.py b/tests/patched/test_validation.py index 1c7325dff..677512d3d 100644 --- a/tests/patched/test_validation.py +++ b/tests/patched/test_validation.py @@ -1,7 +1,6 @@ # pylint: disable=too-many-lines """Module for testing the validation module""" -import logging import os import warnings from typing import Optional @@ -80,7 +79,7 @@ class TestValidation(BaseValidation): | minimal_cfg ) - with self._caplog.at_level(logging.WARNING): + with self._caplog.at_level("WARNING"): validate_config(test_cfg) assert ( "qlora + zero3 with use_reentrant: false may result in a CheckpointError about recomputed values" @@ -218,7 +217,7 @@ class TestValidation(BaseValidation): } ) - with self._caplog.at_level(logging.WARNING): + with self._caplog.at_level("WARNING"): validate_config(cfg) assert "batch_size is not recommended" in self._caplog.records[0].message @@ -513,7 +512,7 @@ class TestValidation(BaseValidation): | minimal_cfg ) - with self._caplog.at_level(logging.WARNING): + with self._caplog.at_level("WARNING"): validate_config(cfg) assert any( "BetterTransformers probably doesn't work with PEFT adapters" @@ -531,7 +530,7 @@ class TestValidation(BaseValidation): | minimal_cfg ) - with self._caplog.at_level(logging.WARNING): + with self._caplog.at_level("WARNING"): validate_config(cfg) assert any( "probably set bfloat16 or float16" in record.message @@ -577,7 +576,7 @@ class TestValidation(BaseValidation): | minimal_cfg ) - with self._caplog.at_level(logging.WARNING): + with self._caplog.at_level("WARNING"): validate_config(cfg) assert any( "adamw hyperparameters found, but no adamw optimizer set" @@ -595,7 +594,7 @@ class TestValidation(BaseValidation): | minimal_cfg ) - with self._caplog.at_level(logging.WARNING): + with self._caplog.at_level("WARNING"): validate_config(cfg) assert any( "adamw hyperparameters found, but no adamw optimizer set" @@ -654,7 +653,7 @@ class TestValidation(BaseValidation): ) | minimal_cfg ) - with self._caplog.at_level(logging.WARNING): + with self._caplog.at_level("WARNING"): validate_config(cfg) assert any( "`pad_to_sequence_len: true` is recommended when using sample_packing" @@ -673,7 +672,7 @@ class TestValidation(BaseValidation): ) | minimal_cfg ) - with self._caplog.at_level(logging.INFO): + with self._caplog.at_level("INFO"): cfg = validate_config(cfg) assert any( "Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing" @@ -693,7 +692,7 @@ class TestValidation(BaseValidation): "bf16": True, "capabilities": {"bf16": False}, "env_capabilities": { - "torch_version": "2.5.1", + "torch_version": "2.6.0", }, } ) @@ -1109,7 +1108,7 @@ class TestValidation(BaseValidation): def test_hub_model_id_save_value_warns_save_stragey_no(self, minimal_cfg): cfg = DictDefault({"hub_model_id": "test", "save_strategy": "no"}) | minimal_cfg - with self._caplog.at_level(logging.WARNING): + with self._caplog.at_level("WARNING"): validate_config(cfg) assert len(self._caplog.records) == 1 @@ -1118,7 +1117,7 @@ class TestValidation(BaseValidation): DictDefault({"hub_model_id": "test", "save_strategy": "test"}) | minimal_cfg ) - with self._caplog.at_level(logging.WARNING): + with self._caplog.at_level("WARNING"): validate_config(cfg) assert len(self._caplog.records) == 1 @@ -1128,7 +1127,7 @@ class 
TestValidation(BaseValidation): | minimal_cfg ) - with self._caplog.at_level(logging.WARNING): + with self._caplog.at_level("WARNING"): validate_config(cfg) assert len(self._caplog.records) == 0 @@ -1138,28 +1137,28 @@ class TestValidation(BaseValidation): | minimal_cfg ) - with self._caplog.at_level(logging.WARNING): + with self._caplog.at_level("WARNING"): validate_config(cfg) assert len(self._caplog.records) == 0 def test_hub_model_id_save_value_none(self, minimal_cfg): cfg = DictDefault({"hub_model_id": "test", "save_strategy": None}) | minimal_cfg - with self._caplog.at_level(logging.WARNING): + with self._caplog.at_level("WARNING"): validate_config(cfg) assert len(self._caplog.records) == 0 def test_hub_model_id_save_value_no_set_save_strategy(self, minimal_cfg): cfg = DictDefault({"hub_model_id": "test"}) | minimal_cfg - with self._caplog.at_level(logging.WARNING): + with self._caplog.at_level("WARNING"): validate_config(cfg) assert len(self._caplog.records) == 0 def test_dpo_beta_deprecation(self, minimal_cfg): cfg = DictDefault({"dpo_beta": 0.2}) | minimal_cfg - with self._caplog.at_level(logging.WARNING): + with self._caplog.at_level("WARNING"): new_cfg = validate_config(cfg) assert new_cfg["rl_beta"] == 0.2 assert new_cfg["dpo_beta"] is None @@ -1175,7 +1174,7 @@ class TestValidation(BaseValidation): | minimal_cfg ) - with self._caplog.at_level(logging.WARNING): + with self._caplog.at_level("WARNING"): new_cfg = validate_config(cfg) assert new_cfg.eval_strategy == "steps" assert ( @@ -1203,7 +1202,7 @@ class TestValidation(BaseValidation): cfg, capabilities=capabilities, env_capabilities=env_capabilities ) - env_capabilities = {"torch_version": "2.5.1"} + env_capabilities = {"torch_version": "2.6.0"} capabilities = {"bf16": False} _ = validate_config( cfg, capabilities=capabilities, env_capabilities=env_capabilities @@ -1245,7 +1244,7 @@ class TestTorchCompileValidation(BaseValidation): | minimal_cfg ) - env_capabilities = {"torch_version": "2.5.1"} + env_capabilities = {"torch_version": "2.6.0"} capabilities = {"bf16": True} updated_cfg = validate_config( cfg, capabilities=capabilities, env_capabilities=env_capabilities @@ -1455,7 +1454,7 @@ class TestValidationWandb(BaseValidation): | minimal_cfg ) - with self._caplog.at_level(logging.WARNING): + with self._caplog.at_level("WARNING"): new_cfg = validate_config(cfg) assert any( "wandb_run_id sets the ID of the run. If you would like to set the name, please use wandb_name instead." 
@@ -1505,7 +1504,6 @@ class TestValidationWandb(BaseValidation): assert os.environ.get("WANDB_MODE", "") == "online" assert os.environ.get("WANDB_WATCH", "") == "false" assert os.environ.get("WANDB_LOG_MODEL", "") == "checkpoint" - assert os.environ.get("WANDB_DISABLED", "") != "true" os.environ.pop("WANDB_PROJECT", None) os.environ.pop("WANDB_NAME", None) @@ -1514,16 +1512,12 @@ class TestValidationWandb(BaseValidation): os.environ.pop("WANDB_MODE", None) os.environ.pop("WANDB_WATCH", None) os.environ.pop("WANDB_LOG_MODEL", None) - os.environ.pop("WANDB_DISABLED", None) def test_wandb_set_disabled(self, minimal_cfg): cfg = DictDefault({}) | minimal_cfg - new_cfg = validate_config(cfg) - setup_wandb_env_vars(new_cfg) - - assert os.environ.get("WANDB_DISABLED", "") == "true" + assert new_cfg.use_wandb is None cfg = ( DictDefault( @@ -1535,13 +1529,10 @@ class TestValidationWandb(BaseValidation): ) new_cfg = validate_config(cfg) - setup_wandb_env_vars(new_cfg) - - assert os.environ.get("WANDB_DISABLED", "") != "true" + assert new_cfg.use_wandb is True os.environ.pop("WANDB_PROJECT", None) - os.environ.pop("WANDB_DISABLED", None) @pytest.mark.skipif(is_comet_available() is False, reason="comet_ml is not installed") @@ -1699,3 +1690,18 @@ class TestValidationMLflow(BaseValidation): assert new_cfg.use_mlflow is True os.environ.pop("MLFLOW_EXPERIMENT_NAME", None) + + +class TestDataloaderValidation(BaseValidation): + """ + tests for dataloader_* sane defaults + """ + + def test_dataloader_auto_defaults(self, minimal_cfg): + cfg = minimal_cfg + + new_cfg = validate_config(cfg, {"n_gpu": 8}, {"torch_version": "2.6.0"}) + + assert new_cfg.dataloader_num_workers == 8 + assert new_cfg.dataloader_pin_memory is True + assert new_cfg.dataloader_prefetch_factor == 256 diff --git a/tests/prompt_strategies/conftest.py b/tests/prompt_strategies/conftest.py index fe59e00d8..7f942e0ef 100644 --- a/tests/prompt_strategies/conftest.py +++ b/tests/prompt_strategies/conftest.py @@ -143,6 +143,12 @@ def fixture_phi35_tokenizer(): return tokenizer +@pytest.fixture(name="phi4_tokenizer", scope="session", autouse=True) +def fixture_phi4_tokenizer(): + tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-reasoning") + return tokenizer + + @pytest.fixture(name="gemma2_tokenizer", scope="session", autouse=True) def fixture_gemma2_tokenizer(): tokenizer = AutoTokenizer.from_pretrained("mlx-community/gemma-2-9b-it-4bit") @@ -150,6 +156,30 @@ def fixture_gemma2_tokenizer(): return tokenizer +@pytest.fixture(name="magistral_tokenizer") +def fixture_magistral_tokenizer(): + from axolotl.utils.mistral import HFMistralTokenizer + + tokenizer = HFMistralTokenizer.from_pretrained("mistralai/Magistral-Small-2506") + return tokenizer + + +@pytest.fixture(name="devstral_tokenizer") +def fixture_devstral_tokenizer(): + from axolotl.utils.mistral import HFMistralTokenizer + + tokenizer = HFMistralTokenizer.from_pretrained("mistralai/Devstral-Small-2505") + return tokenizer + + +@pytest.fixture(name="devstral_1_1_tokenizer") +def fixture_devstral_1_1_tokenizer(): + from axolotl.utils.mistral import HFMistralTokenizer + + tokenizer = HFMistralTokenizer.from_pretrained("mistralai/Devstral-Small-2507") + return tokenizer + + @pytest.fixture(name="mistralv03_tokenizer_chat_template_jinja") def fixture_mistralv03_chat_template_jinja_w_system() -> str: return '{%- if messages[0]["role"] == "system" %}\n {%- set system_message = messages[0]["content"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = 
messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == "tool" or message.role == "tool_results" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message["role"] == "user") != (ns.index % 2 == 0) %}\n {{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if message["role"] == "user" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- "[AVAILABLE_TOOLS] [" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- \'{"type": "function", "function": {\' }}\n {%- for key, val in tool.items() if key != "return" %}\n {%- if val is string %}\n {{- \'"\' + key + \'": "\' + val + \'"\' }}\n {%- else %}\n {{- \'"\' + key + \'": \' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- ", " }}\n {%- endif %}\n {%- endfor %}\n {{- "}}" }}\n {%- if not loop.last %}\n {{- ", " }}\n {%- else %}\n {{- "]" }}\n {%- endif %}\n {%- endfor %}\n {{- "[/AVAILABLE_TOOLS]" }}\n {%- endif %}\n {%- if loop.first and system_message is defined %}\n {{- "[INST] " + system_message + "\\n\\n" + message["content"] + "[/INST]" }}\n {%- else %}\n {{- "[INST] " + message["content"] + "[/INST]" }}\n {%- endif %}\n {%- elif message.tool_calls is defined and message.tool_calls is not none %}\n {{- "[TOOL_CALLS] [" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }}\n {%- endif %}\n {{- \', "id": "\' + tool_call.id + \'"}\' }}\n {%- if not loop.last %}\n {{- ", " }}\n {%- else %}\n {{- "]" + eos_token }}\n {%- endif %}\n {%- endfor %}\n {%- elif message["role"] == "assistant" %}\n {{- " " + message["content"]|trim + eos_token}}\n {%- elif message["role"] == "tool_results" or message["role"] == "tool" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- \'[TOOL_RESULTS] {"content": \' + content|string + ", " }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }}\n {%- endif %}\n {{- \'"call_id": "\' + message.tool_call_id + \'"}[/TOOL_RESULTS]\' }}\n {%- else %}\n {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }}\n {%- endif %}\n{%- endfor %}\n' diff --git a/tests/prompt_strategies/messages/test_chat.py b/tests/prompt_strategies/messages/test_chat.py index 2681bb743..a4c2ae67f 100644 --- a/tests/prompt_strategies/messages/test_chat.py +++ b/tests/prompt_strategies/messages/test_chat.py @@ -3,14 +3,13 @@ tests for chat_template prompt strategy """ # pylint: disable=duplicate-code -import logging import unittest from 
axolotl.prompt_strategies.messages.chat import load from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger -logging.basicConfig(level=logging.DEBUG) -LOG = logging.getLogger("axolotl") +LOG = get_logger(__name__, log_level="DEBUG") class TestMessagesChatLlama3: diff --git a/tests/prompt_strategies/test_chat_template_ds_schema_unification.py b/tests/prompt_strategies/test_chat_template_ds_schema_unification.py new file mode 100644 index 000000000..502efae4b --- /dev/null +++ b/tests/prompt_strategies/test_chat_template_ds_schema_unification.py @@ -0,0 +1,75 @@ +""" +Tests for chat template prompt strategy with schema unification for none fields +""" + +import json + +import pytest +from datasets import Dataset +from transformers import AutoTokenizer + +from axolotl.prompt_strategies.chat_template import StrategyLoader +from axolotl.utils.dict import DictDefault + + +@pytest.fixture(name="messages_w_tools") +def fixture_messages_w_tools(): + jsons = """ +{"messages":[{"role":"user","content":"move to (0, 1)"},{"role":"assistant","content":"","tool_calls":[{"function":{"name":"move","arguments":{"x":0,"y":1}}}]}],"tools":[{"type":"function","function":{"name":"move","description":"Move to a given location measured in meters","parameters":{"type":"object","properties":{"x":{"type":"number","description":"The x coordinate of the location, negative values are to the left, positive values are to the right"},"y":{"type":"number","description":"The y coordinate of the location, negative values are backward, positive values are forward"}},"required":["x","y"]}}},{"type":"function","function":{"name":"turn","description":"Turn the robot to a given direction","parameters":{"type":"object","properties":{"theta":{"type":"integer","description":"The angle to turn to, in degrees, positive values are counter-clockwise, negative values are clockwise"}},"required":["theta"]}}},{"type":"function","function":{"name":"invalid_prompt","description":"call when the user's prompt is invalid","parameters":{"type":"object","properties":{"message":{"type":"string","description":"why the prompt is invalid"}},"required":["message"]}}}],"add_generation_prompt":false} +{"messages":[{"role":"user","content":"turn 270 degree"},{"role":"assistant","content":"","tool_calls":[{"function":{"name":"turn","arguments":{"theta": 270}}}]}],"tools":[{"type":"function","function":{"name":"move","description":"Move to a given location measured in meters","parameters":{"type":"object","properties":{"x":{"type":"number","description":"The x coordinate of the location, negative values are to the left, positive values are to the right"},"y":{"type":"number","description":"The y coordinate of the location, negative values are backward, positive values are forward"}},"required":["x","y"]}}},{"type":"function","function":{"name":"turn","description":"Turn the robot to a given direction","parameters":{"type":"object","properties":{"theta":{"type":"integer","description":"The angle to turn to, in degrees, positive values are counter-clockwise, negative values are clockwise"}},"required":["theta"]}}},{"type":"function","function":{"name":"invalid_prompt","description":"call when the user's prompt is invalid","parameters":{"type":"object","properties":{"message":{"type":"string","description":"why the prompt is invalid"}},"required":["message"]}}}],"add_generation_prompt":false} +{"messages":[{"role":"user","content":"jump 
high"},{"role":"assistant","content":"","tool_calls":[{"function":{"name":"invalid_prompt","arguments":{"message": "jump is not a valid action"}}}]}],"tools":[{"type":"function","function":{"name":"move","description":"Move to a given location measured in meters","parameters":{"type":"object","properties":{"x":{"type":"number","description":"The x coordinate of the location, negative values are to the left, positive values are to the right"},"y":{"type":"number","description":"The y coordinate of the location, negative values are backward, positive values are forward"}},"required":["x","y"]}}},{"type":"function","function":{"name":"turn","description":"Turn the robot to a given direction","parameters":{"type":"object","properties":{"theta":{"type":"integer","description":"The angle to turn to, in degrees, positive values are counter-clockwise, negative values are clockwise"}},"required":["theta"]}}},{"type":"function","function":{"name":"invalid_prompt","description":"call when the user's prompt is invalid","parameters":{"type":"object","properties":{"message":{"type":"string","description":"why the prompt is invalid"}},"required":["message"]}}}],"add_generation_prompt":false} + """.strip().split( + "\n" + ) + rows = [json.loads(row) for row in jsons] + return Dataset.from_list(rows) + + +@pytest.fixture(name="qwen3_tokenizer") +def qwen3_tokenizer_fixture( + download_qwen3_half_billion_model, +): # pylint: disable=unused-argument + tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B") + + return tokenizer + + +@pytest.fixture(name="qwen3_prompt_strategy") +def qwen3_chat_template_strategy(qwen3_tokenizer): + cfg = DictDefault( + sequence_len=2048, + chat_template="qwen3", + eot_tokens=["<|im_end|>"], + ) + ds_cfg = DictDefault( + type="chat_template", + ) + load = StrategyLoader() + strat = load(qwen3_tokenizer, cfg, ds_cfg) + return strat + + +class TestSchemaUnification: + """ + Test class on handling null fields for tool calling + """ + + def test_schema_unification_single_prompt( + self, messages_w_tools, qwen3_prompt_strategy, qwen3_tokenizer + ): + for row in messages_w_tools: + inputs = qwen3_prompt_strategy.tokenize_prompt(row) + decoded = qwen3_tokenizer.decode(inputs["input_ids"]) + tool_call = decoded.split("")[-1].split("")[0] + assert '"message": null' not in tool_call + assert '"theta": null' not in tool_call + + def test_schema_unification_batched( + self, messages_w_tools, qwen3_prompt_strategy, qwen3_tokenizer + ): + rows = messages_w_tools.map(qwen3_prompt_strategy.tokenize_prompt, batched=True) + for row in rows: + decoded = qwen3_tokenizer.decode(row["input_ids"]) + tool_call = decoded.split("")[-1].split("")[0] + assert '"message": null' not in tool_call + assert '"theta": null' not in tool_call diff --git a/tests/prompt_strategies/test_chat_templates.py b/tests/prompt_strategies/test_chat_templates.py index 68772b56b..371ccf616 100644 --- a/tests/prompt_strategies/test_chat_templates.py +++ b/tests/prompt_strategies/test_chat_templates.py @@ -2,7 +2,6 @@ tests for chat_template prompt strategy """ -import logging import unittest from axolotl.prompt_strategies.chat_template import ( @@ -13,9 +12,9 @@ from axolotl.prompt_strategies.chat_template import ( from axolotl.prompters import IGNORE_TOKEN_ID from axolotl.utils.chat_templates import get_chat_template from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger -logging.basicConfig(level=logging.DEBUG) -LOG = logging.getLogger("axolotl") +LOG = get_logger(__name__) class 
TestAssistantChatTemplateLlama3: diff --git a/tests/prompt_strategies/test_chat_templates_advanced.py b/tests/prompt_strategies/test_chat_templates_advanced.py index 38a5b6c43..f847cab4a 100644 --- a/tests/prompt_strategies/test_chat_templates_advanced.py +++ b/tests/prompt_strategies/test_chat_templates_advanced.py @@ -4,7 +4,6 @@ tests for chat_template prompt strategy # pylint: disable=too-many-lines -import logging from copy import deepcopy import pytest @@ -18,11 +17,11 @@ from axolotl.prompt_strategies.chat_template import ( ) from axolotl.prompters import IGNORE_TOKEN_ID from axolotl.utils.chat_templates import get_chat_template +from axolotl.utils.logging import get_logger from tests.hf_offline_utils import enable_hf_offline -logging.basicConfig(level=logging.DEBUG) -LOG = logging.getLogger("axolotl") +LOG = get_logger(__name__) PARAMETRIZE_KEYS = "tokenizer, chat_template, chat_template_jinja, eos_token" PARAMETRIZE_PARAMS = [ @@ -34,15 +33,14 @@ PARAMETRIZE_PARAMS = [ "mistralv03_tokenizer_chat_template_jinja", "[/INST]", ), - # TODO: temporarily skip gemma due to gemma3 template - # Re-enable on new chat_template implementation for perf - # ( - # "gemma2_tokenizer", - # "jinja", - # "gemma2_tokenizer_chat_template_jinja", - # "", - # ), + ( + "gemma2_tokenizer", + "jinja", + "gemma2_tokenizer_chat_template_jinja", + "", + ), ("phi35_tokenizer", "phi_35", None, "<|end|>"), + ("phi4_tokenizer", "phi_4", None, "<|im_end|>"), ] @@ -96,11 +94,7 @@ class TestChatTemplateConfigurations: if ( turn_idx == 0 and turn.get("from") in ["system", "context"] - and ( - "mistral" in tokenizer.name_or_path.lower() - or "gemma" - in tokenizer.name_or_path.lower() # temporarily skip gemma due to gemma3 template - ) + and ("mistral" in tokenizer.name_or_path.lower()) ): assert ( start_idx == -1 and end_idx == -1 @@ -936,36 +930,14 @@ class TestChatTemplateConfigurations: "messages", ) - if chat_template == "llama3": - assert variables == {"role", "content"}, ( - f"Expected variables: {'role', 'content'} from {tokenizer}/{chat_template}\n" - f"Got: {variables}\n" - f"Chat template: {actual_jinja_template}" - ) - elif chat_template == "chatml": - assert variables == {"role", "content"}, ( - f"Expected variables: {'role', 'content'} from {tokenizer}/{chat_template}\n" - f"Got: {variables}\n" - f"Chat template: {actual_jinja_template}" - ) - elif chat_template == "jinja" and tokenizer == "mistralv03_tokenizer": - assert variables == {"role", "content", "tool_call_id", "tool_calls"}, ( - f"Expected variables: {'role', 'content', 'tool_call_id', 'tool_calls'} from {tokenizer}/{chat_template}\n" - f"Got: {variables}\n" - f"Chat template: {actual_jinja_template}" - ) - elif chat_template == "jinja" and tokenizer == "gemma2_tokenizer": - assert variables == {"role", "content"}, ( - f"Expected variables: {'role', 'content'} from {tokenizer}/{chat_template}\n" - f"Got: {variables}\n" - f"Chat template: {actual_jinja_template}" - ) - elif chat_template == "phi_35": - assert variables == {"role", "content"}, ( - f"Expected variables: {'role', 'content'} from {tokenizer}/{chat_template}\n" - f"Got: {variables}\n" - f"Chat template: {actual_jinja_template}" - ) + # Special case for Mistral with additional tool variables + if chat_template == "jinja" and tokenizer == "mistralv03_tokenizer": + expected_variables = {"role", "content", "tool_call_id", "tool_calls"} + # Most chat templates use the standard role and content variables + elif chat_template in ["llama3", "chatml", "phi_35", "phi_4"] or ( + chat_template == 
"jinja" and tokenizer == "gemma2_tokenizer" + ): + expected_variables = {"role", "content"} else: LOG.warning( f"Unsupported chat template: {chat_template} with {chat_template_jinja}" @@ -974,6 +946,12 @@ class TestChatTemplateConfigurations: f"Unsupported chat template: {chat_template} with {chat_template_jinja}" ) + assert variables == expected_variables, ( + f"Expected variables: {expected_variables} from {tokenizer}/{chat_template}\n" + f"Got: {variables}\n" + f"Chat template: {actual_jinja_template}" + ) + def test_eot_tokens_conflict_with_eos_token( self, tokenizer, @@ -1281,3 +1259,162 @@ class TestChatTemplateConfigurations: assert ( labels[eos_idx] != IGNORE_TOKEN_ID ), f"Expected EOT token at index {eos_idx} to be labeled with train_on_eot='{setting}'" + + +class TestChatTemplateToolCalling: + """ + Test class for tool calling functionality with chat templates. + """ + + def test_tool_calling_with_llama4_template( + self, + llama3_tokenizer, + ): + LOG.info("Testing tool calling with llama3 tokenizer and llama4 chat template") + + # Create tool calling dataset + tool_calling_dataset = [ + { + "tools": [ + { + "type": "function", + "function": { + "name": "xml_escape", + "description": 'Replaces any "<", ">", or "&" characters in the input string with their corresponding XML entities.', + "parameters": { + "type": "object", + "properties": { + "s": { + "type": "string", + "description": "The input string to be XML-escaped.", + } + }, + "required": ["s"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "multiples", + "description": "Generates a list of all the multiples of a number that are less than a given limit.", + "parameters": { + "type": "object", + "properties": { + "number": { + "type": "integer", + "description": "The number to find multiples of.", + }, + "limit": { + "type": "integer", + "description": "The upper limit for the multiples.", + }, + }, + "required": ["number", "limit"], + }, + }, + }, + ], + "messages": [ + { + "role": "user", + "content": "Can you help me find multiples of 5 that are less than 20?", + }, + { + "role": "assistant", + "tool_calls": [ + { + "type": "function", + "function": { + "name": "multiples", + "arguments": { + "number": 5, + "limit": 20, + }, + }, + } + ], + }, + {"role": "tool", "name": "multiples", "content": "5,10,15"}, + { + "role": "assistant", + "content": "The multiples of 5 less than 20 are: 5, 10, and 15.", + }, + ], + } + ] + + # Setup tokenizer with llama4 chat template + tokenizer = deepcopy(llama3_tokenizer) + + # Add EOS token to the tokenizer + eot_token = "<|eot_id|>" + tokenizer.add_special_tokens({"additional_special_tokens": [eot_token]}) + + strategy = ChatTemplateStrategy( + ChatTemplatePrompter( + tokenizer, + chat_template=get_chat_template("llama4"), + message_property_mappings={"role": "role", "content": "content"}, + field_messages="messages", + field_tools="tools", + ), + tokenizer=tokenizer, + train_on_inputs=False, + sequence_len=512, + roles_to_train=["assistant"], + eot_tokens=[eot_token], + ) + + res = strategy.tokenize_prompt(tool_calling_dataset[0]) + input_ids = res["input_ids"] + labels = res["labels"] + + # Verify that the input_ids contain expected tokens + assert len(input_ids) > 0, "Input IDs should not be empty" + assert len(labels) == len(input_ids), "Labels should match input_ids length" + + # Decode the full conversation to verify structure + decoded_conversation = tokenizer.decode(input_ids) + + # Verify tool calling structure is present in the decoded conversation + 
assert ( + '"type": "function",' in decoded_conversation + ), "Tool type function should be in conversation" + assert ( + '"name": "multiples",' in decoded_conversation + ), "Tool function name should be in conversation" + + assert ( + '<|python_start|><|python_end|>{"name": "multiples", "parameters": {"number": 5, "limit": 20}}<|eot|>' + in decoded_conversation + ), "Assistant tool call should be in conversation" + assert ( + "<|header_start|>ipython<|header_end|>" in decoded_conversation + ), "IPython header should be in conversation" + assert ( + '"5,10,15"' in decoded_conversation + ), "Tool response should be in conversation" + + # Get conversation turns to verify labeling + turns = strategy.get_conversation_thread(tool_calling_dataset[0]) + tools = strategy._get_tools( # pylint: disable=protected-access + tool_calling_dataset[0] + ) + + # Check that assistant responses are properly labeled + for i, turn in enumerate(tool_calling_dataset[0]["messages"]): + if turn["role"] == "assistant": + start_idx, end_idx = strategy.find_turn( + turns=turns, turn_idx=i, tools=tools + ) + + assert ( + start_idx != -1 and end_idx != -1 + ), f"Assistant turn {i} should be found" + + # Verify that assistant responses have proper labels + turn_labels = labels[start_idx:end_idx] + assert all( + label != IGNORE_TOKEN_ID for label in turn_labels + ), f"Assistant turn {i} should be unmasked" diff --git a/tests/prompt_strategies/test_chat_templates_mistral.py b/tests/prompt_strategies/test_chat_templates_mistral.py new file mode 100644 index 000000000..a5b31a771 --- /dev/null +++ b/tests/prompt_strategies/test_chat_templates_mistral.py @@ -0,0 +1,851 @@ +"""Test chat templates for mistral-common wrapper tokenizer""" + +import unittest +from typing import TYPE_CHECKING + +import pytest + +if TYPE_CHECKING: + from transformers import PreTrainedTokenizer + + from axolotl.utils.mistral import HFMistralTokenizer + + +# fmt: off +@pytest.mark.parametrize( + ("tokenizer_str", "assistant_toolcall_ids", "tool_result_ids"), + ( + ("magistral_tokenizer", (9, 44627, 3684, 33, 19881, 1049, 1050, 1051, 1052, 1053, 32, 19227, 12856, 2811, 1032, 1049, 1054, 1044, 1429, 33319, 2811, 1032, 1050, 1125, 2), (7, 19881, 1049, 1050, 1051, 1052, 1053, 19, 1049, 1044, 1050, 8)), + ("devstral_tokenizer", (9, 1091, 19227, 2391, 2811, 1429, 44627, 3684, 1897, 1429, 61906, 2811, 16753, 12856, 2811, 1032, 1049, 1054, 1044, 1429, 33319, 2811, 1032, 1050, 4179, 1429, 1327, 2811, 1429, 19881, 1049, 1050, 1051, 1052, 1053, 1034, 27028, 2), (7, 19881, 1049, 1050, 1051, 1052, 1053, 19, 1049, 1044, 1050, 8)), + ("devstral_1_1_tokenizer", (9, 44627, 3684, 32, 19227, 12856, 2811, 1032, 1049, 1054, 1044, 1429, 33319, 2811, 1032, 1050, 1125, 2,), (7, 1049, 1044, 1050, 8)), + ) +) +# fmt: on +def test_mistral_chat_template( + tokenizer_str: str, + assistant_toolcall_ids: tuple[int, ...], + tool_result_ids: tuple[int, ...], + request: pytest.FixtureRequest, +): + """Test chat template with the Magistral/Devstral tokenizer""" + # pylint: disable=duplicate-code + from axolotl.prompt_strategies.chat_template import MistralPrompter, MistralStrategy + + tokenizer: HFMistralTokenizer = request.getfixturevalue(tokenizer_str) + + # check bos, eos, pad, unk are accessible properties + assert tokenizer.bos_token_id == 1 + assert tokenizer.eos_token_id == 2 + assert tokenizer.pad_token_id == 11 + assert tokenizer.unk_token_id == 0 + + assert tokenizer.pad_token == "" + assert tokenizer.eos_token == "" + assert tokenizer.bos_token == "" + assert 
tokenizer.unk_token == "" + + strategy = MistralStrategy( + MistralPrompter( + tokenizer, + chat_template=None, + message_property_mappings={"role": "role", "content": "content"}, + ), + tokenizer=tokenizer, + train_on_inputs=False, + train_on_eos="turn", + sequence_len=512, + roles_to_train=["assistant"], + ) + + # test chat template masking without system prompt + res = strategy.tokenize_prompt( + { + "messages": [ + {"role": "user", "content": "Hello, how are you?"}, + {"role": "assistant", "content": "I'm doing great, thank you!"}, + ] + } + ) + + assert res["input_ids"] == [ + 1, # bos + 3, # [INST] + 22177, # Hello + 1044, # , + 2606, # how + 1584, # are + 1636, # you + 1063, # ? + 4, # [/INST] + 1073, # I + 4525, # 'm + 6965, # doing + 4824, # great + 1044, # , + 15412, # thank + 1636, # you + 1033, # ! + 2, # + ] + + assert res["labels"] == [ + -100, # bos + -100, # [INST] + -100, # Hello + -100, # , + -100, # how + -100, # are + -100, # you + -100, # ? + -100, # [/INST] + 1073, # I + 4525, # 'm + 6965, # doing + 4824, # great + 1044, # , + 15412, # thank + 1636, # you + 1033, # ! + 2, # + ] + + # test chat template masking with system prompt + res = strategy.tokenize_prompt( + { + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello, how are you?"}, + {"role": "assistant", "content": "I'm doing great, thank you!"}, + ] + } + ) + + assert res["input_ids"] == [ + 1, # bos + 17, # [SYSTEM_PROMPT] + 4568, # You + 1584, # are + 1261, # a + 20351, # helpful + 27089, # assistant + 1046, # . + 18, # [/SYSTEM_PROMPT] + 3, # [INST] + 22177, # Hello + 1044, # , + 2606, # how + 1584, # are + 1636, # you + 1063, # ? + 4, # [/INST] + 1073, # I + 4525, # 'm + 6965, # doing + 4824, # great + 1044, # , + 15412, # thank + 1636, # you + 1033, # ! + 2, # + ] + + assert res["labels"] == [ + -100, # bos + -100, # [SYSTEM_PROMPT] + -100, # You + -100, # are + -100, # a + -100, # helpful + -100, # assistant + -100, # . + -100, # [/SYSTEM_PROMPT] + -100, # [INST] + -100, # Hello + -100, # , + -100, # how + -100, # are + -100, # you + -100, # ? + -100, # [/INST] + 1073, # I + 4525, # 'm + 6965, # doing + 4824, # great + 1044, # , + 15412, # thank + 1636, # you + 1033, # ! + 2, # + ] + + # test chat template with tools + res = strategy.tokenize_prompt( + { + "tools": [ + { + "type": "function", + "function": { + "name": "multiples", + "description": "Generates a list of all the multiples of a number that are less than a given limit.", + "parameters": { + "type": "object", + "properties": { + "number": { + "type": "integer", + "description": "The number to find multiples of.", + }, + "limit": { + "type": "integer", + "description": "The upper limit for the multiples.", + }, + }, + "required": ["number", "limit"], + }, + }, + }, + ], + "messages": [ + { + "role": "user", + "content": "Hey, can you give me a breakdown of how to throw an awesome themed party? Like, what themes work best, and how can I set everything up to really wow my guests? 
I want some ideas on decorations, food, and activities that will make the party unforgettable!", + }, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call12345", + "type": "function", + "function": { + "name": "multiples", + "arguments": { + "number": 16, + "limit": 2, + }, + }, + } + ], + }, + { + "role": "tool", + "tool_call_id": "call12345", + "name": "multiples", + "content": "1,2", + }, + {"role": "assistant", "content": "The multiples of 16 is 1 and 2."}, + ], + } + ) + + # fmt: off + assert res["input_ids"] == [ + 1, # bos + 5, 1091, 19227, 4994, 2811, 1429, 5165, 1897, 1429, 5165, 2811, 16753, 2391, 2811, 1429, 44627, 3684, 1897, 1429, 14653, 2811, 1429, 10639, 2130, 1261, 2951, 1307, 1747, 1278, 60092, 1307, 1261, 2782, 1455, 1584, 4289, 2224, 1261, 4265, 6139, 39249, 1429, 26204, 2811, 16753, 4994, 2811, 1429, 6371, 1897, 1429, 48649, 2811, 16753, 12856, 2811, 16753, 4994, 2811, 1429, 49039, 1897, 1429, 14653, 2811, 1429, 1784, 2782, 1317, 3081, 60092, 1307, 2613, 4179, 1429, 33319, 2811, 16753, 4994, 2811, 1429, 49039, 1897, 1429, 14653, 2811, 1429, 1784, 9229, 6139, 1394, 1278, 60092, 2613, 47579, 1429, 15760, 2811, 12161, 12856, 1897, 1429, 33319, 4964, 2821, 27028, 6, # tool prompt + 3, 46634, 1044, 1710, 1636, 5628, 1639, 1261, 44433, 1307, 2606, 1317, 5388, 1420, 54191, 2424, 1286, 8967, 1063, 15621, 1044, 2549, 30305, 2196, 3560, 1044, 1321, 2606, 1710, 1362, 2016, 8605, 2015, 1317, 5524, 118931, 2036, 32951, 1063, 1362, 2933, 2269, 12106, 1408, 101987, 1044, 6939, 1044, 1321, 9216, 1455, 2084, 3180, 1278, 8967, 119141, 1689, 5935, 1033, 4, # user + *assistant_toolcall_ids, # assistant tool calling + *tool_result_ids, # tool result + 1784, 60092, 1307, 1032, 1049, 1054, 1395, 1032, 1049, 1321, 1032, 1050, 1046, # assistant + 2 # eos + ] + + assert res["labels"] == [ + -100, # bos + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, # tool prompt + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, # user prompt + *assistant_toolcall_ids, # assistant tool calling + *([-100] * len(tool_result_ids)), # tool result + 1784, 60092, 1307, 1032, 1049, 1054, 1395, 1032, 1049, 1321, 1032, 1050, 1046, # assistant + 2 # eos + ] + # fmt: on + + # test chat template with tokenize=False + res = tokenizer.apply_chat_template( + [ + {"role": "user", "content": "Hello, how are you?"}, + {"role": "assistant", "content": "I'm doing great, thank you!"}, + ], + tokenize=False, + ) + + assert res == "[INST]Hello, how are you?[/INST]I'm doing great, thank you!" 
+ + # test encode + res = tokenizer.encode("Hello, how are you?", add_special_tokens=True) + assert res == [ + 1, # bos + 22177, # Hello + 1044, # , + 2606, # how + 1584, # are + 1636, # you + 1063, # ? + 2, # eos + ] + + # test decode no skip special tokens + decoded_res = tokenizer.decode(res, skip_special_tokens=False) + + assert decoded_res == "<s>Hello, how are you?</s>" + + # test decode skip special tokens + decoded_res = tokenizer.decode(res, skip_special_tokens=True) + assert decoded_res == "Hello, how are you?" + + # test encode no special tokens + res = tokenizer.encode("Hello, how are you?", add_special_tokens=False) + assert res == [ + 22177, # Hello + 1044, # , + 2606, # how + 1584, # are + 1636, # you + 1063, # ? + ] + + # test convert ids to tokens + res = tokenizer.convert_ids_to_tokens(res) + # spacing is needed as we are converting without decoding + assert res == ["Hello", ",", " how", " are", " you", "?"] + + +@pytest.mark.skip(reason="TODO, fix for new HF wrapper call") +def test_magistral_tokenizer_pad_method(magistral_tokenizer: "HFMistralTokenizer"): + """Test the MistralTokenizer pad method""" + from axolotl.utils.collators.core import IGNORE_INDEX + + magistral_pad_token_id = 11 # taken from tokenizer.pad_token_id + + # Test padding with input_ids and labels only + features = [ + {"input_ids": [1, 2, 3], "labels": [4, 5, 6]}, + {"input_ids": [7, 8], "labels": [9, 10]}, + ] + + result = magistral_tokenizer.pad(features, padding=True, return_tensors="pt") + + # Check that input_ids are padded correctly + assert result["input_ids"].shape == (2, 3) + assert result["input_ids"].tolist() == [[1, 2, 3], [7, 8, magistral_pad_token_id]] + + # Check that labels are padded correctly + assert result["labels"].shape == (2, 3) + assert result["labels"].tolist() == [[4, 5, 6], [9, 10, IGNORE_INDEX]] + + # Check that attention_mask and position_ids are NOT created + assert "attention_mask" not in result + assert "position_ids" not in result + + # Test padding with attention_mask + features_with_attention = [ + {"input_ids": [1, 2, 3], "labels": [4, 5, 6], "attention_mask": [1, 1, 1]}, + {"input_ids": [7, 8], "labels": [9, 10], "attention_mask": [1, 1]}, + ] + + result = magistral_tokenizer.pad( + features_with_attention, padding=True, return_tensors="pt" + ) + + # Check that attention_mask is padded correctly + assert result["attention_mask"].shape == (2, 3) + assert result["attention_mask"].tolist() == [[1, 1, 1], [1, 1, 0]] + + # Test padding with position_ids + features_with_position = [ + {"input_ids": [1, 2, 3], "labels": [4, 5, 6], "position_ids": [0, 1, 2]}, + {"input_ids": [7, 8], "labels": [9, 10], "position_ids": [0, 1]}, + ] + + result = magistral_tokenizer.pad( + features_with_position, padding=True, return_tensors="pt" + ) + + # Check that position_ids are padded correctly (continuing sequence) + assert result["position_ids"].shape == (2, 3) + assert result["position_ids"].tolist() == [[0, 1, 2], [0, 1, 2]] + + # Test padding with all fields + features_all = [ + { + "input_ids": [1, 2, 3], + "labels": [4, 5, 6], + "attention_mask": [1, 1, 1], + "position_ids": [0, 1, 2], + }, + { + "input_ids": [7, 8], + "labels": [9, 10], + "attention_mask": [1, 1], + "position_ids": [0, 1], + }, + ] + + result = magistral_tokenizer.pad(features_all, padding=True, return_tensors="pt") + + # All fields should be present and correctly padded + assert "input_ids" in result + assert "labels" in result + assert "attention_mask" in result + assert "position_ids" in result + + # Test padding
with all sequences same length + features_same_length = [ + {"input_ids": [1, 2, 3], "labels": [4, 5, 6]}, + {"input_ids": [7, 8, 9], "labels": [10, 11, 12]}, + ] + + result = magistral_tokenizer.pad( + features_same_length, padding=True, return_tensors="pt" + ) + + # Check match when no padding is needed + assert result["input_ids"][0].tolist() == features_same_length[0]["input_ids"] + assert result["labels"][0].tolist() == features_same_length[0]["labels"] + + assert result["input_ids"][1].tolist() == features_same_length[1]["input_ids"] + assert result["labels"][1].tolist() == features_same_length[1]["labels"] + + # Test padding with max_length parameter + result = magistral_tokenizer.pad( + features, padding="max_length", max_length=5, return_tensors="pt" + ) + + # Should pad to max_length + assert result["input_ids"].shape == (2, 5) + assert result["labels"].shape == (2, 5) + + # Test numpy return type + result = magistral_tokenizer.pad(features, padding=True, return_tensors="np") + + # Should return numpy arrays + import numpy as np + + assert isinstance(result["input_ids"], np.ndarray) + assert isinstance(result["labels"], np.ndarray) + + # Test unsupported field rejection + features_unsupported = [ + {"input_ids": [1, 2, 3], "labels": [4, 5, 6], "unsupported_field": [7, 8, 9]}, + ] + + with pytest.raises(NotImplementedError, match="unsupported_field"): + magistral_tokenizer.pad(features_unsupported, padding=True, return_tensors="pt") + + # Test token_type_ids rejection + features_token_type = [ + {"input_ids": [1, 2, 3], "labels": [4, 5, 6], "token_type_ids": [0, 0, 0]}, + ] + + with pytest.raises(ValueError, match="token_type_ids is not supported"): + magistral_tokenizer.pad(features_token_type, padding=True, return_tensors="pt") + + +def test_magistral_tool_calling(magistral_tokenizer: "HFMistralTokenizer"): + """Test tool calling with the Magistral tokenizer""" + from axolotl.prompt_strategies.chat_template import MistralPrompter, MistralStrategy + + strategy = MistralStrategy( + MistralPrompter( + magistral_tokenizer, + chat_template=None, + message_property_mappings={"role": "role", "content": "content"}, + ), + tokenizer=magistral_tokenizer, + train_on_inputs=False, + train_on_eos="turn", + sequence_len=512, + roles_to_train=["assistant"], + ) + + # Test basic tool calling with single function + basic_tool_calling = { + "tools": [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. 
San Francisco, CA", + }, + }, + "required": ["location"], + }, + }, + }, + ], + "messages": [ + { + "role": "user", + "content": "What's the weather like in San Francisco?", + }, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call12345", + "type": "function", + "function": { + "name": "get_weather", + "arguments": { + "location": "San Francisco, CA", + }, + }, + } + ], + }, + { + "role": "tool", + "tool_call_id": "call12345", + "name": "get_weather", + "content": "Sunny, 72°F", + }, + { + "role": "assistant", + "content": "The weather in San Francisco is sunny and 72°F.", + }, + ], + } + + res = strategy.tokenize_prompt(basic_tool_calling) + + # Basic validation + assert "input_ids" in res + assert "labels" in res + assert len(res["input_ids"]) > 0 + assert len(res["labels"]) == len(res["input_ids"]) + + # Decode and verify structure + decoded = magistral_tokenizer.decode(res["input_ids"]) + assert ( + '[AVAILABLE_TOOLS][{"type": "function", "function": {"name": "get_weather", "description": "Get the current weather for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city and state, e.g. San Francisco, CA"}}, "required": ["location"]}}}][/AVAILABLE_TOOLS]' + in decoded + ) + assert ( + '[TOOL_CALLS]get_weather[CALL_ID]call12345[ARGS]{"location": "San Francisco, CA"}' + in decoded + ) + assert "[TOOL_RESULTS]call12345[TOOL_CONTENT]Sunny, 72°F[/TOOL_RESULTS]" in decoded + assert "The weather in San Francisco is sunny and 72°F." in decoded + + # Test multiple tool calls in sequence + multi_tool_calling = { + "tools": [ + { + "type": "function", + "function": { + "name": "add_numbers", + "description": "Add two numbers together", + "parameters": { + "type": "object", + "properties": { + "a": {"type": "number", "description": "First number"}, + "b": {"type": "number", "description": "Second number"}, + }, + "required": ["a", "b"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "multiply_numbers", + "description": "Multiply two numbers", + "parameters": { + "type": "object", + "properties": { + "x": {"type": "number", "description": "First number"}, + "y": {"type": "number", "description": "Second number"}, + }, + "required": ["x", "y"], + }, + }, + }, + ], + "messages": [ + { + "role": "user", + "content": "Add 5 and 3, then multiply the result by 2", + }, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call12345", + "type": "function", + "function": { + "name": "add_numbers", + "arguments": {"a": 5, "b": 3}, + }, + } + ], + }, + { + "role": "tool", + "tool_call_id": "call12345", + "name": "add_numbers", + "content": "8", + }, + { + "role": "assistant", + "tool_calls": [ + { + "id": "call23456", + "type": "function", + "function": { + "name": "multiply_numbers", + "arguments": {"x": 8, "y": 2}, + }, + } + ], + }, + { + "role": "tool", + "tool_call_id": "call23456", + "name": "multiply_numbers", + "content": "16", + }, + { + "role": "assistant", + "content": "The result is 16. 
I first added 5 and 3 to get 8, then multiplied 8 by 2 to get 16.", + }, + ], + } + + res = strategy.tokenize_prompt(multi_tool_calling) + + # Validation + assert len(res["input_ids"]) > 0 + assert len(res["labels"]) == len(res["input_ids"]) + + decoded = magistral_tokenizer.decode(res["input_ids"]) + assert ( + '[AVAILABLE_TOOLS][{"type": "function", "function": {"name": "add_numbers", "description": "Add two numbers together", "parameters": {"type": "object", "properties": {"a": {"type": "number", "description": "First number"}, "b": {"type": "number", "description": "Second number"}}, "required": ["a", "b"]}}}, {"type": "function", "function": {"name": "multiply_numbers", "description": "Multiply two numbers", "parameters": {"type": "object", "properties": {"x": {"type": "number", "description": "First number"}, "y": {"type": "number", "description": "Second number"}}, "required": ["x", "y"]}}}][/AVAILABLE_TOOLS]' + in decoded + ) + assert ( + '[TOOL_CALLS]add_numbers[CALL_ID]call12345[ARGS]{"a": 5, "b": 3}' in decoded + ) + assert "[TOOL_RESULTS]call12345[TOOL_CONTENT]8[/TOOL_RESULTS]" in decoded + assert ( + '[TOOL_CALLS]multiply_numbers[CALL_ID]call23456[ARGS]{"x": 8, "y": 2}' + in decoded + ) + assert "[TOOL_RESULTS]call23456[TOOL_CONTENT]16[/TOOL_RESULTS]" in decoded + assert ( + "The result is 16. I first added 5 and 3 to get 8, then multiplied 8 by 2 to get 16." + in decoded + ) + + # Test tool calling with system message + system_tool_calling = { + "tools": [ + { + "type": "function", + "function": { + "name": "search_database", + "description": "Search for information in database", + "parameters": { + "type": "object", + "properties": { + "query": {"type": "string", "description": "Search query"}, + }, + "required": ["query"], + }, + }, + }, + ], + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant with access to a database.", + }, + { + "role": "user", + "content": "Find information about Python programming", + }, + { + "role": "assistant", + "tool_calls": [ + { + "id": "search123", + "type": "function", + "function": { + "name": "search_database", + "arguments": {"query": "Python programming"}, + }, + } + ], + }, + { + "role": "tool", + "tool_call_id": "search123", + "name": "search_database", + "content": "Python is a high-level programming language known for its simplicity.", + }, + { + "role": "assistant", + "content": "Based on the database search, Python is a high-level programming language known for its simplicity and readability.", + }, + ], + } + + res = strategy.tokenize_prompt(system_tool_calling) + + # Validation + assert len(res["input_ids"]) > 0 + assert len(res["labels"]) == len(res["input_ids"]) + + decoded = magistral_tokenizer.decode(res["input_ids"]) + + assert ( + '[SYSTEM_PROMPT]You are a helpful assistant with access to a database.[/SYSTEM_PROMPT][AVAILABLE_TOOLS][{"type": "function", "function": {"name": "search_database", "description": "Search for information in database", "parameters": {"type": "object", "properties": {"query": {"type": "string", "description": "Search query"}}, "required": ["query"]}}}][/AVAILABLE_TOOLS]' + in decoded + ) + + # Test error handling - missing tool response + incomplete_tool_calling = { + "tools": [ + { + "type": "function", + "function": { + "name": "get_time", + "description": "Get current time", + "parameters": {"type": "object", "properties": {}}, + }, + }, + ], + "messages": [ + { + "role": "user", + "content": "What time is it?", + }, + { + "role": "assistant", + "tool_calls": [ + { + 
"id": "time12345", + "type": "function", + "function": { + "name": "get_time", + "arguments": {}, + }, + } + ], + }, + { + "role": "assistant", + "content": "The current time is 12:00 PM.", + }, + ], + } + + from mistral_common.exceptions import InvalidMessageStructureException + + try: + strategy.tokenize_prompt(incomplete_tool_calling) + except InvalidMessageStructureException as e: + assert "Not the same number of function calls and responses" in str(e) + + +@pytest.mark.skip(reason="TODO, fix for new HF wrapper call") +def test_magistral_tokenizer_call_method( + magistral_tokenizer: "HFMistralTokenizer", llama3_tokenizer: "PreTrainedTokenizer" +): + """Test the __call__ method behavior matches HuggingFace standards""" + from copy import deepcopy + + import numpy as np + import torch + + hf_tokenizer = deepcopy(llama3_tokenizer) + hf_tokenizer.pad_token = hf_tokenizer.eos_token + + test_text = "Hello, how are you?" + batch_texts = ["Hello world", "How are you?"] + + # Test single string with return_tensors=None + hf_result: dict[str, list[int]] = hf_tokenizer(test_text, return_tensors=None) + mistral_result: dict[str, list[int]] = magistral_tokenizer( + test_text, return_tensors=None + ) + + assert isinstance(mistral_result, dict) + assert set(mistral_result.keys()) == {"input_ids", "attention_mask"} + assert isinstance(mistral_result["input_ids"], type(hf_result["input_ids"])) # list + assert isinstance( + mistral_result["attention_mask"], type(hf_result["attention_mask"]) + ) + assert len(mistral_result["input_ids"]) == len(mistral_result["attention_mask"]) + assert np.all(mistral_result["attention_mask"]) + assert len(np.array(mistral_result["input_ids"]).shape) == 1 # 1D array + + # Test single string with return_tensors='pt' + hf_result_pt: dict[str, torch.Tensor] = hf_tokenizer(test_text, return_tensors="pt") + mistral_result_pt: dict[str, torch.Tensor] = magistral_tokenizer( + test_text, return_tensors="pt" + ) + + # Check structure and types + assert isinstance(mistral_result_pt["input_ids"], torch.Tensor) + assert isinstance(mistral_result_pt["attention_mask"], torch.Tensor) + + # Check shapes match (don't compare token dimension) + assert len(hf_result_pt["input_ids"].shape) == len( + mistral_result_pt["input_ids"].shape + ) + assert hf_result_pt["input_ids"].shape[0] == mistral_result_pt["input_ids"].shape[0] + assert ( + mistral_result_pt["attention_mask"].shape + == mistral_result_pt["input_ids"].shape + ) + assert torch.all(mistral_result_pt["attention_mask"] == 1) + + # Test batch input with padding + hf_batch: dict[str, torch.Tensor] = hf_tokenizer( + batch_texts, return_tensors="pt", padding=True + ) + mistral_batch: dict[str, torch.Tensor] = magistral_tokenizer( + batch_texts, return_tensors="pt", padding=True + ) + + # Check batch behavior + assert len(hf_batch["input_ids"].shape) == len(mistral_batch["input_ids"].shape) + assert hf_batch["input_ids"].shape[0] == mistral_batch["input_ids"].shape[0] + assert mistral_batch["attention_mask"].shape == mistral_batch["input_ids"].shape + assert torch.any( + mistral_batch["attention_mask"][0] == 0 + ) # padding in shorter sequence + assert torch.all( + mistral_batch["attention_mask"][1] == 1 + ) # no padding in longer sequence + + # Test numpy tensors + mistral_result_np: dict[str, np.ndarray] = magistral_tokenizer( + test_text, return_tensors="np" + ) + assert isinstance(mistral_result_np["input_ids"], np.ndarray) + assert isinstance(mistral_result_np["attention_mask"], np.ndarray) + + # Test consistency with encode() + 
encoded: list[int] = magistral_tokenizer.encode(test_text, add_special_tokens=True) + called: dict[str, torch.Tensor] = magistral_tokenizer( + test_text, return_tensors="pt" + ) + assert encoded == called["input_ids"][0].tolist() + + # Test Error handling + with pytest.raises(ValueError, match="Unsupported kwargs"): + magistral_tokenizer(test_text, unsupported_param=True) + + with pytest.raises( + ValueError, match="return_tensors='pt' or 'np' requires padding or truncation" + ): + magistral_tokenizer(batch_texts, return_tensors="pt") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/prompt_strategies/test_chat_templates_thinking.py b/tests/prompt_strategies/test_chat_templates_thinking.py index 9fe292317..e807111aa 100644 --- a/tests/prompt_strategies/test_chat_templates_thinking.py +++ b/tests/prompt_strategies/test_chat_templates_thinking.py @@ -2,8 +2,6 @@ Tests for splitting reasoning/thinking from content into separate field """ -import logging - import pytest from datasets import Dataset from transformers import AutoTokenizer @@ -13,11 +11,6 @@ from axolotl.prompt_strategies.chat_template import ( ) from axolotl.utils.dict import DictDefault -from tests.hf_offline_utils import enable_hf_offline - -logging.basicConfig(level=logging.DEBUG) -LOG = logging.getLogger("axolotl") - @pytest.fixture(name="messages_w_reasoning") def messages_w_reasoning_fixture(): @@ -64,7 +57,6 @@ def messages_w_reasoning_fixture(): @pytest.fixture(name="qwen3_tokenizer") -@enable_hf_offline def qwen3_tokenizer_fixture( download_qwen3_half_billion_model, ): # pylint: disable=unused-argument diff --git a/tests/prompt_strategies/test_dpo_chat_templates.py b/tests/prompt_strategies/test_dpo_chat_templates.py index b1802faa0..e5f30a6c4 100644 --- a/tests/prompt_strategies/test_dpo_chat_templates.py +++ b/tests/prompt_strategies/test_dpo_chat_templates.py @@ -103,7 +103,7 @@ class TestAssistantDPOChatTemplateLlama3: def test_llama3_defaults(self, llama3_tokenizer, assistant_dataset): # pylint: disable=duplicate-code - transform_fn = default( + transform_fn, _ = default( DictDefault( { "chat_template": "llama3", @@ -128,7 +128,7 @@ class TestAssistantDPOChatTemplateLlama3: def test_llama3_configured(self, llama3_tokenizer, custom_assistant_dataset): # pylint: disable=duplicate-code - transform_fn = default( + transform_fn, _ = default( DictDefault( { "chat_template": "llama3", @@ -169,7 +169,7 @@ class TestAssistantDPOChatTemplatePhi3: def test_phi3_defaults(self, phi3_tokenizer, assistant_dataset): # pylint: disable=duplicate-code - transform_fn = default( + transform_fn, _ = default( DictDefault( { "chat_template": "tokenizer_default", @@ -199,7 +199,7 @@ class TestAssistantDPOChatTemplateGemma: def test_gemma_defaults(self, gemma_tokenizer, assistant_dataset): # pylint: disable=duplicate-code - transform_fn = default( + transform_fn, _ = default( DictDefault( { "chat_template": "tokenizer_default", diff --git a/tests/prompt_strategies/test_dpo_chatml.py b/tests/prompt_strategies/test_dpo_chatml.py index b313a4b64..2c089067f 100644 --- a/tests/prompt_strategies/test_dpo_chatml.py +++ b/tests/prompt_strategies/test_dpo_chatml.py @@ -6,8 +6,9 @@ import unittest import pytest +from axolotl.loaders.tokenizer import load_tokenizer from axolotl.prompt_strategies.dpo import load as load_dpo -from axolotl.utils.data.rl import load_prepare_preference_datasets +from axolotl.utils.data.rl import prepare_preference_datasets from axolotl.utils.dict import DictDefault from tests.hf_offline_utils import 
enable_hf_offline @@ -55,7 +56,8 @@ class TestDPOChatml: # test that dpo.load works load_dpo("chatml", cfg) # now actually load the datasets with the strategy - train_ds, _ = load_prepare_preference_datasets(cfg) + tokenizer = load_tokenizer(cfg) + train_ds, _ = prepare_preference_datasets(cfg, tokenizer) assert train_ds[0]["prompt"].startswith("<|im_start|>") assert train_ds[0]["prompt"].endswith("<|im_start|>assistant\n") assert "chosen" in train_ds[0] diff --git a/tests/prompt_strategies/test_jinja_template_analyzer.py b/tests/prompt_strategies/test_jinja_template_analyzer.py index f666c738c..41b9a0203 100644 --- a/tests/prompt_strategies/test_jinja_template_analyzer.py +++ b/tests/prompt_strategies/test_jinja_template_analyzer.py @@ -2,14 +2,12 @@ tests for jinja_template_analyzer """ -import logging - import pytest from axolotl.prompt_strategies.jinja_template_analyzer import JinjaTemplateAnalyzer +from axolotl.utils.logging import get_logger -logging.basicConfig(level=logging.DEBUG) -LOG = logging.getLogger("axolotl") +LOG = get_logger(__name__, log_level="DEBUG") class TestJinjaTemplateAnalyzer: diff --git a/tests/test_chunked_xentropy.py b/tests/test_chunked_xentropy.py new file mode 100644 index 000000000..3e439f0a3 --- /dev/null +++ b/tests/test_chunked_xentropy.py @@ -0,0 +1,40 @@ +""" +test suite for chunked cross entropy +""" + +import pytest +import torch +from torch import nn + +from axolotl.monkeypatch.loss.chunked import get_causal_lm_loss + + +@pytest.fixture +def chunked_fixtures(): + model_dim = 512 + vocab_size = 1024 * 256 + seq_len = 2048 + batch_size = 1 + + lm_head = nn.Linear(model_dim, vocab_size) + hidden_state = torch.randn(batch_size, seq_len, model_dim) + labels = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len)) + return lm_head, hidden_state, labels, vocab_size + + +def test_chunked_forward(chunked_fixtures): # pylint: disable=redefined-outer-name + lm_head, hidden_state, labels, vocab_size = chunked_fixtures + lm_loss = get_causal_lm_loss() + + logits = lm_head(hidden_state) + + chunked_lm_loss = lm_loss(logits, labels) + + logits_flattened = logits.view(-1, vocab_size) + labels_flattened = labels.view(-1) + + loss = nn.functional.cross_entropy( + logits_flattened.float(), labels_flattened, reduction="mean" + ) + + assert torch.allclose(chunked_lm_loss, loss, atol=1e-2, rtol=1e-2) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 88d196ad1..719dfdc19 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,10 +1,9 @@ -""" -Test dataset loading under various conditions. 
-""" +"""Test dataset loading under various conditions.""" import shutil import tempfile from pathlib import Path +from typing import Any, Generator from unittest.mock import patch import pytest @@ -12,8 +11,9 @@ from datasets import Dataset from huggingface_hub import snapshot_download from transformers import PreTrainedTokenizer -from axolotl.utils.data import load_tokenized_prepared_datasets -from axolotl.utils.data.rl import load_prepare_preference_datasets +from axolotl.loaders.tokenizer import load_tokenizer +from axolotl.utils.data.rl import prepare_preference_datasets +from axolotl.utils.data.sft import _load_tokenized_prepared_datasets from axolotl.utils.dict import DictDefault from tests.constants import ( @@ -28,7 +28,9 @@ class TestDatasetPreparation: """Test a configured dataloader.""" @pytest.fixture - def tokenizer(self, tokenizer_huggyllama) -> PreTrainedTokenizer: + def tokenizer( + self, tokenizer_huggyllama + ) -> Generator[PreTrainedTokenizer, Any, Any]: tokenizer_huggyllama.add_special_tokens(SPECIAL_TOKENS) yield tokenizer_huggyllama @@ -63,7 +65,10 @@ class TestDatasetPreparation: } ) - dataset, _ = load_tokenized_prepared_datasets(tokenizer, cfg, prepared_path) + with patch( + "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_path) + ): + dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg) assert len(dataset) == 2000 assert "input_ids" in dataset.features @@ -107,7 +112,10 @@ class TestDatasetPreparation: } ) - dataset, _ = load_tokenized_prepared_datasets(tokenizer, cfg, prepared_path) + with patch( + "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_path) + ): + dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg) assert len(dataset) == 2000 assert "input_ids" in dataset.features @@ -133,10 +141,14 @@ class TestDatasetPreparation: "type": "alpaca", }, ], + "dataset_processes": 4, } ) - dataset, _ = load_tokenized_prepared_datasets(tokenizer, cfg, prepared_path) + with patch( + "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_path) + ): + dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg) assert len(dataset) == 1 assert "input_ids" in dataset.features @@ -145,7 +157,7 @@ class TestDatasetPreparation: @enable_hf_offline def test_load_from_dir_of_parquet(self, tokenizer, dataset_fixture): - """Usual use case. Verify a directory of parquet files can be loaded.""" + """Usual use case. 
Verify a directory of parquet files can be loaded.""" with tempfile.TemporaryDirectory() as tmp_dir: tmp_ds_dir = Path(tmp_dir) / "tmp_dataset" tmp_ds_dir.mkdir() @@ -168,10 +180,14 @@ class TestDatasetPreparation: "type": "alpaca", }, ], + "dataset_processes": 4, } ) - dataset, _ = load_tokenized_prepared_datasets(tokenizer, cfg, prepared_path) + with patch( + "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_path) + ): + dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg) assert len(dataset) == 1 assert "input_ids" in dataset.features @@ -203,10 +219,14 @@ class TestDatasetPreparation: "type": "alpaca", }, ], + "dataset_processes": 4, } ) - dataset, _ = load_tokenized_prepared_datasets(tokenizer, cfg, prepared_path) + with patch( + "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_path) + ): + dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg) assert len(dataset) == 1 assert "input_ids" in dataset.features @@ -232,10 +252,14 @@ class TestDatasetPreparation: "type": "alpaca", }, ], + "dataset_processes": 4, } ) - dataset, _ = load_tokenized_prepared_datasets(tokenizer, cfg, prepared_path) + with patch( + "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_path) + ): + dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg) assert len(dataset) == 1 assert "input_ids" in dataset.features @@ -261,10 +285,14 @@ class TestDatasetPreparation: "type": "alpaca", }, ], + "dataset_processes": 4, } ) - dataset, _ = load_tokenized_prepared_datasets(tokenizer, cfg, prepared_path) + with patch( + "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_path) + ): + dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg) assert len(dataset) == 1 assert "input_ids" in dataset.features @@ -286,10 +314,14 @@ class TestDatasetPreparation: } ) - train_dataset, _ = load_prepare_preference_datasets(cfg) + tokenizer = load_tokenizer(cfg) + train_dataset, _ = prepare_preference_datasets(cfg, tokenizer) assert len(train_dataset) == 1800 - assert "conversation" in train_dataset.features + assert "conversation" not in train_dataset.features + assert "chosen" in train_dataset.features + assert "rejected" in train_dataset.features + assert "prompt" in train_dataset.features @pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits") @enable_hf_offline @@ -315,7 +347,10 @@ class TestDatasetPreparation: } ) - dataset, _ = load_tokenized_prepared_datasets(tokenizer, cfg, prepared_path) + with patch( + "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_path) + ): + dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg) assert len(dataset) == 2000 assert "input_ids" in dataset.features @@ -335,20 +370,27 @@ class TestDatasetPreparation: "rl": "dpo", "chat_template": "llama3", "datasets": [ALPACA_MESSAGES_CONFIG_REVISION], + "dataset_processes": 4, } ) # pylint: disable=duplicate-code - with patch("axolotl.utils.data.rl.load_dataset_w_config") as mock_load_dataset: + with patch( + "axolotl.utils.data.rl.load_dataset_with_config" + ) as mock_load_dataset: # Set up the mock to return different values on successive calls mock_load_dataset.return_value = ( dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff ) - train_dataset, _ = load_prepare_preference_datasets(cfg) + tokenizer = load_tokenizer(cfg) + train_dataset, _ = prepare_preference_datasets(cfg, tokenizer) assert len(train_dataset) == 1800 - assert "conversation" in train_dataset.features + assert "conversation" not in 
train_dataset.features + assert "chosen" in train_dataset.features + assert "rejected" in train_dataset.features + assert "prompt" in train_dataset.features @enable_hf_offline @pytest.mark.skip("datasets bug with local datasets when offline") @@ -387,16 +429,18 @@ class TestDatasetPreparation: ) with patch( - "axolotl.utils.data.shared.load_dataset_w_config" + "axolotl.utils.data.shared.load_dataset_with_config" ) as mock_load_dataset: # Set up the mock to return different values on successive calls mock_load_dataset.return_value = ( dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff ) - dataset, _ = load_tokenized_prepared_datasets( - tokenizer, cfg, prepared_path - ) + with patch( + "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", + str(prepared_path), + ): + dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg) assert len(dataset) == 2000 assert "input_ids" in dataset.features @@ -428,10 +472,14 @@ class TestDatasetPreparation: "type": "alpaca", }, ], + "dataset_processes": 4, } ) - dataset, _ = load_tokenized_prepared_datasets(tokenizer, cfg, prepared_path) + with patch( + "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_path) + ): + dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg) assert len(dataset) == 2000 assert "input_ids" in dataset.features diff --git a/tests/test_exact_deduplication.py b/tests/test_exact_deduplication.py index 29672c9e5..d97aad8ea 100644 --- a/tests/test_exact_deduplication.py +++ b/tests/test_exact_deduplication.py @@ -5,7 +5,6 @@ Additionally, this test suite includes tests for functions that indirectly call `deduplicate_and_log_datasets` during the execution of the preprocess command. """ -import hashlib import unittest from unittest.mock import patch @@ -14,8 +13,7 @@ from datasets import Dataset from axolotl.loaders import load_processor, load_tokenizer from axolotl.utils.config import normalize_config, validate_config -from axolotl.utils.data import prepare_dataset -from axolotl.utils.data.rl import load_prepare_preference_datasets +from axolotl.utils.data import prepare_datasets, prepare_preference_datasets from axolotl.utils.data.utils import deduplicate_and_log_datasets from axolotl.utils.dict import DictDefault @@ -71,36 +69,14 @@ class TestDeduplicateIndividualFunctions(unittest.TestCase): self.expected_dataset = Dataset.from_dict(self.expected_data) def test_deduplication(self): - train_dataset, _, _ = deduplicate_and_log_datasets(train_dataset=self.dataset) - _, eval_dataset, _ = deduplicate_and_log_datasets(eval_dataset=self.dataset) + train_dataset, _ = deduplicate_and_log_datasets(dataset=self.dataset) + eval_dataset, _ = deduplicate_and_log_datasets( + dataset=self.dataset, dataset_name="eval" + ) verify_deduplication(train_dataset, self.expected_dataset, "train_dataset") verify_deduplication(eval_dataset, self.expected_dataset, "eval_dataset") - def test_datasets_are_none(self): - # Test when both datasets are None - train_dataset, eval_dataset, _ = deduplicate_and_log_datasets( - train_dataset=None, eval_dataset=None - ) - self.assertIsNone(train_dataset, "Expected train_dataset to be None") - self.assertIsNone(eval_dataset, "Expected eval_dataset to be None") - - def test_only_train_is_none(self): - # Test when only train_dataset is None - train_dataset, eval_dataset, _ = deduplicate_and_log_datasets( - train_dataset=None, eval_dataset=self.dataset - ) - self.assertIsNone(train_dataset, "Expected train_dataset to be None") - verify_deduplication(eval_dataset, self.expected_dataset, 
"eval_dataset") - - def test_only_eval_is_none(self): - # Test when only eval_dataset is None - train_dataset, eval_dataset, _ = deduplicate_and_log_datasets( - train_dataset=self.dataset, eval_dataset=None - ) - self.assertIsNone(eval_dataset, "Expected eval_dataset to be None") - verify_deduplication(train_dataset, self.expected_dataset, "train_dataset") - def test_exact_duplicates(self): # Test when datasets are exact duplicates duplicate_data = { @@ -115,8 +91,10 @@ class TestDeduplicateIndividualFunctions(unittest.TestCase): expected_dataset = Dataset.from_dict(expected_data) # Run deduplication - train_dataset, _, _ = deduplicate_and_log_datasets(train_dataset=dataset) - _, eval_dataset, _ = deduplicate_and_log_datasets(eval_dataset=dataset) + train_dataset, _ = deduplicate_and_log_datasets(dataset=dataset) + eval_dataset, _ = deduplicate_and_log_datasets( + dataset=dataset, dataset_name="eval" + ) verify_deduplication(train_dataset, expected_dataset, "train_dataset") verify_deduplication(eval_dataset, expected_dataset, "eval_dataset") @@ -139,8 +117,10 @@ class TestDeduplicateIndividualFunctions(unittest.TestCase): expected_dataset = Dataset.from_dict(expected_data) # Run deduplication - train_dataset, _, _ = deduplicate_and_log_datasets(train_dataset=dataset) - _, eval_dataset, _ = deduplicate_and_log_datasets(eval_dataset=dataset) + train_dataset, _ = deduplicate_and_log_datasets(dataset=dataset) + eval_dataset, _ = deduplicate_and_log_datasets( + dataset=dataset, dataset_name="eval" + ) verify_deduplication(train_dataset, expected_dataset, "train_dataset") verify_deduplication(eval_dataset, expected_dataset, "eval_dataset") @@ -169,8 +149,8 @@ class TestDeduplicateIndividualFunctions(unittest.TestCase): expected_dataset_eval = Dataset.from_dict(expected_data_eval) # Run deduplication - train_dataset, eval_dataset, _ = deduplicate_and_log_datasets( - train_dataset=dataset, eval_dataset=dataset + train_dataset, eval_dataset = deduplicate_and_log_datasets( + dataset=dataset, other_dataset=dataset ) verify_deduplication(train_dataset, expected_dataset_train, "train_dataset") @@ -206,8 +186,8 @@ class TestDeduplicateIndividualFunctions(unittest.TestCase): expected_dataset_eval = Dataset.from_dict(expected_data_eval) # Run deduplication - train_dataset, eval_dataset, _ = deduplicate_and_log_datasets( - train_dataset=dataset_train, eval_dataset=dataset_eval + train_dataset, eval_dataset = deduplicate_and_log_datasets( + dataset=dataset_train, other_dataset=dataset_eval ) verify_deduplication(train_dataset, expected_dataset_train, "train_dataset") @@ -230,6 +210,7 @@ class TestDeduplicateRLDataset: ALPACA_MESSAGES_CONFIG_REVISION, ALPACA_MESSAGES_CONFIG_REVISION, ], + "dataset_processes": 4, } ) yield fixture @@ -245,7 +226,9 @@ class TestDeduplicateRLDataset: # pylint: disable=duplicate-code with ( - patch("axolotl.utils.data.rl.load_dataset_w_config") as mock_load_dataset, + patch( + "axolotl.utils.data.rl.load_dataset_with_config" + ) as mock_load_dataset, patch("axolotl.loaders.load_tokenizer") as mock_load_tokenizer, ): # Set up the mock to return different values on successive calls @@ -255,7 +238,8 @@ class TestDeduplicateRLDataset: ] mock_load_tokenizer.return_value = tokenizer_huggyllama - train_dataset, _ = load_prepare_preference_datasets(cfg) + tokenizer = load_tokenizer(cfg) + train_dataset, _ = prepare_preference_datasets(cfg, tokenizer) # Verify that the dataset has been deduplicated assert len(train_dataset) == 1800, "Dataset was not properly deduplicated" @@ -269,7 
+253,9 @@ class TestDeduplicateRLDataset: ): # pylint: disable=duplicate-code with ( - patch("axolotl.utils.data.rl.load_dataset_w_config") as mock_load_dataset, + patch( + "axolotl.utils.data.rl.load_dataset_with_config" + ) as mock_load_dataset, patch("axolotl.loaders.load_tokenizer") as mock_load_tokenizer, ): # Set up the mock to return different values on successive calls @@ -279,9 +265,10 @@ class TestDeduplicateRLDataset: ] mock_load_tokenizer.return_value = tokenizer_huggyllama - cfg.dataset_exact_deduplication = False # Load the dataset without deduplication - train_dataset, _ = load_prepare_preference_datasets(cfg) + cfg.dataset_exact_deduplication = False + tokenizer = load_tokenizer(cfg) + train_dataset, _ = prepare_preference_datasets(cfg, tokenizer) # Verify that the dataset retains duplicates assert ( @@ -335,7 +322,7 @@ class TestDeduplicateNonRL(unittest.TestCase): ) # Prepare dataset using the prepare_dataset function - train_dataset, _, _, _ = prepare_dataset( + train_dataset, _, _, _ = prepare_datasets( self.cfg_1, tokenizer, processor=processor, @@ -362,7 +349,7 @@ class TestDeduplicateNonRL(unittest.TestCase): ) # Prepare dataset using the prepare_dataset function - _, eval_dataset, _, _ = prepare_dataset( + _, eval_dataset, _, _ = prepare_datasets( self.cfg_1, tokenizer, processor=processor, @@ -389,7 +376,7 @@ class TestDeduplicateNonRL(unittest.TestCase): ) # Prepare dataset using the prepare_dataset function - train_dataset, eval_dataset, _, _ = prepare_dataset( + train_dataset, eval_dataset, _, _ = prepare_datasets( self.cfg_1, tokenizer, processor=processor, @@ -428,41 +415,8 @@ class TestWrongCollisions(unittest.TestCase): self.eval_dataset = Dataset.from_dict(self.eval_data) self.dataset = Dataset.from_dict(self.dataset_data) - @patch( - "axolotl.utils.data.utils.sha256", - side_effect=lambda x: ( - hashlib.sha256("forced_collision_hash".encode("utf-8")).hexdigest() - if "sample 5" in x - else hashlib.sha256(x.encode("utf-8")).hexdigest() - ), - ) - def test_deduplication_wrong_collision_train_eval(self, _mock_sha256): - dedup_train, dedup_eval, _ = deduplicate_and_log_datasets( - train_dataset=self.train_dataset, eval_dataset=self.eval_dataset - ) - self.assertEqual( - len(dedup_train), - 2, - "train dataset should not deduplicate rows with forced hash collisions but different labels.", - ) - self.assertEqual( - len(dedup_eval), - 2, - "Eval dataset should not deduplicate rows with forced hash collisions but different labels.", - ) - self.assertEqual( - len(dedup_eval), - len(self.eval_dataset), - "The output eval dataset should have the same number of rows as the input eval dataset.", - ) - self.assertEqual( - str(dedup_eval), - str(self.eval_dataset), - "The string representation of the output eval dataset should be identical to the input eval dataset.", - ) - def test_deduplication_dataset_only(self): - _, _, dedup_dataset = deduplicate_and_log_datasets(dataset=self.dataset) + dedup_dataset, _ = deduplicate_and_log_datasets(dataset=self.dataset) self.assertEqual( len(dedup_dataset), 3, "Dataset should have all original values" ) diff --git a/tests/test_loaders.py b/tests/test_loaders.py index 7313a8267..d45f41998 100644 --- a/tests/test_loaders.py +++ b/tests/test_loaders.py @@ -9,6 +9,7 @@ from transformers.utils.import_utils import is_torch_mps_available from axolotl.loaders import ModelLoader from axolotl.utils.dict import DictDefault +from axolotl.utils.distributed import _get_parallel_config_kwargs class TestModelsUtils: @@ -171,3 +172,42 @@ class 
TestModelsUtils: message_property_mappings={"content": "different_content"}, ) assert "Conflicting message content fields" in str(exc_info.value) + + @pytest.mark.parametrize( + "world_size, tensor_parallel_size, context_parallel_size, dp_shard_size, dp_replicate_size, is_fsdp, expected", + [ + (16, 2, 2, 2, 2, True, (2, 2, 2, 2)), + (16, 1, 1, None, None, True, (0, 0, 16, 1)), + (16, 2, 2, 2, None, True, (2, 2, 2, 2)), + (16, 2, 2, None, 2, True, (2, 2, 2, 2)), + (16, 1, 1, None, 2, True, (0, 0, 8, 2)), + (2, 1, 1, None, None, True, (0, 0, 2, 1)), + ], + ) + def test_get_parallel_config_kwargs( + self, + world_size, + tensor_parallel_size, + context_parallel_size, + dp_shard_size, + dp_replicate_size, + is_fsdp, + expected, + ): + res = _get_parallel_config_kwargs( # pylint: disable=protected-access + world_size, + tensor_parallel_size, + context_parallel_size, + dp_shard_size, + dp_replicate_size, + is_fsdp, + ) + + if expected[0] > 1: + assert res["tp_size"] == expected[0] + if expected[1] > 1: + assert res["cp_size"] == expected[1] + if expected[2] > 1: + assert res["dp_shard_size"] == expected[2] + if expected[3] > 1: + assert res["dp_replicate_size"] == expected[3] diff --git a/tests/test_normalize_config.py b/tests/test_normalize_config.py index ea98bf97d..658e06fcb 100644 --- a/tests/test_normalize_config.py +++ b/tests/test_normalize_config.py @@ -5,7 +5,11 @@ Test classes for checking functionality of the cfg normalization import unittest from unittest.mock import patch -from axolotl.utils.config import normalize_cfg_datasets, normalize_config +from axolotl.utils.config import ( + normalize_cfg_datasets, + normalize_config, + validate_config, +) from axolotl.utils.dict import DictDefault @@ -23,6 +27,13 @@ class NormalizeConfigTestCase(unittest.TestCase): "num_epochs": 1, "micro_batch_size": 1, "gradient_accumulation_steps": 1, + "datasets": [ + { + "path": "mhenrichsen/alpaca_2k_test", + "type": "alpaca", + }, + ], + "learning_rate": 0.0001, } ) @@ -90,3 +101,103 @@ class NormalizeConfigTestCase(unittest.TestCase): self.assertTrue(cfg.bf16) self.assertFalse(cfg.fp16) + + def test_migrate_fsdp_config(self): + """Test basic FSDP config migration with and without fsdp_version""" + cfg_with_version = self._get_base_cfg() | DictDefault( + { + "fsdp_config": { + "fsdp_version": 2, + "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", + "fsdp_offload_params": False, + "fsdp_cpu_ram_efficient_loading": True, + "regular_param": "value", + } + } + ) + + cfg_with_version = validate_config(cfg_with_version) + + self.assertEqual(cfg_with_version.fsdp_version, 2) + self.assertEqual( + cfg_with_version.fsdp_config.auto_wrap_policy, "TRANSFORMER_BASED_WRAP" + ) + self.assertEqual(cfg_with_version.fsdp_config.offload_params, False) + self.assertEqual(cfg_with_version.fsdp_config.cpu_ram_efficient_loading, True) + self.assertEqual(cfg_with_version.fsdp_config.regular_param, "value") + + self.assertNotIn("fsdp_auto_wrap_policy", cfg_with_version.fsdp_config) + self.assertNotIn("fsdp_offload_params", cfg_with_version.fsdp_config) + self.assertNotIn("fsdp_cpu_ram_efficient_loading", cfg_with_version.fsdp_config) + self.assertNotIn("fsdp_version", cfg_with_version.fsdp_config) + self.assertNotIn("version", cfg_with_version.fsdp_config) + + cfg_without_version = self._get_base_cfg() | DictDefault( + { + "fsdp_config": { + "fsdp_auto_wrap_policy": "SIZE_BASED_WRAP", + "fsdp_offload_params": True, + "regular_param": "value", + } + } + ) + + cfg_without_version = validate_config(cfg_without_version) + + 
self.assertNotIn("fsdp_version", cfg_without_version) + self.assertEqual( + cfg_without_version.fsdp_config.auto_wrap_policy, "SIZE_BASED_WRAP" + ) + self.assertEqual(cfg_without_version.fsdp_config.offload_params, True) + self.assertEqual(cfg_without_version.fsdp_config.regular_param, "value") + + self.assertNotIn("fsdp_auto_wrap_policy", cfg_without_version.fsdp_config) + self.assertNotIn("fsdp_offload_params", cfg_without_version.fsdp_config) + + def test_migrate_fsdp_config_no_fsdp_config(self): + """Test that function doesn't crash when no fsdp_config is present""" + cfg = self._get_base_cfg() + + cfg = validate_config(cfg) + + self.assertNotIn("fsdp_config", cfg) + self.assertNotIn("fsdp_version", cfg) + + def test_migrate_fsdp_config_empty_fsdp_config(self): + """Test migration with empty fsdp_config""" + cfg = self._get_base_cfg() | DictDefault({"fsdp_config": {}}) + + cfg = validate_config(cfg) + + self.assertNotIn("fsdp_version", cfg) + self.assertEqual(cfg.fsdp_config, {}) + + def test_migrate_fsdp_config_mixed_keys(self): + """Test migration with a mix of fsdp_ and non-fsdp_ keys""" + cfg = self._get_base_cfg() | DictDefault( + { + "fsdp_config": { + "fsdp_version": 1, + "fsdp_state_dict_type": "FULL_STATE_DICT", + "mixed_precision_policy": "fp16", + "activation_checkpointing": True, + "fsdp_reshard_after_forward": False, + } + } + ) + + cfg = validate_config(cfg) + + self.assertEqual(cfg.fsdp_version, 1) + self.assertEqual(cfg.fsdp_config.state_dict_type, "FULL_STATE_DICT") + self.assertEqual(cfg.fsdp_config.reshard_after_forward, False) + self.assertEqual(cfg.fsdp_config.mixed_precision_policy, "fp16") + self.assertEqual(cfg.fsdp_config.activation_checkpointing, True) + + # Check original fsdp_ keys are removed + self.assertNotIn("fsdp_version", cfg.fsdp_config) + self.assertNotIn("fsdp_state_dict_type", cfg.fsdp_config) + self.assertNotIn("fsdp_reshard_after_forward", cfg.fsdp_config) + + # Ensure no duplicate version key + self.assertNotIn("version", cfg.fsdp_config) diff --git a/tests/test_packed_batch_sampler.py b/tests/test_packed_batch_sampler.py index 2b03c62f8..7cb645db7 100644 --- a/tests/test_packed_batch_sampler.py +++ b/tests/test_packed_batch_sampler.py @@ -70,7 +70,7 @@ class TestBatchedSamplerPacking: ) train_dataset = concatenate_datasets([dataset_wrapper]) - train_dataset = drop_long_seq_in_dataset(train_dataset, cfg) + train_dataset = drop_long_seq_in_dataset(train_dataset, cfg.sequence_len, cfg) lengths = get_dataset_lengths(train_dataset) batch_sampler = MultipackBatchSampler( @@ -81,6 +81,7 @@ class TestBatchedSamplerPacking: group_size=100000, bin_size=200, sequential=sequential, + drop_last=False, ) loader = DataLoader( diff --git a/tests/test_packed_dataset.py b/tests/test_packed_dataset.py index 45fc75282..699d5e6cc 100644 --- a/tests/test_packed_dataset.py +++ b/tests/test_packed_dataset.py @@ -6,10 +6,16 @@ from pathlib import Path from datasets import Dataset, load_dataset from transformers import AutoTokenizer +from axolotl.cli.args import TrainerCliArgs +from axolotl.common.datasets import load_datasets from axolotl.datasets import ConstantLengthDataset, TokenizedPromptDataset from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy from axolotl.prompters import AlpacaPrompter +from axolotl.train import setup_model_and_trainer +from axolotl.utils.config import normalize_config, validate_config +from axolotl.utils.dict import DictDefault +from tests.e2e.utils import with_temp_dir from tests.hf_offline_utils import enable_hf_offline @@ 
-67,6 +73,86 @@ class TestPacking(unittest.TestCase): assert example["position_ids"][next_bos_index] == 0 assert example["position_ids"][next_bos_index + 1] == 1 + @with_temp_dir + def test_lora_packing(self, temp_dir): + # pylint: disable=duplicate-code + cfg = DictDefault( + { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "tokenizer_type": "AutoTokenizer", + "sequence_len": 1024, + "sample_packing": True, + "multipack_real_batches": False, + "eval_sample_packing": True, + "adapter": "lora", + "lora_r": 32, + "lora_alpha": 64, + "lora_dropout": 0.05, + "lora_target_linear": True, + "val_set_size": 0.2, + "special_tokens": { + "pad_token": "<|endoftext|>", + }, + "datasets": [ + { + "path": "mhenrichsen/alpaca_2k_test", + "type": "alpaca", + }, + ], + "dataset_processes": 4, + "num_epochs": 1, + "max_steps": 20, + "save_steps": 10, + "micro_batch_size": 8, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "fp16": False, + "bf16": False, + } + ) + + cfg = validate_config(cfg) + normalize_config(cfg) + cli_args = TrainerCliArgs() + dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args) + + ( + trainer, + _, + _, + _, + _, + ) = setup_model_and_trainer(cfg, dataset_meta) + + sampler = trainer._get_eval_sampler( # pylint: disable=protected-access + trainer.eval_dataset + ) + assert "MultipackBatchSampler" in sampler.__class__.__name__ + assert ( + "V2BatchSamplerDataCollatorForSeq2Seq" + in trainer.eval_data_collator.__class__.__name__ + ) + dataloader = trainer.get_eval_dataloader(trainer.eval_dataset) + dataloader_iter = iter(dataloader) + batch = next(dataloader_iter) + assert batch["input_ids"].shape == (1, 8192) + + sampler = trainer._get_train_sampler( # pylint: disable=protected-access + trainer.train_dataset + ) + assert "MultipackBatchSampler" in sampler.__class__.__name__ + assert ( + "V2BatchSamplerDataCollatorForSeq2Seq" + in trainer.train_data_collator.__class__.__name__ + ) + dataloader = trainer.get_train_dataloader() + dataloader_iter = iter(dataloader) + batch = next(dataloader_iter) + assert batch["input_ids"].shape == (1, 8192) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_prompt_tokenizers.py b/tests/test_prompt_tokenizers.py index 3f16bc917..5e5de4ff8 100644 --- a/tests/test_prompt_tokenizers.py +++ b/tests/test_prompt_tokenizers.py @@ -1,7 +1,6 @@ """Module for testing prompt tokenizers.""" import json -import logging from pathlib import Path from axolotl.prompt_strategies.alpaca_chat import NoSystemPrompter @@ -20,8 +19,6 @@ from axolotl.utils.dict import DictDefault from tests.hf_offline_utils import enable_hf_offline -LOG = logging.getLogger("axolotl") - test_data = { "multi_turn_sys": { "conversations": [ diff --git a/tests/test_train.py b/tests/test_train.py new file mode 100644 index 000000000..2c29b58ee --- /dev/null +++ b/tests/test_train.py @@ -0,0 +1,39 @@ +"""Test for batch size calculation for multi-gpu training.""" + +import pytest + +from axolotl.utils.config import normalize_config, validate_config +from axolotl.utils.dict import DictDefault + + +@pytest.fixture(name="train_base_cfg") +def fixture_train_base_cfg(min_base_cfg): + return ( + DictDefault( + micro_batch_size=2, + gradient_accumulation_steps=4, + sequence_len=2048, + sample_packing=True, + num_epochs=1, + ) + | min_base_cfg + ) + + +class TestTrain: + """test class for train related tests""" + + @pytest.mark.parametrize( + "world_size, expected_batch_size", + [ + 
(1, 8), + (4, 32), + ], + ) + def test_batch_size_ddp( + self, train_base_cfg, monkeypatch, world_size, expected_batch_size + ): + monkeypatch.setenv("WORLD_SIZE", str(world_size)) + cfg = validate_config(train_base_cfg) + normalize_config(cfg) + assert cfg.batch_size == expected_batch_size diff --git a/tests/test_validation_dataset.py b/tests/test_validation_dataset.py index ba142f3bf..1a4c97314 100644 --- a/tests/test_validation_dataset.py +++ b/tests/test_validation_dataset.py @@ -73,7 +73,7 @@ class TestValidationCheckDatasetConfig(BaseValidation): "compute_capability": "8.0", }, env_capabilities={ - "torch_version": "2.5.1", + "torch_version": "2.6.0", }, ) @@ -128,7 +128,7 @@ class TestValidationCheckDatasetConfig(BaseValidation): "compute_capability": "8.0", }, env_capabilities={ - "torch_version": "2.5.1", + "torch_version": "2.6.0", }, ) @@ -184,7 +184,7 @@ class TestValidationCheckDatasetConfig(BaseValidation): "compute_capability": "8.0", }, env_capabilities={ - "torch_version": "2.5.1", + "torch_version": "2.6.0", }, ) @@ -241,7 +241,7 @@ class TestValidationCheckDatasetConfig(BaseValidation): "compute_capability": "8.0", }, env_capabilities={ - "torch_version": "2.5.1", + "torch_version": "2.6.0", }, ) diff --git a/tests/utils/schemas/validation/test_activation_offloading.py b/tests/utils/schemas/validation/test_activation_offloading.py new file mode 100644 index 000000000..433133a80 --- /dev/null +++ b/tests/utils/schemas/validation/test_activation_offloading.py @@ -0,0 +1,35 @@ +"""Test for config validation for activation offloading.""" + +from axolotl.utils.config import validate_config +from axolotl.utils.dict import DictDefault + + +class TestActivationOffloading: + """ + Test cases for activation offloading schema validation + """ + + def test_gc_converts_offload_wo_lora(self, min_base_cfg): + cfg = ( + DictDefault( + gradient_checkpointing="offload", + ) + | min_base_cfg + ) + + cfg = validate_config(cfg) + assert cfg.gradient_checkpointing is True + assert cfg.activation_offloading is True + + def test_ac_offload_impl_noop_wo_adapter(self, min_base_cfg): + cfg = ( + DictDefault( + gradient_checkpointing=True, + activation_offloading=True, + ) + | min_base_cfg + ) + + cfg = validate_config(cfg) + assert cfg.gradient_checkpointing is True + assert cfg.activation_offloading is True diff --git a/tests/utils/schemas/validation/test_default_values.py b/tests/utils/schemas/validation/test_default_values.py new file mode 100644 index 000000000..332dfe77f --- /dev/null +++ b/tests/utils/schemas/validation/test_default_values.py @@ -0,0 +1,21 @@ +"""Tests for default values for configurations""" + +from axolotl.utils.config import validate_config +from axolotl.utils.dict import DictDefault + + +class TestDefaultConfigValues: + """Tests for default values for configurations""" + + def test_pad_to_sequence_len(self, min_base_cfg): + """Tests that sample packing automatically sets pad_to_sequence_len to True""" + cfg = ( + DictDefault( + sample_packing=True, + ) + | min_base_cfg + ) + + cfg = validate_config(cfg) + + assert cfg.pad_to_sequence_len is True diff --git a/tests/utils/schemas/validation/test_fsdp.py b/tests/utils/schemas/validation/test_fsdp.py new file mode 100644 index 000000000..5b461a113 --- /dev/null +++ b/tests/utils/schemas/validation/test_fsdp.py @@ -0,0 +1,113 @@ +""" +tests for pydantic fsdp validation +""" + +# pylint: disable=too-many-boolean-expressions +import pytest + +from axolotl.utils.config import validate_config +from axolotl.utils.dict import 
DictDefault + + +class TestFSDPValidation: + """ + test class for pydantic fsdp validation + """ + + def test_fsdp_version_in_fsdp_config(self, min_base_cfg): + cfg = min_base_cfg | DictDefault( + fsdp_config={ + "fsdp_version": 2, + }, + ) + cfg = validate_config( + cfg, + ) + assert cfg.fsdp_version == 2 + assert cfg.fsdp_config.fsdp_version is None + + def test_fsdp_offload_w_8bit_optim(self, min_base_cfg): + cfg = min_base_cfg | DictDefault( + fsdp_config={ + "offload_params": True, + }, + optimizer="adamw_8bit", + fsdp_version=1, + ) + with pytest.raises( + ValueError, match="FSDP Offload not compatible with adamw_8bit" + ): + validate_config(cfg) + + def test_fsdp2_w_8bit_optim(self, min_base_cfg): + cfg = min_base_cfg | DictDefault( + fsdp_config={ + "offload_params": True, + }, + optimizer="adamw_8bit", + fsdp_version=2, + ) + with pytest.raises( + ValueError, + match="FSDP2 not compatible with adamw_8bit, use `adamw_torch_8bit` instead", + ): + validate_config(cfg) + + def test_fsdp2_w_cpu_ram_efficient_loading(self, min_base_cfg): + cfg = min_base_cfg | DictDefault( + load_in_8bit=True, + adapter="lora", + fsdp_config={ + "cpu_ram_efficient_loading": True, + }, + fsdp_version=2, + ) + with pytest.raises( + ValueError, + match="FSDP2 does not support load_in_8bit or load_in_4bit with cpu_ram_efficient_loading.", + ): + validate_config(cfg) + + def test_fsdp_prefixes_removed(self, min_base_cfg): + cfg = min_base_cfg | DictDefault( + fsdp_config={ + "fsdp_version": 2, + "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", + "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer", + "fsdp_reshard_after_forward": True, + } + ) + cfg = validate_config(cfg) + assert cfg.fsdp_version == 2 + assert cfg.fsdp_config.fsdp_version is None + for keys in cfg.fsdp_config.keys(): + assert not keys.startswith("fsdp_") + assert cfg.fsdp_config.auto_wrap_policy == "TRANSFORMER_BASED_WRAP" + assert cfg.fsdp_config.transformer_layer_cls_to_wrap == "LlamaDecoderLayer" + assert cfg.fsdp_config.reshard_after_forward is True + + @pytest.mark.parametrize( + "rl", + [ + "dpo", + "kto", + "orpo", + "ipo", + ], + ) + def test_fsdp2_dpo(self, min_base_cfg, rl): + cfg = min_base_cfg | DictDefault( + fsdp_version=2, + fsdp_config={ + "reshard_after_forward": True, + }, + rl=rl, + load_in_8bit=True, + adapter="lora", + remove_unused_columns=False, + ) + with pytest.raises( + ValueError, + match="FSDP2 does not support load_in_8bit or load_in_4bit with ", + ): + validate_config(cfg) diff --git a/tests/utils/test_import_helper.py b/tests/utils/test_import_helper.py new file mode 100644 index 000000000..e1ab8bec5 --- /dev/null +++ b/tests/utils/test_import_helper.py @@ -0,0 +1,37 @@ +""" +test cases for axolotl.utils.import_helper +""" + +import pytest + +from axolotl.utils.import_helper import get_cls_from_module_str + + +def test_get_cls_from_module_str(): + cls = get_cls_from_module_str("axolotl.core.trainers.base.AxolotlTrainer") + assert cls.__name__ == "AxolotlTrainer" + + +def test_get_cls_from_module_str_empty_string(): + with pytest.raises(ValueError, match="module_str must be a non-empty string"): + get_cls_from_module_str("") + + +def test_get_cls_from_module_str_whitespace_only(): + with pytest.raises(ValueError, match="module_str must be a non-empty string"): + get_cls_from_module_str(" ") + + +def test_get_cls_from_module_str_invalid_format(): + with pytest.raises(ValueError, match="Invalid module string format"): + get_cls_from_module_str("single_part") + + +def 
test_get_cls_from_module_str_nonexistent_module(): + with pytest.raises(ImportError, match="Failed to import module"): + get_cls_from_module_str("nonexistent.module.Class") + + +def test_get_cls_from_module_str_nonexistent_class(): + with pytest.raises(AttributeError, match="Class 'NonExistentClass' not found"): + get_cls_from_module_str("axolotl.core.trainers.base.NonExistentClass")