Compare commits


1 Commit

Author: Wing Lian
SHA1: 1a22d16842
Message: handle empty offset for quant state
Date: 2025-05-01 13:01:00 -04:00
141 changed files with 3497 additions and 6953 deletions

View File

@@ -31,11 +31,6 @@ jobs:
python_version: "3.11"
pytorch: 2.7.0
axolotl_extras:
- cuda: 128
cuda_version: 12.8.1
python_version: "3.11"
pytorch: 2.7.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
@@ -99,11 +94,6 @@ jobs:
python_version: "3.11"
pytorch: 2.7.0
axolotl_extras:
- cuda: 128
cuda_version: 12.8.1
python_version: "3.11"
pytorch: 2.7.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout

View File

@@ -3,7 +3,7 @@ name: docker-multigpu-tests-biweekly
on:
pull_request:
paths:
- 'tests/e2e/multigpu/**.py'
- 'tests/e2e/multigpu/*.py'
- 'requirements.txt'
- 'setup.py'
- 'pyproject.toml'

View File

@@ -4,12 +4,6 @@ on:
pull_request:
types: [opened, synchronize, reopened]
# Run the workflow only when one of these files changes
paths:
- '**/*.md' # any Markdown file
- '**/*.qmd' # any Quarto file
- '_quarto.yaml'
permissions:
checks: write
contents: write

View File

@@ -18,96 +18,9 @@ jobs:
env:
SKIP: no-commit-to-branch
preload-cache:
name: Preload HF cache
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python_version: ["3.11"]
pytorch_version: ["2.6.0"]
timeout-minutes: 20
env:
AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
steps:
- name: Check out repository code
uses: actions/checkout@v4
- name: Restore HF cache
id: hf-cache-restore
uses: actions/cache/restore@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ runner.os }}-hf-hub-cache-v2
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python_version }}
cache: 'pip' # caching pip dependencies
- name: upgrade pip
run: |
pip3 install --upgrade pip
pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
- name: Install PyTorch
run: |
pip3 install torch==${{ matrix.pytorch_version }}
- name: Install dependencies
run: |
pip3 show torch
pip3 install --no-build-isolation -U -e .
python scripts/unsloth_install.py | sh
python scripts/cutcrossentropy_install.py | sh
pip3 install -r requirements-dev.txt -r requirements-tests.txt
- name: Make sure PyTorch version wasn't clobbered
run: |
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
- name: Ensure axolotl CLI was installed
run: |
axolotl --help
- name: Pre-Download dataset fixture
run: |
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
- name: Run tests
run: |
pytest -v tests/conftest.py
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
files: ./coverage.xml
flags: unittests,pytorch-${{ matrix.pytorch_version }}
fail_ci_if_error: false
- name: cleanup pip cache
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
- name: Save HF cache
id: hf-cache
uses: actions/cache/save@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
pytest:
name: PyTest
runs-on: ubuntu-latest
needs: [preload-cache]
strategy:
fail-fast: false
max-parallel: 2

View File

@@ -44,104 +44,12 @@ jobs:
env:
SKIP: no-commit-to-branch
# preload-cache:
# name: Preload HF cache
# runs-on: ubuntu-latest
# strategy:
# fail-fast: false
# matrix:
# python_version: ["3.11"]
# pytorch_version: ["2.6.0"]
# timeout-minutes: 20
#
# env:
# AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
#
# steps:
# - name: Check out repository code
# uses: actions/checkout@v4
#
# - name: Restore HF cache
# id: hf-cache-restore
# uses: actions/cache/restore@v4
# with:
# path: |
# /home/runner/.cache/huggingface/hub/datasets--*
# /home/runner/.cache/huggingface/hub/models--*
# key: ${{ runner.os }}-hf-hub-cache-v2
#
# - name: Restore Cache from S3
# id: hf-cache-restore-s3
# run: |
# mkdir -p /home/runner/.cache/huggingface/hub
# curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
#
# - name: Setup Python
# uses: actions/setup-python@v5
# with:
# python-version: ${{ matrix.python_version }}
# cache: 'pip' # caching pip dependencies
#
# - name: upgrade pip
# run: |
# pip3 install --upgrade pip
# pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
#
# - name: Install PyTorch
# run: |
# pip3 install torch==${{ matrix.pytorch_version }}
#
# - name: Install dependencies
# run: |
# pip3 show torch
# pip3 install --no-build-isolation -U -e .
# python scripts/unsloth_install.py | sh
# python scripts/cutcrossentropy_install.py | sh
# pip3 install -r requirements-dev.txt -r requirements-tests.txt
#
# - name: Make sure PyTorch version wasn't clobbered
# run: |
# python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
#
# - name: Ensure axolotl CLI was installed
# run: |
# axolotl --help
#
# - name: Pre-Download dataset fixture
# run: |
# huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
#
# - name: Run tests
# run: |
# pytest -v tests/conftest.py
#
# - name: Upload coverage to Codecov
# uses: codecov/codecov-action@v5
# with:
# token: ${{ secrets.CODECOV_TOKEN }}
# files: ./coverage.xml
# flags: unittests,pytorch-${{ matrix.pytorch_version }}
# fail_ci_if_error: false
#
# - name: cleanup pip cache
# run: |
# find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
#
# - name: Save HF cache
# id: hf-cache
# uses: actions/cache/save@v4
# with:
# path: |
# /home/runner/.cache/huggingface/hub/datasets--*
# /home/runner/.cache/huggingface/hub/models--*
# key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
pytest:
name: PyTest
runs-on: ubuntu-latest
# needs: [preload-cache]
strategy:
fail-fast: false
max-parallel: 2
matrix:
python_version: ["3.11"]
pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
@@ -151,20 +59,14 @@ jobs:
- name: Check out repository code
uses: actions/checkout@v4
# - name: Restore HF cache
# id: hf-cache-restore
# uses: actions/cache/restore@v4
# with:
# path: |
# /home/runner/.cache/huggingface/hub/datasets--*
# /home/runner/.cache/huggingface/hub/models--*
# key: ${{ runner.os }}-hf-hub-cache-v2
- name: Restore Cache from S3
id: hf-cache-restore-s3
run: |
mkdir -p /home/runner/.cache/huggingface/hub
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
- name: Restore HF cache
id: hf-cache-restore
uses: actions/cache/restore@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ runner.os }}-hf-hub-cache-v2
- name: Setup Python
uses: actions/setup-python@v5
@@ -219,12 +121,21 @@ jobs:
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
- name: Save HF cache
id: hf-cache
uses: actions/cache/save@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
pytest-sdist:
name: PyTest from Source Dist
runs-on: ubuntu-latest
# needs: [preload-cache]
strategy:
fail-fast: false
max-parallel: 1
matrix:
python_version: ["3.11"]
pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
@@ -234,20 +145,14 @@ jobs:
- name: Check out repository code
uses: actions/checkout@v4
# - name: Restore HF cache
# id: hf-cache-restore
# uses: actions/cache/restore@v4
# with:
# path: |
# /home/runner/.cache/huggingface/hub/datasets--*
# /home/runner/.cache/huggingface/hub/models--*
# key: ${{ runner.os }}-hf-hub-cache-v2
- name: Restore Cache from S3
id: hf-cache-restore-s3
run: |
mkdir -p /home/runner/.cache/huggingface/hub
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
- name: Restore HF cache
id: hf-cache-restore
uses: actions/cache/restore@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ runner.os }}-hf-hub-cache-v2
- name: Setup Python
uses: actions/setup-python@v5
@@ -294,8 +199,16 @@ jobs:
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
- name: Save HF cache
id: hf-cache
uses: actions/cache/save@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
docker-e2e-tests-1st:
# Run this job first as a gate for running the remainder of the test matrix
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
# this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal]
@@ -342,8 +255,6 @@ jobs:
# this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal]
timeout-minutes: 90
# Only run the remainder of the matrix if the first e2e check passed;
# this is to save on wasted compute costs for known failures that get caught in the first run
needs: [pre-commit, pytest, docker-e2e-tests-1st]
strategy:
@@ -356,6 +267,12 @@ jobs:
pytorch: 2.6.0
num_gpus: 1
axolotl_extras: llmcompressor
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.1
num_gpus: 1
axolotl_extras:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
@@ -368,12 +285,6 @@ jobs:
pytorch: 2.7.0
num_gpus: 1
axolotl_extras:
- cuda: 128
cuda_version: 12.8.1
python_version: "3.11"
pytorch: 2.7.0
num_gpus: 1
axolotl_extras:
steps:
- name: Checkout
uses: actions/checkout@v4
@@ -398,43 +309,3 @@ jobs:
- name: Run tests job on Modal
run: |
modal run cicd.e2e_tests
docker-e2e-cleanup:
runs-on: [self-hosted, modal]
timeout-minutes: 90
needs: [docker-e2e-tests]
strategy:
fail-fast: false
matrix:
include:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
num_gpus: 1
axolotl_extras: vllm
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install Modal
run: |
python -m pip install --upgrade pip
pip install modal==0.71.8 jinja2
- name: Update env vars
run: |
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
- name: Run tests job on Modal
run: |
modal run cicd.cleanup

View File

@@ -57,10 +57,8 @@ async def handler(job):
logger.info("Training Complete.")
# Cleanup
if "WANDB_API_KEY" in os.environ:
del os.environ["WANDB_API_KEY"]
if "HF_TOKEN" in os.environ:
del os.environ["HF_TOKEN"]
del os.environ["WANDB_API_KEY"]
del os.environ["HF_TOKEN"]
runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})
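As a side note, a variant of this cleanup that tolerates the variables being unset, shown here only as a sketch, uses `os.environ.pop` with a default so a missing key does not raise `KeyError`:

```python
import os

# tolerant cleanup: pop() simply returns None when the variable is absent
os.environ.pop("WANDB_API_KEY", None)
os.environ.pop("HF_TOKEN", None)
```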

View File

@@ -48,22 +48,8 @@ quartodoc:
contents:
- core.trainers.base
- core.trainers.trl
- core.trainers.mamba
- core.trainers.relora
- core.trainers.dpo.trainer
- core.trainers.grpo.trainer
- core.trainers.grpo.sampler
- core.trainers.utils
- title: Mixins
desc: Mixin classes for augmenting trainers
contents:
- core.trainers.mixins.optimizer
- core.trainers.mixins.rng_state_loader
- core.trainers.mixins.scheduler
- title: Context Managers
desc: Context managers for altering trainer behaviors
contents:
- utils.ctx_managers.sequence_parallel
- title: Prompt Strategies
desc: Prompt formatting strategies
contents:
@@ -100,7 +86,7 @@ quartodoc:
- kernels.swiglu
- kernels.quantize
- kernels.utils
- title: Monkey Patches
- title: MonkeyPatches
desc: Runtime patches for model optimizations
contents:
- monkeypatch.llama_attn_hijack_flash
@@ -138,8 +124,7 @@ quartodoc:
- utils.optimizers.adopt
- utils.data.pretraining
- utils.data.sft
- utils.gradient_checkpointing.offload_cpu
- utils.gradient_checkpointing.offload_disk
- utils.gradient_checkpointing.unsloth
- title: Schemas
desc: Pydantic data models for Axolotl config
contents:

View File

@@ -18,7 +18,7 @@ pytest -v --durations=10 \
--cov-append
# Run patched tests excluding lora kernels with coverage append
pytest --full-trace -vvv --durations=10 \
pytest -v --durations=10 \
--ignore=tests/e2e/patched/lora_kernels \
/workspace/axolotl/tests/e2e/patched \
--cov=axolotl \

View File

@@ -1,19 +0,0 @@
"""Modal app to run axolotl GPU cleanup"""
from .single_gpu import VOLUME_CONFIG, app, cicd_image, run_cmd
@app.function(
image=cicd_image,
timeout=60 * 60,
cpu=8.0,
memory=131072,
volumes=VOLUME_CONFIG,
)
def cleanup():
run_cmd("./cicd/cleanup.sh", "/workspace/axolotl")
@app.local_entrypoint()
def main():
cleanup.remote()

View File

@@ -1,6 +0,0 @@
#!/bin/bash
set -e
# cleanup old cache files for datasets processing and intermediate mappings
find /workspace/data/huggingface-cache/hub/datasets -name "cache-*" -type f -mtime +1 -exec rm {} \;
find /workspace/data/huggingface-cache/hub/datasets -name "*.lock" -type f -mtime +1 -exec rm {} \;

View File

@@ -1,12 +1,75 @@
"""Modal app to run axolotl GPU tests"""
from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd
# pylint: disable=duplicate-code
import os
import pathlib
import tempfile
import jinja2
import modal
from jinja2 import select_autoescape
from modal import App, Image
cicd_path = pathlib.Path(__file__).parent.resolve()
template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
template_env = jinja2.Environment(
loader=template_loader, autoescape=select_autoescape()
)
df_template = template_env.get_template("Dockerfile.jinja")
df_args = {
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
"CUDA": os.environ.get("CUDA", "121"),
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
"NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
"CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
"HF_HOME": "/workspace/data/huggingface-cache/hub",
}
dockerfile_contents = df_template.render(**df_args)
temp_dir = tempfile.mkdtemp()
with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
f.write(dockerfile_contents)
cicd_image = Image.from_dockerfile(
pathlib.Path(temp_dir) / "Dockerfile",
context_mount=None,
force_build=True,
gpu="A10G",
).env(df_args)
app = App("Axolotl CI/CD", secrets=[])
hf_cache_volume = modal.Volume.from_name(
"axolotl-ci-hf-hub-cache", create_if_missing=True
)
VOLUME_CONFIG = {
"/workspace/data/huggingface-cache/hub": hf_cache_volume,
}
N_GPUS = int(os.environ.get("N_GPUS", 1))
GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
def run_cmd(cmd: str, run_folder: str):
import subprocess # nosec
# Propagate errors from subprocess.
if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec
exit(exit_code) # pylint: disable=consider-using-sys-exit
@app.function(
image=cicd_image,
gpu=GPU_CONFIG,
timeout=90 * 60, # 90 min
timeout=60 * 60,
cpu=8.0,
memory=131072,
volumes=VOLUME_CONFIG,

View File

@@ -70,7 +70,7 @@ def run_cmd(cmd: str, run_folder: str):
image=cicd_image,
gpu=GPU_CONFIG,
timeout=90 * 60,
cpu=16.0,
cpu=8.0,
memory=131072 * N_GPUS,
volumes=VOLUME_CONFIG,
)

View File

@@ -1,66 +0,0 @@
"""Modal app to run axolotl GPU tests"""
# pylint: disable=duplicate-code
import os
import pathlib
import tempfile
import jinja2
import modal
from jinja2 import select_autoescape
from modal import App, Image
cicd_path = pathlib.Path(__file__).parent.resolve()
template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
template_env = jinja2.Environment(
loader=template_loader, autoescape=select_autoescape()
)
df_template = template_env.get_template("Dockerfile.jinja")
df_args = {
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
"CUDA": os.environ.get("CUDA", "121"),
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
"NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
"CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
"HF_HOME": "/workspace/data/huggingface-cache/hub",
}
dockerfile_contents = df_template.render(**df_args)
temp_dir = tempfile.mkdtemp()
with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
f.write(dockerfile_contents)
cicd_image = Image.from_dockerfile(
pathlib.Path(temp_dir) / "Dockerfile",
context_mount=None,
force_build=True,
gpu="A10G",
).env(df_args)
app = App("Axolotl CI/CD", secrets=[])
hf_cache_volume = modal.Volume.from_name(
"axolotl-ci-hf-hub-cache", create_if_missing=True
)
VOLUME_CONFIG = {
"/workspace/data/huggingface-cache/hub": hf_cache_volume,
}
N_GPUS = int(os.environ.get("N_GPUS", 1))
GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
def run_cmd(cmd: str, run_folder: str):
import subprocess # nosec
# Propagate errors from subprocess.
if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec
exit(exit_code) # pylint: disable=consider-using-sys-exit

View File

@@ -19,7 +19,7 @@ coverage:
if_no_uploads: error
if_not_found: success
if_ci_failed: error
only_pulls: true
only_pulls: false
flags: null
paths: null
patch:

View File

@@ -32,8 +32,6 @@ tokenizer_legacy:
resize_token_embeddings_to_32x:
# Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.
shrink_embeddings:
# Optional[bool] Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs
embeddings_skip_upcast:
# Whether to load the model with randomly initialized weights. Useful for
# pre-training a model from scratch or debugging purposes.
random_init_weights:
@@ -75,12 +73,11 @@ load_in_8bit: true
load_in_4bit:
# Use CUDA bf16
bf16: true # bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection. require >=ampere
bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
# Use CUDA fp16
fp16: true
# Use CUDA tf32
tf32: true # require >=ampere
# Note: if bf16 is set to 'auto', and fp16 is set to true, we will prefer the explicit fp16 setting
# No AMP (automatic mixed precision)
bfloat16: true # require >=ampere
@@ -187,8 +184,8 @@ datasets:
# adding a system turn with empty content.
drop_system_message:
# Optional[bool]. (for Qwen3 template only) Whether to split the assistant content based on a reasoning trace inside delimited tags
# See example at `docs/dataset-formats/conversation.qmd`
# Optional[bool]. Whether to split the assistant turn based on a reasoning trace inside delimited tags
# defaults to False
split_thinking:
# IMPORTANT: The following fields determine which parts of the conversation to train on.
@@ -505,7 +502,6 @@ save_strategy: # Set to `"no"` to skip checkpoint saves, `"epoch"` at end of eac
save_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps
saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
save_total_limit: # Checkpoints saved at a time
save_only_model: # Save only the model weights, skipping the optimizer. Using this means you can't resume from checkpoints.
# Maximum number of iterations to train for. It precedes num_epochs which means that
# if both are set, num_epochs will not be guaranteed.
# e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
@@ -539,7 +535,7 @@ train_on_inputs: false
# Note that training loss may have an oscillating pattern with this enabled.
group_by_length: false
# Whether to use gradient checkpointing. Available options are: true, false, "offload", "offload_disk".
# Whether to use gradient checkpointing. Available options are: true, false, "offload".
# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
gradient_checkpointing: false
# additional kwargs to pass to the trainer for gradient checkpointing
@@ -551,7 +547,7 @@ gradient_checkpointing: false
early_stopping_patience: 3
# Specify a scheduler and kwargs to use with the optimizer
lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | 'linear' | 'cosine_with_restarts' | 'polynomial' | 'constant' | 'constant_with_warmup' | 'inverse_sqrt' | 'reduce_lr_on_plateau' | 'cosine_with_min_lr' | 'warmup_stable_decay' | empty for cosine
lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | empty for cosine
lr_scheduler_kwargs:
cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
@@ -613,7 +609,6 @@ lr_div_factor: # Learning rate div factor
# - optimi_adamw
# - ao_adamw_8bit
# - ao_adamw_fp8
# - came_pytorch
optimizer:
# Dictionary of arguments to pass to the optimizer
optim_args:
@@ -633,9 +628,7 @@ weight_decay:
# adamw hyperparams
adam_beta1:
adam_beta2:
adam_beta3: # only used for CAME Optimizer
adam_epsilon:
adam_epsilon2: # only used for CAME Optimizer
# Gradient clipping max norm
max_grad_norm:

View File

@@ -196,34 +196,6 @@ datasets:
It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
:::
8. (For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
```yaml
datasets:
- path: ...
type: chat_template
chat_template: qwen3
split_thinking: true
```
For example, a content can look like:
```json
{
"content": "<think>Some thinking outputs</think>Output after thinking."
}
```
After split, it will look like:
```json
{
"reasoning_content": "Some thinking outputs",
"content": "Output after thinking..."
}
```
## sharegpt
::: {.callout-important}

View File

@@ -8,10 +8,6 @@ format:
This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).
::: {.callout-important}
For Blackwell GPUs, please use the tags with Pytorch 2.7.0 and CUDA 12.8.
:::
## Base
The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.

View File

@@ -104,7 +104,7 @@ the `alpaca` dataset format, which has the following format:
Please see our [Dataset Formats](dataset-formats) for more dataset formats and how to
format them.
2. Prepare your JSONL data in the specified format (in this case, the expected `alpaca`
2. Prepare your JSONL data in the specified format (in this case, the expected `alpaca
format):
```json
@@ -120,12 +120,6 @@ axolotl train my_training.yml
## Common Tasks {#sec-common-tasks}
::: {.callout-tip}
The same yaml file is used for training, inference, and merging.
:::
### Testing Your Model {#sec-testing}
After training, test your model:
@@ -134,16 +128,6 @@ After training, test your model:
axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out"
```
More details can be found in [Inference](inference.qmd).
### Using a UI {#sec-ui}
Launch a Gradio interface:
```bash
axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out" --gradio
```
### Preprocessing Data {#sec-preprocessing}
For large datasets, preprocess first:
@@ -152,22 +136,14 @@ For large datasets, preprocess first:
axolotl preprocess my_training.yml
```
Please make sure to set `dataset_prepared_path:` in your config to the path where the prepared dataset should be saved.
### Using a UI {#sec-ui}
More details can be found in [Dataset Preprocessing](dataset_preprocessing.qmd).
### Merging LoRA weights {#sec-merging-lora}
To merge the LoRA weights back into the base model, run:
Launch a Gradio interface:
```bash
axolotl merge-lora my_training.yml --lora-model-dir="./outputs/lora-out"
axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out" --gradio
```
The merged model will be saved in the `{output_dir}/merged` directory.
More details can be found in [Merging LoRA weights](inference.qmd#sec-merging).
## Next Steps {#sec-next-steps}
Now that you have the basics, you might want to:
@@ -180,7 +156,6 @@ Now that you have the basics, you might want to:
Check our other guides for details on these topics:
- [Configuration Guide](config.qmd) - Full configuration options
- [Dataset Loading](dataset-loading.qmd) - Loading datasets from various sources
- [Dataset Formats](dataset-formats) - Working with different data formats
- [Multi-GPU Training](multi-gpu.qmd)
- [Multi-Node Training](multi-node.qmd)

View File

@@ -25,10 +25,6 @@ Please make sure to have Pytorch installed before installing Axolotl in your loc
Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
:::
::: {.callout-important}
For Blackwell GPUs, please use Pytorch 2.7.0 and CUDA 12.8.
:::
### PyPI Installation (Recommended) {#sec-pypi}
```{.bash}
@@ -76,10 +72,6 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
```
:::
::: {.callout-important}
For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.7.0` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.7.0`.
:::
Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.
## Cloud Environments {#sec-cloud}

View File

@@ -87,7 +87,20 @@ We support sequence parallelism (SP) via the
allows one to split up sequences across GPUs, which is useful in the event that a
single sequence causes OOM errors during model training.
See our [dedicated guide](sequence_parallelism.qmd) for more information.
First, install `ring-flash-attn`, recommended via `pip install axolotl[ring-flash-attn]`,
or from source with `pip install .[ring-flash-attn]`.
Your Axolotl YAML config should contain the following lines:
```{.yaml}
sequence_parallel_degree: 4 # Split each sequence into 4 parts, one per GPU
flash_attention: true # Required with sequence parallelism
# Optional; strides across the key dimension. Larger values use more memory but will make training faster.
heads_k_stride: 1
```
See our [dedicated guide](sequence_parallelism.qmd) for more details.
### FSDP + QLoRA {#sec-fsdp-qlora}

View File

@@ -3,6 +3,8 @@ title: Sequence Parallelism
description: Train with long sequences split across multiple GPUs.
---
# Sequence Parallelism
Sequence parallelism is a technique that splits sequences across multiple GPUs,
allowing you to train with very long sequences that wouldn't fit on a single GPU. Each
GPU processes a different portion of the sequence, and the results are aggregated
@@ -25,7 +27,7 @@ To enable sequence parallelism, add the following to your configuration file:
sequence_parallel_degree: 4 # Split sequences across 4 GPUs
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
heads_k_stride: 1
# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
# Optional; one of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to
# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
ring_attn_func:
```
@@ -41,7 +43,7 @@ When sequence parallelism is enabled:
1. Each sequence is divided into equal chunks across the GPUs in a sequence parallel group
2. The data collator handles the chunking of input_ids, attention_mask, labels, and position_ids
3. Position IDs are adjusted to maintain proper relative positions
3. Position IDs are adjusted to maintain proper relative positions, especially for packed sequences
4. The trainer uses special ring communication patterns for attention operations
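To make steps 1-3 concrete, here is a minimal, illustrative sketch (not the actual Axolotl data collator; `chunk_for_rank` is a hypothetical helper) of how one rank's slice of a sequence and its position IDs could be derived:

```python
import torch

def chunk_for_rank(input_ids: torch.Tensor, rank: int, sp_degree: int):
    """Return this rank's contiguous slice of the sequence plus matching position IDs."""
    seq_len = input_ids.size(0)
    assert seq_len % sp_degree == 0, "sequence length must divide evenly across the SP group"
    chunk = seq_len // sp_degree
    start, end = rank * chunk, (rank + 1) * chunk
    # position IDs keep their global offsets so relative positions stay correct
    position_ids = torch.arange(start, end)
    return input_ids[start:end], position_ids

# toy example: an 8192-token sequence split across a 4-GPU group
ids = torch.arange(8192)
local_ids, local_pos = chunk_for_rank(ids, rank=1, sp_degree=4)
print(local_ids.shape, int(local_pos[0]))  # torch.Size([2048]) 2048
```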
## Requirements
@@ -67,11 +69,9 @@ sequence_len: 8192
...
sequence_parallel_degree: 4 # Split each sequence into 4 parts, one per GPU
flash_attention: true # Required with sequence parallelism
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
heads_k_stride: 1
# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
ring_attn_func:
...
```

View File

@@ -34,5 +34,3 @@ We provide a script to delinearize Llama 4 linearized models into regular Huggin
```bash
axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir
```
Note: This only works with the non-quantized linearized model. If you have an adapter, merge it with the *non-quantized linearized* model before delinearizing.

View File

@@ -1,48 +0,0 @@
base_model: mistralai/Devstral-Small-2505
processor_type: AutoProcessor
# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false
chat_template: mistral_v7_tekken
datasets:
- path: HuggingFaceH4/llava-instruct-mix-vsft
type: chat_template
split: train[:1%]
field_messages: messages
dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out
sequence_len: 2048
pad_to_sequence_len: false
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
logging_steps: 1
flash_attention: false
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

View File

@@ -1,341 +0,0 @@
# Finetuning LLMs to output audio
In this example, we finetune canopylabs/orpheus-3b-0.1-pretrained (a LLaMA 3.2 3B model) to output audio.
With the current settings, `finetune.yml` will run on any Nvidia GPU with 45GB of VRAM or more. If you reduce the batch size, it can easily run on GPUs with less than 24GB of VRAM.
## Dataset pre-processing for pre-training
If you are adding another voice in English, please jump ahead to the finetune pre-processing section.
For this to work, we need to preprocess our dataset. Since we expect the model to output audio, we need to extend the text tokenizer with audio tokens.
The code below downloads the SNAC model, encodes each audio clip into those audio token ids, and uploads the final dataset.
```python
import torch
from snac import SNAC
from datasets import load_dataset
from huggingface_hub import snapshot_download
from datasets import load_dataset
import random
import torchaudio.transforms as T
from transformers import AutoTokenizer
import os
my_original_dataset_name = "<huggingface-id-of-dataset-that-we-want-to-preprocess>"
name_to_push_dataset_to = "<huggingface-id-of-where-to-save-dataset>"
dsn = my_original_dataset_name
snapshot_download(
repo_id=dsn,
repo_type="dataset",
revision="main",
max_workers=64,
)
ds = load_dataset(dsn, split="train")
ds_sample_rate = ds[0]["audio"]["sampling_rate"]
model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
model = model.to("mps")
def tokenise_audio(waveform):
waveform = torch.from_numpy(waveform).unsqueeze(0)
waveform = waveform.to(dtype=torch.float32)
resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
waveform = resample_transform(waveform)
waveform = waveform.unsqueeze(0).to("cuda")
#generate the codes from snac
with torch.inference_mode():
codes = model.encode(waveform)
all_codes = []
for i in range(codes[0].shape[1]):
all_codes.append(codes[0][0][i].item()+128266)
all_codes.append(codes[1][0][2*i].item()+128266+4096)
all_codes.append(codes[2][0][4*i].item()+128266+(2*4096))
all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096))
all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096))
all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096))
all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096))
return all_codes
def add_codes(example):
# Always initialize codes_list to None
codes_list = None
try:
answer_audio = example.get("audio")
# If there's a valid audio array, tokenise it
if answer_audio and "array" in answer_audio:
audio_array = answer_audio["array"]
codes_list = tokenise_audio(audio_array)
except Exception as e:
print(f"Skipping row due to error: {e}")
# Keep codes_list as None if we fail
example["codes_list"] = codes_list
return example
ds = ds.map(add_codes, remove_columns=["audio"])
#@title Load Tokenizer
tokeniser_length = 128256
start_of_text = 128000
end_of_text = 128009
start_of_speech = tokeniser_length + 1
end_of_speech = tokeniser_length + 2
start_of_human = tokeniser_length + 3
end_of_human = tokeniser_length + 4
start_of_ai = tokeniser_length + 5
end_of_ai = tokeniser_length + 6
pad_token = tokeniser_length + 7
audio_tokens_start = tokeniser_length + 10
tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
num_proc = os.cpu_count() - 2
ds = ds.filter(lambda x: x["codes_list"] is not None)
ds = ds.filter(lambda x: len(x["codes_list"]) > 0)
#@title Create Input Ids
def remove_duplicate_frames(example):
vals = example["codes_list"]
if len(vals) % 7 != 0:
raise ValueError("Input list length must be divisible by 7")
result = vals[:7]
removed_frames = 0
for i in range(7, len(vals), 7):
current_first = vals[i]
previous_first = result[-7]
if current_first != previous_first:
result.extend(vals[i:i+7])
else:
removed_frames += 1
example["codes_list"] = result
return example
ds = ds.map(remove_duplicate_frames, num_proc=num_proc)
def create_input_ids(example):
text_ids = tokenizer.encode(example['text'], add_special_tokens=True)
text_ids.append(end_of_text)
example["text_tokens"] = text_ids
input_ids = (
[start_of_human]
+ example["text_tokens"]
+ [end_of_human]
+ [start_of_ai]
+ [start_of_speech]
+ example["codes_list"]
+ [end_of_speech]
+ [end_of_ai]
)
example["input_ids"] = input_ids
example["labels"] = input_ids
example["attention_mask"] = [1] * len(input_ids)
return example
ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])
#@title Remove unnecessary columns
columns_to_keep = ["input_ids", "labels", "attention_mask"]
columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]
ds = ds.remove_columns(columns_to_remove)
ds.push_to_hub(name_to_push_dataset_to)
```
## Finetune pre-processing
Use this code to add a new voice.
```python
import torch
from snac import SNAC
from datasets import load_dataset
from huggingface_hub import snapshot_download
from datasets import load_dataset
import random
import torchaudio.transforms as T
from transformers import AutoTokenizer
import os
my_original_dataset_name = "<huggingface-id-of-dataset-that-we-want-to-preprocess>"
name_to_push_dataset_to = "<huggingface-id-of-where-to-save-dataset>"
dsn = my_original_dataset_name
snapshot_download(
repo_id=dsn,
repo_type="dataset",
revision="main",
max_workers=64,
)
ds = load_dataset(dsn, split="train")
ds_sample_rate = ds[0]["audio"]["sampling_rate"]
model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
model = model.to("mps")
def tokenise_audio(waveform):
waveform = torch.from_numpy(waveform).unsqueeze(0)
waveform = waveform.to(dtype=torch.float32)
resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
waveform = resample_transform(waveform)
waveform = waveform.unsqueeze(0).to("cuda")
#generate the codes from snac
with torch.inference_mode():
codes = model.encode(waveform)
all_codes = []
for i in range(codes[0].shape[1]):
all_codes.append(codes[0][0][i].item()+128266)
all_codes.append(codes[1][0][2*i].item()+128266+4096)
all_codes.append(codes[2][0][4*i].item()+128266+(2*4096))
all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096))
all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096))
all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096))
all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096))
return all_codes
def add_codes(example):
# Always initialize codes_list to None
codes_list = None
try:
answer_audio = example.get("audio")
# If there's a valid audio array, tokenise it
if answer_audio and "array" in answer_audio:
audio_array = answer_audio["array"]
codes_list = tokenise_audio(audio_array)
except Exception as e:
print(f"Skipping row due to error: {e}")
# Keep codes_list as None if we fail
example["codes_list"] = codes_list
return example
ds = ds.map(add_codes, remove_columns=["audio"])
#@title Load Tokenizer
tokeniser_length = 128256
start_of_text = 128000
end_of_text = 128009
start_of_speech = tokeniser_length + 1
end_of_speech = tokeniser_length + 2
start_of_human = tokeniser_length + 3
end_of_human = tokeniser_length + 4
start_of_ai = tokeniser_length + 5
end_of_ai = tokeniser_length + 6
pad_token = tokeniser_length + 7
audio_tokens_start = tokeniser_length + 10
tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
num_proc = os.cpu_count() - 2
ds = ds.filter(lambda x: x["codes_list"] is not None)
ds = ds.filter(lambda x: len(x["codes_list"]) > 0)
#@title Create Input Ids
def remove_duplicate_frames(example):
vals = example["codes_list"]
if len(vals) % 7 != 0:
raise ValueError("Input list length must be divisible by 7")
result = vals[:7]
removed_frames = 0
for i in range(7, len(vals), 7):
current_first = vals[i]
previous_first = result[-7]
if current_first != previous_first:
result.extend(vals[i:i+7])
else:
removed_frames += 1
example["codes_list"] = result
return example
ds = ds.map(remove_duplicate_frames, num_proc=num_proc)
tok_info = '''*** HERE you can modify the text prompt
i.e. if you wanted a multispeaker model like canopylabs/orpheus-3b-0.1-ft, you can pass:
f"{example["source"]}: {example["text"]}", as is passed.
'''
print(tok_info)
def create_input_ids(example):
text_ids = tokenizer.encode(f"{example['speaker_id']}: {example['text']}", add_special_tokens=True)
text_ids.append(end_of_text)
example["text_tokens"] = text_ids
input_ids = (
[start_of_human]
+ example["text_tokens"]
+ [end_of_human]
+ [start_of_ai]
+ [start_of_speech]
+ example["codes_list"]
+ [end_of_speech]
+ [end_of_ai]
)
example["input_ids"] = input_ids
example["labels"] = input_ids
example["attention_mask"] = [1] * len(input_ids)
return example
ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])
#@title Remove unnecessary columns
columns_to_keep = ["input_ids", "labels", "attention_mask"]
columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]
ds = ds.remove_columns(columns_to_remove)
ds.push_to_hub(name_to_push_dataset_to)
```
## Training
After preprocessing is done, fill out the blanks in finetune.yml and simply run `axolotl train finetune.yml`
## Inference
For inference, please refer to the original [orpheus github](https://github.com/canopyai/Orpheus-TTS/tree/main).

View File

@@ -1,52 +0,0 @@
base_model: canopylabs/orpheus-3b-0.1-pretrained
hub_model_id: <your-hub-model-id>
plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true
datasets:
- path: <your-hf-dataset-id>
type: # leave empty to load pre-tokenized
dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out
sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 8
micro_batch_size: 4
num_epochs: 3
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 2e-5
bf16: auto
tf32: false
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_steps: 20
evals_per_epoch: 5
saves_per_epoch: 5
weight_decay: 0.05
special_tokens:
pad_token: <custom_token_7>

View File

@@ -2,6 +2,7 @@ base_model: Qwen/Qwen2.5-0.5B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
chat_template: qwen_25
rl: dpo
datasets:

View File

@@ -6,17 +6,16 @@ triton>=3.0.0
mamba-ssm==1.2.0.post1
xformers>=0.0.23.post1
autoawq==0.2.7.post3
liger-kernel==0.5.9
liger-kernel==0.5.8
# END section
packaging==23.2
huggingface_hub==0.31.0
peft==0.15.2
transformers==4.51.3
tokenizers>=0.21.1
accelerate==1.6.0
datasets==3.5.1
datasets==3.5.0
deepspeed>=0.15.4
trl==0.17.0
hf_xet==1.1.0

View File

@@ -67,13 +67,13 @@ def parse_requirements(extras_require_map):
if (major, minor) >= (2, 7):
_install_requires.pop(_install_requires.index(xformers_version))
# _install_requires.append("xformers==0.0.29.post3") # xformers seems to be hard pinned to 2.6.0
extras_require_map["vllm"] = ["vllm==0.8.5.post1"]
extras_require_map["vllm"] = ["vllm==0.8.5"]
elif (major, minor) >= (2, 6):
_install_requires.pop(_install_requires.index(xformers_version))
_install_requires.append(
"xformers==0.0.29.post2"
) # vllm needs post2 w torch 2.6
extras_require_map["vllm"] = ["vllm==0.8.5.post1"]
extras_require_map["vllm"] = ["vllm==0.8.5"]
elif (major, minor) >= (2, 5):
_install_requires.pop(_install_requires.index(xformers_version))
if patch == 0:
@@ -142,7 +142,6 @@ extras_require = {
"apollo-torch",
"lomo-optim==0.1.1",
"torch-optimi==0.2.1",
"came_pytorch==0.1.3",
],
"ray": [
"ray[train]",

View File

@@ -82,12 +82,6 @@ class VllmServeCliArgs:
"hardware support this feature."
},
)
serve_module: Optional[str] = field(
default=None,
metadata={
"help": "Module to serve. If not set, the default module will be used."
},
)
@dataclass

View File

@@ -16,15 +16,8 @@ AXOLOTL_LOGO = """
@@@@ @@@@@@@@@@@@@@@@
"""
HAS_PRINTED_LOGO = False
def print_axolotl_text_art():
"""Prints axolotl ASCII art."""
global HAS_PRINTED_LOGO # pylint: disable=global-statement
if HAS_PRINTED_LOGO:
return
if is_main_process():
HAS_PRINTED_LOGO = True
print(AXOLOTL_LOGO)

View File

@@ -15,7 +15,7 @@ from axolotl.cli.checks import check_accelerate_default_config, check_user_token
from axolotl.cli.config import load_cfg
from axolotl.common.datasets import load_datasets, load_preference_datasets
from axolotl.evaluate import evaluate
from axolotl.utils import patch_optimized_env
from axolotl.utils import set_pytorch_cuda_alloc_conf
from axolotl.utils.dict import DictDefault
LOG = logging.getLogger(__name__)
@@ -32,7 +32,7 @@ def do_evaluate(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
cli_args: CLI arguments.
"""
# Enable expandable segments for cuda allocation to improve VRAM usage
patch_optimized_env()
set_pytorch_cuda_alloc_conf()
# pylint: disable=duplicate-code
print_axolotl_text_art()

View File

@@ -29,7 +29,7 @@ from axolotl.cli.utils import (
filter_none_kwargs,
)
from axolotl.integrations.lm_eval.cli import lm_eval
from axolotl.utils import patch_optimized_env
from axolotl.utils import set_pytorch_cuda_alloc_conf
from axolotl.utils.schemas.config import AxolotlInputConfig
@@ -55,8 +55,6 @@ def preprocess(config: str, cloud: Optional[str] = None, **kwargs) -> None:
kwargs: Additional keyword arguments which correspond to CLI args or `axolotl`
config options.
"""
patch_optimized_env()
if cloud:
from axolotl.cli.cloud import do_cli_preprocess
@@ -102,7 +100,7 @@ def train(
config options.
"""
# Enable expandable segments for cuda allocation to improve VRAM usage
patch_optimized_env()
set_pytorch_cuda_alloc_conf()
if "use_ray" in kwargs and kwargs["use_ray"]:
accelerate = False

View File

@@ -18,7 +18,6 @@ from axolotl.cli.checks import check_accelerate_default_config, check_user_token
from axolotl.cli.config import load_cfg
from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
from axolotl.common.datasets import load_datasets, load_preference_datasets
from axolotl.integrations.base import PluginManager
from axolotl.utils.dict import DictDefault
from axolotl.utils.trainer import disable_datasets_caching
@@ -48,10 +47,7 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
cfg.dataset_prepared_path = DEFAULT_DATASET_PREPARED_PATH
with disable_datasets_caching():
plugin_manager = PluginManager.get_instance()
if plugin_manager.load_datasets(cfg, preprocess=True):
pass
elif cfg.rl:
if cfg.rl:
load_preference_datasets(cfg=cfg, cli_args=cli_args)
else:
load_datasets(cfg=cfg, cli_args=cli_args)

View File

@@ -18,7 +18,7 @@ from axolotl.cli.config import load_cfg
from axolotl.common.datasets import load_datasets, load_preference_datasets
from axolotl.integrations.base import PluginManager
from axolotl.train import train
from axolotl.utils import patch_optimized_env
from axolotl.utils import set_pytorch_cuda_alloc_conf
from axolotl.utils.config import normalize_config, resolve_dtype
from axolotl.utils.dict import DictDefault
@@ -36,20 +36,17 @@ def do_train(cfg: DictDefault, cli_args: TrainerCliArgs):
cli_args: Training-specific CLI arguments.
"""
# Enable expandable segments for cuda allocation to improve VRAM usage
patch_optimized_env()
set_pytorch_cuda_alloc_conf()
print_axolotl_text_art()
check_accelerate_default_config()
if int(os.getenv("LOCAL_RANK", "0")) == 0:
check_user_token()
plugin_manager = PluginManager.get_instance()
dataset_meta = plugin_manager.load_datasets(cfg, preprocess=False)
if not dataset_meta:
if cfg.rl:
dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)
else:
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
if cfg.rl:
dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)
else:
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta)

View File

@@ -20,9 +20,8 @@ from transformers import (
ProcessorMixin,
)
from axolotl.loaders import load_processor, load_tokenizer
from axolotl.loaders.model import ModelLoader
from axolotl.utils.dict import DictDefault
from axolotl.utils.models import load_model, load_processor, load_tokenizer
LOG = logging.getLogger(__name__)
@@ -319,8 +318,7 @@ def load_model_and_tokenizer(
tokenizer = load_tokenizer(cfg)
LOG.info("loading model...")
model_loader = ModelLoader(cfg, tokenizer, inference=inference)
model, _ = model_loader.load()
model, _ = load_model(cfg, tokenizer, inference=inference)
processor = None
if cfg.is_multimodal:

View File

@@ -6,6 +6,7 @@ from pathlib import Path
from typing import Union
from trl.scripts.vllm_serve import ScriptArguments
from trl.scripts.vllm_serve import main as vllm_serve_main
from axolotl.cli.config import load_cfg
@@ -27,9 +28,6 @@ def do_vllm_serve(
cfg = load_cfg(config)
model = cfg.base_model
serve_module = cli_args.get("serve_module", "trl.scripts.vllm_serve")
vllm_serve_main = getattr(__import__(serve_module, fromlist=["main"]), "main")
tensor_parallel_size = (
cli_args.get("tensor_parallel_size") or cfg.vllm.tensor_parallel_size
)

View File

@@ -10,11 +10,10 @@ from datasets import Dataset
import axolotl.monkeypatch.data.batch_dataset_fetcher # pylint: disable=unused-import # noqa: F401
from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs
from axolotl.loaders import load_processor, load_tokenizer
from axolotl.utils.data import prepare_dataset
from axolotl.utils.data.rl import load_prepare_preference_datasets
from axolotl.utils.dict import DictDefault
from axolotl.utils.schemas.enums import RLType
from axolotl.utils.models import load_processor, load_tokenizer
from axolotl.utils.tokenization import check_dataset_labels
LOG = logging.getLogger(__name__)
@@ -49,7 +48,6 @@ def load_datasets(
*,
cfg: DictDefault,
cli_args: PreprocessCliArgs | TrainerCliArgs | None = None,
debug: bool = False,
) -> TrainDatasetMeta:
"""
Loads one or more training or evaluation datasets, calling
@@ -58,7 +56,6 @@ def load_datasets(
Args:
cfg: Dictionary mapping `axolotl` config keys to values.
cli_args: Command-specific CLI arguments.
debug: Whether to print out tokenization of sample
Returns:
Dataclass with fields for training and evaluation datasets and the computed
@@ -80,25 +77,20 @@ def load_datasets(
preprocess_iterable=preprocess_iterable,
)
if ( # pylint: disable=too-many-boolean-expressions
cli_args
and (
cli_args.debug
or cfg.debug
or cli_args.debug_text_only
or int(cli_args.debug_num_examples) > 0
)
) or debug:
if cli_args and (
cli_args.debug
or cfg.debug
or cli_args.debug_text_only
or int(cli_args.debug_num_examples) > 0
):
LOG.info("check_dataset_labels...")
num_examples = cli_args.debug_num_examples if cli_args else 1
text_only = cli_args.debug_text_only if cli_args else False
train_samples = sample_dataset(train_dataset, num_examples)
train_samples = sample_dataset(train_dataset, cli_args.debug_num_examples)
check_dataset_labels(
train_samples,
tokenizer,
num_examples=num_examples,
text_only=text_only,
num_examples=cli_args.debug_num_examples,
text_only=cli_args.debug_text_only,
)
LOG.info("printing prompters...")
@@ -134,7 +126,7 @@ def load_preference_datasets(
total_num_steps: Optional[int] = int(
math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
)
if cfg.rl is RLType.GRPO:
if cfg.rl == "grpo":
total_num_steps = None
if cli_args.debug or cfg.debug:

View File

@@ -21,7 +21,6 @@ import importlib.util
import inspect
import logging
import math
import os
import sys
from abc import abstractmethod
from pathlib import Path
@@ -59,7 +58,6 @@ from axolotl.core.training_args import (
AxolotlTrainingArguments,
)
from axolotl.integrations.base import PluginManager
from axolotl.loaders.utils import ensure_dtype
from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
from axolotl.monkeypatch.relora import ReLoRACallback
from axolotl.monkeypatch.trainer.lr import patch_trainer_get_lr
@@ -74,7 +72,6 @@ from axolotl.utils.callbacks import (
SaveBetterTransformerModelCallback,
bench_eval_callback_factory,
causal_lm_bench_eval_callback_factory,
colab_inference_post_train_callback,
log_prediction_callback_factory,
)
from axolotl.utils.callbacks.lisa import lisa_callback_factory
@@ -87,7 +84,8 @@ from axolotl.utils.collators import (
V2BatchSamplerDataCollatorForSeq2Seq,
)
from axolotl.utils.collators.mm_chat import MultiModalChatDataCollator
from axolotl.utils.schemas.enums import CustomSupportedOptimizers, RLType
from axolotl.utils.models import ensure_dtype
from axolotl.utils.schemas.enums import CustomSupportedOptimizers
try:
import torch._dynamo # pylint: disable=ungrouped-imports
@@ -170,9 +168,6 @@ class TrainerBuilderBase(abc.ABC):
)
)
if self.cfg.gc_steps:
callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))
if self.cfg.use_wandb:
callbacks.append(
SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
@@ -254,6 +249,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
if self.cfg.loss_watchdog_threshold is not None:
callbacks.append(LossWatchDogCallback(self.cfg))
if self.cfg.gc_steps:
callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))
return callbacks
def get_post_trainer_create_callbacks(self, trainer):
@@ -295,10 +293,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
callbacks.append(lisa_callback_factory(trainer))
if any("COLAB_" in key for key in os.environ):
ColabCallback = colab_inference_post_train_callback(trainer)
callbacks.append(ColabCallback(self.cfg))
callbacks.extend(super().get_post_trainer_create_callbacks(trainer=trainer))
return callbacks
@@ -353,7 +347,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
training_arguments_kwargs["warmup_steps"] = warmup_steps
training_arguments_kwargs["logging_steps"] = logging_steps
if self.cfg.seed is not None:
if self.cfg.seed:
training_arguments_kwargs["seed"] = self.cfg.seed
if self.cfg.gradient_checkpointing:
@@ -387,12 +381,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
training_arguments_kwargs["adam_beta1"] = self.cfg.adam_beta1
if self.cfg.adam_beta2:
training_arguments_kwargs["adam_beta2"] = self.cfg.adam_beta2
if self.cfg.adam_beta3:
training_arguments_kwargs["adam_beta3"] = self.cfg.adam_beta3
if self.cfg.adam_epsilon:
training_arguments_kwargs["adam_epsilon"] = self.cfg.adam_epsilon
if self.cfg.adam_epsilon2:
training_arguments_kwargs["adam_epsilon2"] = self.cfg.adam_epsilon2
if self.cfg.max_grad_norm:
training_arguments_kwargs["max_grad_norm"] = self.cfg.max_grad_norm
@@ -551,6 +541,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
report_to = []
if self.cfg.use_wandb:
report_to.append("wandb")
if self.cfg.wandb_name:
training_arguments_kwargs["run_name"] = self.cfg.wandb_name
if self.cfg.use_mlflow:
report_to.append("mlflow")
if self.cfg.use_tensorboard:
@@ -710,20 +702,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
optimizer_cls = ADOPT
adam_kwargs["decouple"] = True
optimizer_kwargs.update(adam_kwargs)
elif self.cfg.optimizer == "came_pytorch":
from came_pytorch import CAME
optimizer_cls = CAME
beta1 = training_arguments_kwargs.get("adam_beta1", 0.9)
beta2 = training_arguments_kwargs.get("adam_beta2", 0.999)
beta3 = training_arguments_kwargs.get("adam_beta3", 0.9999)
eps1 = training_arguments_kwargs.get("adam_epsilon", 1e-30)
eps2 = training_arguments_kwargs.get("adam_epsilon2", 1e-16)
adam_kwargs["betas"] = (beta1, beta2, beta3)
adam_kwargs["eps"] = (eps1, eps2)
optimizer_kwargs.update(adam_kwargs)
# Parse any additional optimizer args from config
if self.cfg.optim_args:
@@ -798,6 +776,11 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
self.cfg.kd_top_k_before_softmax
)
training_arguments_kwargs["sequence_parallel_degree"] = (
self.cfg.sequence_parallel_degree
)
training_arguments_kwargs["ring_attn_func"] = self.cfg.ring_attn_func
if self.cfg.reward_model:
training_args_cls = AxolotlRewardConfig
elif self.cfg.process_reward_model:
@@ -818,15 +801,14 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
data_collator_kwargs = {
"padding": True, # True/"longest" is the default
}
multiple = 64
if self.cfg.pad_to_sequence_len:
data_collator_kwargs["pad_to_multiple_of"] = multiple * math.ceil(
self.cfg.sequence_len / multiple
data_collator_kwargs["pad_to_multiple_of"] = 64 * math.ceil(
self.cfg.sequence_len / 64
)
else:
# A100 is best at 64, while others at 8. Let's use the larger so we don't have to check
# https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html
data_collator_kwargs["pad_to_multiple_of"] = multiple
data_collator_kwargs["pad_to_multiple_of"] = 64
if self.cfg.reward_model:
data_collator_kwargs["max_length"] = self.cfg.sequence_len
@@ -1032,10 +1014,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
training_args_kwargs["dataloader_prefetch_factor"] = (
self.cfg.dataloader_prefetch_factor
)
if self.cfg.seed is not None:
training_args_kwargs["seed"] = self.cfg.seed
if self.cfg.gradient_checkpointing:
training_args_kwargs["gradient_checkpointing"] = (
self.cfg.gradient_checkpointing
@@ -1059,8 +1037,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
# default to saving each epoch if not defined
training_args_kwargs["save_strategy"] = "epoch"
training_args_kwargs["save_only_model"] = self.cfg.save_only_model
if self.cfg.dataset_processes:
training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
@@ -1080,7 +1056,7 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
training_args_cls = None
blocklist_args_kwargs = []
if self.cfg.rl is RLType.SIMPO:
if self.cfg.rl == "simpo":
training_args_cls = AxolotlCPOConfig
training_args_kwargs["loss_type"] = "simpo"
training_args_kwargs["max_length"] = self.cfg.sequence_len
@@ -1088,13 +1064,13 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
if self.cfg.cpo_alpha is not None:
training_args_kwargs["cpo_alpha"] = self.cfg.cpo_alpha
elif self.cfg.rl is RLType.ORPO:
elif self.cfg.rl == "orpo":
training_args_cls = AxolotlORPOConfig
training_args_kwargs["max_length"] = self.cfg.sequence_len
if self.cfg.max_prompt_len:
training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len
elif self.cfg.rl is RLType.KTO:
elif self.cfg.rl == "kto":
training_args_cls = AxolotlKTOConfig
training_args_kwargs["desirable_weight"] = (
@@ -1108,14 +1084,14 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
if self.cfg.max_prompt_len:
training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len
elif self.cfg.rl is RLType.GRPO:
elif self.cfg.rl == "grpo":
training_args_cls = GRPOStrategy.get_training_args_class()
training_args_kwargs.update(GRPOStrategy.set_training_args_kwargs(self.cfg))
blocklist_args_kwargs = GRPOStrategy.get_blocklist_args_kwargs()
else:
training_args_cls = AxolotlDPOConfig
if self.cfg.rl is RLType.IPO:
if self.cfg.rl == "ipo":
training_args_kwargs["loss_type"] = "ipo"
training_args_kwargs["max_length"] = self.cfg.sequence_len
training_args_kwargs["max_completion_length"] = None
@@ -1158,76 +1134,67 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
def build(self, total_num_steps):
training_args = self.build_training_arguments(total_num_steps)
trainer_kwargs = {}
if self.cfg.rl is RLType.IPO:
dpo_trainer_kwargs = {}
if self.cfg.rl == "ipo":
if self.cfg.dpo_label_smoothing:
trainer_kwargs["label_smoothing"] = self.cfg.dpo_label_smoothing
dpo_trainer_kwargs["label_smoothing"] = self.cfg.dpo_label_smoothing
if self.eval_dataset:
trainer_kwargs["eval_dataset"] = self.eval_dataset
dpo_trainer_kwargs["eval_dataset"] = self.eval_dataset
if self.cfg.adapter and self.peft_config:
if self.cfg.rl is not RLType.GRPO:
trainer_kwargs["peft_config"] = self.peft_config
dpo_trainer_kwargs["peft_config"] = self.peft_config
if self.cfg.precompute_ref_log_probs is not None:
trainer_kwargs["precompute_ref_log_probs"] = (
dpo_trainer_kwargs["precompute_ref_log_probs"] = (
self.cfg.precompute_ref_log_probs
)
if self.cfg.rl is RLType.GRPO:
trainer_cls = GRPOStrategy.get_trainer_class(
sequence_parallel=self.cfg.sequence_parallel_degree > 1
)
if self.cfg.rl == "grpo":
trainer_cls = GRPOStrategy.get_trainer_class()
trainer_cls_args = [self.model]
trainer_cls_args.extend(GRPOStrategy.set_trainer_args(self.cfg))
trainer_kwargs.update(GRPOStrategy.set_trainer_kwargs(self.cfg))
elif self.cfg.rl in [RLType.DPO, RLType.IPO]:
dpo_trainer_kwargs.update(GRPOStrategy.set_trainer_kwargs(self.cfg))
elif self.cfg.rl in ["dpo", "ipo"]:
trainer_cls = DPOStrategy.get_trainer_class()
trainer_cls_args = [self.model, self.model_ref]
elif self.cfg.rl is RLType.ORPO:
elif self.cfg.rl == "orpo":
trainer_cls = AxolotlORPOTrainer
trainer_cls_args = [self.model]
elif self.cfg.rl is RLType.KTO:
elif self.cfg.rl in ["kto"]:
trainer_cls = AxolotlKTOTrainer
trainer_cls_args = [self.model]
elif self.cfg.rl is RLType.SIMPO:
elif self.cfg.rl in ["simpo"]:
trainer_cls = AxolotlCPOTrainer
trainer_cls_args = [self.model]
else:
raise ValueError(f"Unsupported RL: {self.cfg.rl}")
if self.cfg.plugins:
plugin_manager = PluginManager.get_instance()
temp_trainer_cls = plugin_manager.get_trainer_cls(self.cfg)
if temp_trainer_cls is not None:
trainer_cls = temp_trainer_cls
sig = inspect.signature(trainer_cls)
if "tokenizer" in sig.parameters.keys():
trainer_kwargs["tokenizer"] = self.tokenizer
dpo_trainer_kwargs["tokenizer"] = self.tokenizer
else:
trainer_kwargs["processing_class"] = self.tokenizer
dpo_trainer_kwargs["processing_class"] = self.tokenizer
if self.cfg.datasets is not None and (
trainer_cls is DPOStrategy.get_trainer_class()
):
trainer_kwargs["dataset_tags"] = [
dpo_trainer_kwargs["dataset_tags"] = [
d["path"] for d in self.cfg.datasets if not Path(d["path"]).is_dir()
]
trainer = trainer_cls(
dpo_trainer = trainer_cls(
*trainer_cls_args,
args=training_args,
train_dataset=self.train_dataset,
callbacks=self.get_callbacks(),
**trainer_kwargs,
**dpo_trainer_kwargs,
)
if self.cfg.fsdp:
ensure_dtype(trainer.model, dtype=self.cfg.torch_dtype)
if self.cfg.rl in [RLType.DPO, RLType.IPO] and trainer.ref_model:
ensure_dtype(trainer.ref_model, dtype=self.cfg.torch_dtype)
ensure_dtype(dpo_trainer.model, dtype=self.cfg.torch_dtype)
if self.cfg.rl in ["dpo", "ipo"] and dpo_trainer.ref_model:
ensure_dtype(dpo_trainer.ref_model, dtype=self.cfg.torch_dtype)
trainer = self.hook_post_create_trainer(trainer)
for callback in self.get_post_trainer_create_callbacks(trainer):
trainer.add_callback(callback)
dpo_trainer = self.hook_post_create_trainer(dpo_trainer)
for callback in self.get_post_trainer_create_callbacks(dpo_trainer):
dpo_trainer.add_callback(callback)
return trainer
return dpo_trainer
class HFPPOTrainerBuilder(TrainerBuilderBase):

View File

@@ -5,7 +5,7 @@
from .base import AxolotlTrainer
from .dpo.trainer import AxolotlDPOTrainer
from .grpo.trainer import AxolotlGRPOSequenceParallelTrainer, AxolotlGRPOTrainer
from .grpo.trainer import AxolotlGRPOTrainer
from .mamba import AxolotlMambaTrainer
from .relora import ReLoRATrainer
from .trl import (

View File

@@ -29,6 +29,7 @@ from axolotl.core.trainers.mixins import (
OptimizerMixin,
RngLoaderMixin,
SchedulerMixin,
SequenceParallelMixin,
)
from axolotl.core.trainers.utils import (
sanitize_kwargs_for_ds_tagging,
@@ -39,7 +40,9 @@ from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
LOG = logging.getLogger(__name__)
class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
class AxolotlTrainer(
SchedulerMixin, OptimizerMixin, RngLoaderMixin, SequenceParallelMixin, Trainer
):
"""Extend the base Trainer for axolotl helpers"""
args = None # type: "AxolotlTrainingArguments" # type: ignore[name-defined]
@@ -65,6 +68,10 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
if self.args.orpo_alpha:
self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
# Initialize sequence parallelism if enabled
if self.args.sequence_parallel_degree > 1:
self._setup_sequence_parallel()
def _wrap_model(self, model, training=True, dataloader=None):
if self.args.torch_compile:
torch._dynamo.config.accumulated_cache_size_limit = ( # pylint: disable=protected-access
@@ -107,16 +114,14 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
packing_efficiency_estimate=self.args.sample_packing_efficiency,
batch_max_len=batch_max_len,
batch_size=batch_size,
group_size=self.args.sample_packing_group_size,
bin_size=self.args.sample_packing_bin_size,
sequential=self.args.sample_packing_sequentially,
drop_last=True,
)
def _get_train_sampler(self) -> Sampler | None:
"""
Helper method to get the sampler for training. Handles cases for sample packing
and curriculum sampling (sequential).
Helper method to get the sampler for training. Handles cases for sequence
parallelism, sample packing, and curriculum sampling (sequential).
Returns:
If the dataset is non-empty, a sampler is returned, the type of which
@@ -125,7 +130,9 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
use_sample_packing = self.args.sample_packing and not self.args.pretraining
# Determine the base sampler first
if self.args.curriculum_sampling:
if self.args.sequence_parallel_degree > 1:
base_sampler = self._sp_get_train_sampler(self.train_dataset)
elif self.args.curriculum_sampling:
base_sampler = SequentialSampler(self.train_dataset)
elif use_sample_packing:
base_sampler = RandomSampler(self.train_dataset)
@@ -144,7 +151,8 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
def _get_eval_sampler(self, eval_dataset: Dataset | None = None) -> Sampler | None:
"""
Helper method to get the sampler for evaluation. Handles sample packing case.
Helper method to get the sampler for evaluation. Handles sequence parallelism
and sample packing cases.
Returns:
If the dataset is non-empty, a sampler is returned, the type of which
@@ -158,7 +166,9 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
)
# Determine the base sampler
if use_multipack:
if self.args.sequence_parallel_degree > 1:
base_sampler = self._sp_get_eval_sampler(eval_dataset)
elif use_multipack:
base_sampler = SequentialSampler(eval_dataset)
else:
return super()._get_eval_sampler(eval_dataset)
@@ -224,6 +234,14 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
):
self.accelerator.even_batches = False
# Return unprepared dataloader if using sequence parallelism
# TODO(djsaunde): We might be able to use `accelerate`'s dataloader preparation
# if we use `dispatch_batches` and `slice_fn_for_dispatch` properly (i.e.,
# slice each batch along the sequence dimension).
if self.args.sequence_parallel_degree > 1:
return dataloader
# Otherwise prepare with accelerator
return self.accelerator.prepare_data_loader(dataloader)
def get_train_dataloader(self) -> DataLoader:
@@ -267,7 +285,12 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
return dataloader
if self.args.sample_packing and self.args.eval_sample_packing is not False:
# Handle sample packing or sequence parallelism
if (
self.args.sample_packing
and self.args.eval_sample_packing is not False
or self.args.sequence_parallel_degree > 1
):
# Get appropriate data collator
self.data_collator = ( # pylint: disable=attribute-defined-outside-init
self.eval_data_collator
@@ -277,6 +300,17 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
if "length" in eval_dataset.column_names:
eval_dataset = eval_dataset.remove_columns(["length"])
# Handle dataset preprocessing for SP
if self.args.sequence_parallel_degree > 1:
if isinstance(eval_dataset, datasets.Dataset):
eval_dataset = self._remove_unused_columns(
eval_dataset, description="evaluation"
)
else:
self.data_collator = self._get_collator_with_removed_columns( # pylint: disable=attribute-defined-outside-init
self.data_collator, description="evaluation"
)
# Use eval_batch_size for sample packing, per_device_eval_batch_size otherwise
batch_size = (
self.args.eval_batch_size
@@ -337,13 +371,15 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
num_items_in_batch=num_items_in_batch,
)
return super().compute_loss(
loss = super().compute_loss(
model,
inputs,
return_outputs=return_outputs,
num_items_in_batch=num_items_in_batch,
)
return loss
@staticmethod
def orpo_concatenate_inputs(inputs, label_pad_token=-100, pad_token=0, device=None):
concatenated_batch = {}

View File

@@ -1,11 +1,14 @@
"""DPO Specific Strategy for training"""
"""
DPO Specific Strategy for training
"""
from axolotl.core.trainers.dpo.trainer import AxolotlDPOTrainer
from axolotl.utils.schemas.enums import RLType
class DPOStrategy:
"""Strategy for DPO training"""
"""
Strategy for DPO training
"""
@classmethod
def get_trainer_class(cls):
@@ -20,7 +23,7 @@ class DPOStrategy:
@classmethod
def set_training_args_kwargs(cls, cfg):
training_args_kwargs = {}
if cfg.rl is RLType.IPO:
if cfg.rl == "ipo":
training_args_kwargs["loss_type"] = "ipo"
training_args_kwargs["max_length"] = cfg.sequence_len
training_args_kwargs["max_completion_length"] = None

View File

@@ -1,15 +1,31 @@
"""DPO trainer for axolotl"""
"""
DPO trainer for axolotl
"""
import gc
import random
from functools import wraps
from typing import Any, Dict, Union
from typing import Any, Dict, Optional, Union
import pandas as pd
import torch
import wandb
from accelerate import PartialState
from datasets import Dataset, IterableDataset
from peft.optimizers import create_loraplus_optimizer
from torch import nn
from transformers import Trainer
from torch.utils.data import DataLoader
from transformers import (
BaseImageProcessor,
FeatureExtractionMixin,
PreTrainedTokenizerBase,
ProcessorMixin,
Trainer,
)
from transformers.trainer_utils import EvalLoopOutput
from transformers.utils import is_sagemaker_mp_enabled
from trl import DPOTrainer
from trl import DPOConfig, DPOTrainer, maybe_apply_chat_template, maybe_extract_prompt
from trl.trainer.utils import log_table_to_comet_experiment
from axolotl.core.trainers.mixins import RngLoaderMixin, SchedulerMixin
from axolotl.core.trainers.utils import (
@@ -22,7 +38,9 @@ if is_sagemaker_mp_enabled():
class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):
"""Extend the base DPOTrainer for axolotl helpers."""
"""
Extend the base DPOTrainer for axolotl helpers
"""
tag_names = ["axolotl", "dpo"]
@@ -67,9 +85,8 @@ class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):
@wraps(DPOTrainer.push_to_hub)
def push_to_hub(self, *args, **kwargs) -> str:
"""
Overwrite the `push_to_hub` method in order to force-add the tags when pushing
the model on the Hub. Please refer to `~transformers.Trainer.push_to_hub`
for more details.
Overwrite the `push_to_hub` method in order to force-add the tags when pushing the
model on the Hub. Please refer to `~transformers.Trainer.push_to_hub` for more details.
"""
kwargs = sanitize_kwargs_for_ds_tagging(
dataset_tags=self.dataset_tags, kwargs=kwargs
@@ -78,6 +95,64 @@ class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):
return super().push_to_hub(*args, **kwargs)
# TODO: remove this once https://github.com/huggingface/trl/pull/3377 is in a release
def _prepare_dataset(
self,
dataset: Union[Dataset, IterableDataset],
processing_class: Union[
PreTrainedTokenizerBase,
BaseImageProcessor,
FeatureExtractionMixin,
ProcessorMixin,
],
args: DPOConfig,
dataset_name: str,
) -> Union[Dataset, IterableDataset]:
# Build the kwargs for the `map` function
map_kwargs: Dict[str, Any] = {"writer_batch_size": 10}
if isinstance(dataset, Dataset): # IterableDataset does not support num_proc
map_kwargs["num_proc"] = args.dataset_num_proc
with PartialState().main_process_first():
# Extract prompt if needed
if isinstance(
dataset, Dataset
): # `IterableDataset.map` does not support `desc`
map_kwargs["desc"] = f"Extracting prompt in {dataset_name} dataset"
dataset = dataset.map(maybe_extract_prompt, **map_kwargs)
# Apply the chat template if needed
if isinstance(
dataset, Dataset
): # `IterableDataset.map` does not support `desc`
map_kwargs["desc"] = f"Applying chat template to {dataset_name} dataset"
dataset = dataset.map(
maybe_apply_chat_template,
fn_kwargs={"tokenizer": processing_class, "tools": args.tools},
**map_kwargs,
)
# Tokenize the dataset
if isinstance(
dataset, Dataset
): # `IterableDataset.map` does not support `desc`
map_kwargs["desc"] = f"Tokenizing {dataset_name} dataset"
dataset = dataset.map(
self.tokenize_row if not self.is_vision_model else self.process_row,
remove_columns=["chosen", "rejected"],
fn_kwargs={
"processing_class": processing_class,
"max_prompt_length": args.max_prompt_length,
"max_completion_length": args.max_completion_length,
# for enc-dec, we add the special tokens ([bos_token] + prompt + [eos_token]; completion + [eos_token])
"add_special_tokens": False,
},
**map_kwargs,
)
return dataset
@staticmethod
def tokenize_row(
features,
@@ -102,8 +177,12 @@ class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):
# dpo trainer may incorrectly prepend the bos_token_id to the dpo outputs
if res["chosen_input_ids"][0] == processing_class.bos_token_id:
res["chosen_input_ids"] = res["chosen_input_ids"][1:]
res["chosen_labels"] = res["chosen_labels"][1:]
res["chosen_attention_mask"] = res["chosen_attention_mask"][1:]
if res["rejected_input_ids"][0] == processing_class.bos_token_id:
res["rejected_input_ids"] = res["rejected_input_ids"][1:]
res["rejected_labels"] = res["rejected_labels"][1:]
res["rejected_attention_mask"] = res["rejected_attention_mask"][1:]
return res
@@ -117,3 +196,67 @@ class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):
gc.collect()
torch.cuda.empty_cache()
return loss
# TODO: remove this once https://github.com/huggingface/trl/pull/3377 is in a release
def evaluation_loop(
self,
dataloader: DataLoader,
description: str,
prediction_loss_only: Optional[bool] = None,
ignore_keys: Optional[list[str]] = None,
metric_key_prefix: str = "eval",
) -> EvalLoopOutput:
"""
Overriding built-in evaluation loop to store metrics for each batch.
Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`.
Works both with and without labels.
"""
# Sample and save to game log if requested (for one batch to save time)
if self.generate_during_eval:
# Generate random indices within the range of the total number of samples
num_samples = len(dataloader.dataset)
random_indices = random.sample(
range(num_samples), k=self.args.eval_batch_size
)
# Use dataloader.dataset.select to get the random batch without iterating over the DataLoader
random_batch_dataset = dataloader.dataset.select(random_indices)
random_batch = self.data_collator(random_batch_dataset)
random_batch = self._prepare_inputs(random_batch)
policy_output_decoded, ref_output_decoded = (
self.generate_from_model_and_ref(self.model, random_batch)
)
table = pd.DataFrame(
columns=["Prompt", "Policy", "Ref Model"],
data=[
[prompt, pol[len(prompt) :], ref[len(prompt) :]]
for prompt, pol, ref in zip(
random_batch_dataset["prompt"],
policy_output_decoded,
ref_output_decoded,
)
],
)
if "wandb" in self.args.report_to and self.accelerator.is_main_process:
wandb.log({"game_log": wandb.Table(data=table)})
if "comet_ml" in self.args.report_to:
log_table_to_comet_experiment(
name="game_log.csv",
table=table,
)
# Base evaluation
initial_output = super().evaluation_loop(
dataloader,
description,
prediction_loss_only,
ignore_keys,
metric_key_prefix,
)
return initial_output

View File

@@ -1,41 +1,37 @@
"""GRPO Specific Strategy for training"""
"""
GRPO Specific Strategy for training
"""
import importlib
import inspect
import logging
from typing import Any
from trl.trainer.grpo_trainer import RewardFunc
from axolotl.core.trainers.grpo.args import AxolotlGRPOConfig
from axolotl.core.trainers.grpo.trainer import (
AxolotlGRPOSequenceParallelTrainer,
AxolotlGRPOTrainer,
)
from axolotl.utils.dict import DictDefault
from axolotl.core.trainers.grpo.trainer import AxolotlGRPOTrainer
from axolotl.utils.schemas.trl import TRLConfig
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("axolotl")
class GRPOStrategy:
"""Strategy for GRPO training"""
"""
Strategy for GRPO training
"""
@classmethod
def get_trainer_class(
cls, sequence_parallel: bool
) -> type[AxolotlGRPOTrainer] | type[AxolotlGRPOSequenceParallelTrainer]:
if sequence_parallel:
return AxolotlGRPOSequenceParallelTrainer
def get_trainer_class(cls):
return AxolotlGRPOTrainer
@classmethod
def get_training_args_class(cls) -> type[AxolotlGRPOConfig]:
def get_training_args_class(cls):
from axolotl.core.trainers.grpo.args import AxolotlGRPOConfig
return AxolotlGRPOConfig
@classmethod
def set_training_args_kwargs(cls, cfg: DictDefault) -> dict[str, Any]:
grpo_args_kwargs: dict[str, Any] = {}
def set_training_args_kwargs(cls, cfg):
grpo_args_kwargs = {}
if not hasattr(cfg, "trl") or not cfg.trl:
return grpo_args_kwargs
@@ -44,8 +40,8 @@ class GRPOStrategy:
if trl.use_vllm:
grpo_args_kwargs["use_vllm"] = trl.use_vllm
grpo_args_kwargs["vllm_server_host"] = trl.vllm_server_host or trl.vllm.host # type: ignore[attr-defined]
grpo_args_kwargs["vllm_server_port"] = trl.vllm_server_port or trl.vllm.port # type: ignore[attr-defined]
grpo_args_kwargs["vllm_server_host"] = trl.vllm_server_host or trl.vllm.host
grpo_args_kwargs["vllm_server_port"] = trl.vllm_server_port or trl.vllm.port
if trl.vllm_server_timeout:
grpo_args_kwargs["vllm_server_timeout"] = trl.vllm_server_timeout
if trl.vllm_guided_decoding_regex:
@@ -106,18 +102,17 @@ class GRPOStrategy:
return grpo_args_kwargs
@classmethod
def set_trainer_args(cls, cfg: DictDefault) -> list[Any]:
def set_trainer_args(cls, cfg):
trainer_args = []
if cfg.trl and cfg.trl.reward_funcs:
reward_funcs = []
for reward_func_fqn in cfg.trl.reward_funcs:
reward_funcs.append(cls.get_reward_func(reward_func_fqn))
trainer_args.append(reward_funcs)
return trainer_args
@classmethod
def set_trainer_kwargs(cls, cfg: DictDefault) -> dict[str, Any]:
def set_trainer_kwargs(cls, cfg):
trainer_kwargs = {}
if cfg.trl and cfg.trl.reward_processing_classes:
trainer_kwargs["reward_processing_classes"] = (
@@ -131,7 +126,7 @@ class GRPOStrategy:
return None
@classmethod
def get_blocklist_args_kwargs(cls) -> list[str]:
def get_blocklist_args_kwargs(cls):
return ["dataset_num_proc"]
@classmethod
@@ -142,13 +137,13 @@ class GRPOStrategy:
Args:
reward_func_fqn (str): Fully qualified name of the reward function (e.g. r1_grpo.gsm8k_transform),
or a HF hub path to the reward model.
Raises:
ValueError: If the reward function does not accept at least two arguments.
Returns:
RewardFunc: A callable that accepts prompts and completions and returns rewards,
or a path to a reward model.
Raises:
ValueError: If the reward function does not accept at least two arguments.
"""
try:
# use importlib to dynamically load the reward function from the module
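For context, a reward function referenced by its fully qualified name is expected to accept at least prompts and completions and return one reward per completion. A minimal hypothetical example (module name and scoring logic invented for illustration, assuming plain-text completions) might look like:
# my_rewards.py -- hypothetical module; referenced in the config as
#   trl.reward_funcs: ["my_rewards.length_penalty"]
def length_penalty(prompts, completions, **kwargs):
    """Return one float reward per completion; shorter answers score higher."""
    return [-float(len(completion)) for completion in completions]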

View File

@@ -11,4 +11,6 @@ from axolotl.core.training_args import AxolotlTrainingMixins
@dataclass
class AxolotlGRPOConfig(AxolotlTrainingMixins, GRPOConfig):
"""Axolotl GRPO Config for GRPO training"""
"""
Axolotl GRPO Config for GRPO training
"""

View File

@@ -1,172 +0,0 @@
"""Repeat random sampler (similar to the one implemented in
https://github.com/huggingface/trl/blob/main/trl/trainer/grpo_trainer.py) that adds
sequence parallelism functionality; i.e., duplicating data across ranks in the same
sequence parallel group.
"""
from typing import Iterator, Sized
import torch
from torch.utils.data import Sampler
class SequenceParallelRepeatRandomSampler(Sampler):
"""Sampler for GRPO training with sequence parallelism.
This sampler ensures:
- Ranks in the same sequence parallel (SP) group receive identical data.
- Each index is repeated multiple times for sampling different completions.
- Entire batches are repeated for reuse in multiple updates.
- Data is properly distributed across SP groups.
In the table below, the values represent dataset indices. Each SP group has
`sequence_parallel_degree = 2` GPUs working together on the same data. There are 2
SP groups (SP0 and SP1), with `world_size = 4` total GPUs.
Sequence Parallel Groups
| SP0 | SP1 |
| GPU 0 | GPU 1 | GPU 2 | GPU 3 |
global_step step <---> mini_repeat_count=3
<----------> batch_size=2 per SP group
grad_accum=2 ▲ ▲ 0 0 [0 0 0 1 1 1] [2 2 2 3 3 3] <- SP groups get different data
▼ | 0 1 [0 0 0 1 1 1] [2 2 2 3 3 3] <- Same data for each SP group GPU
|
| 1 2 [0 0 0 1 1 1] [2 2 2 3 3 3] <- Repeat same indices for iterations
num_iterations=2 ▼ 1 3 [0 0 0 1 1 1] [2 2 2 3 3 3] <- When using gradient accumulation
2 4 [4 4 4 5 5 5] [6 6 6 7 7 7] <- New batch of data indices
2 5 [4 4 4 5 5 5] [6 6 6 7 7 7]
...
Args:
dataset: Dataset to sample from.
mini_repeat_count: How many times to repeat each sample immediately.
world_size: Total number of processes.
rank: Rank of current process.
batch_size: Number of samples per batch.
repeat_count: How many times to repeat the full sampling process.
sequence_parallel_degree: Number of ranks in a sequence parallel group.
shuffle: Whether to shuffle the dataset.
seed: Random seed for shuffling.
drop_last: Whether to drop the last incomplete batch.
"""
def __init__(
self,
dataset: Sized,
mini_repeat_count: int,
world_size: int,
rank: int,
batch_size: int = 1,
repeat_count: int = 1,
sequence_parallel_degree: int = 1,
shuffle: bool = True,
seed: int = 0,
drop_last: bool = False,
):
self.dataset = dataset
self.mini_repeat_count = mini_repeat_count
self.batch_size = batch_size
self.repeat_count = repeat_count
self.shuffle = shuffle
self.seed = seed
self.drop_last = drop_last
self.epoch = 0
self.world_size = world_size
self.rank = rank
# Sequence parallelism parameters
self.sequence_parallel_degree = sequence_parallel_degree
self.num_sp_groups = world_size // sequence_parallel_degree
self.sp_group_id = rank // sequence_parallel_degree
# Adjust dataset size for distributed sampling
self.num_samples = len(self.dataset)
self.total_size = self.num_samples
# Calculate effective number of samples per SP group
if (
self.drop_last
and self.total_size % (self.num_sp_groups * self.batch_size) != 0
):
# Drop last incomplete batch if drop_last is True
self.num_samples_per_sp_group = (
self.total_size // self.batch_size // self.num_sp_groups
) * self.batch_size
else:
# Round up to include last batch if drop_last is False
self.num_samples_per_sp_group = (
(self.total_size + self.batch_size * self.num_sp_groups - 1)
// (self.batch_size * self.num_sp_groups)
* self.batch_size
)
if shuffle:
self.generator = torch.Generator()
self.generator.manual_seed(seed)
def __iter__(self) -> Iterator[int]:
"""Creates iterator over dataset indices.
Returns:
Iterator that yields indices into the dataset.
"""
# Deterministically shuffle based on epoch and seed
if self.shuffle:
indices = torch.randperm(
self.num_samples, generator=self.generator
).tolist()
else:
indices = list(range(self.num_samples))
# Add extra samples to make it evenly divisible by batch_size
if len(indices) % self.batch_size != 0:
padding = indices[: self.batch_size - len(indices) % self.batch_size]
indices += padding
# Subsample based on SP group ID
# Each SP group gets distinct batches of data
batch_indices = []
for i in range(0, len(indices), self.batch_size * self.num_sp_groups):
start_idx = i + self.sp_group_id * self.batch_size
end_idx = min(start_idx + self.batch_size, len(indices))
if start_idx < len(indices):
for j in range(self.batch_size):
if start_idx + j < end_idx:
batch_indices.append(indices[start_idx + j])
# Make sure batch_indices is exactly batch_size * num_batches_per_sp_group
if self.drop_last:
num_batches_per_sp_group = self.num_samples_per_sp_group // self.batch_size
target_len = self.batch_size * num_batches_per_sp_group
if len(batch_indices) > target_len:
batch_indices = batch_indices[:target_len]
# Apply the GRPO repeat pattern
final_indices = []
for _ in range(self.repeat_count):
for idx in batch_indices:
for _ in range(self.mini_repeat_count):
final_indices.append(idx)
return iter(final_indices)
def __len__(self) -> int:
"""Returns the total length of the iterable including repetitions.
Returns:
Total number of samples.
"""
# Total length including all repetitions
return (
self.num_samples_per_sp_group * self.mini_repeat_count * self.repeat_count
)
def set_epoch(self, epoch: int) -> None:
"""Sets the epoch for this sampler.
Args:
epoch: Epoch number to use for shuffling.
"""
self.epoch = epoch
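As a sanity check of the pattern described in the docstring above, a small sketch (not part of the source) of the indices one SP group would yield, assuming mini_repeat_count=3, batch_size=2, and repeat_count=2:
# Hypothetical reproduction of the repeat pattern for SP group 0 only.
indices = [0, 1]                     # this group's batch of dataset indices
mini_repeat_count, repeat_count = 3, 2
final = []
for _ in range(repeat_count):        # reuse the whole batch across iterations / grad accum steps
    for idx in indices:
        final.extend([idx] * mini_repeat_count)  # one copy per generation
print(final)                         # [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]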

View File

@@ -1,653 +1,69 @@
"""Axolotl GRPO trainers (with and without sequence parallelism handling)"""
"""
Axolotl GRPO trainer
"""
# pylint: disable=too-many-lines,duplicate-code,protected-access,no-member
from contextlib import nullcontext
import warnings
from typing import Any
import datasets
import torch
import torch.distributed as dist
import torch.utils.data
from accelerate.utils import (
broadcast_object_list,
gather,
gather_object,
is_peft_available,
)
from datasets import Dataset, IterableDataset
from torch import nn
from torch.utils.data import (
BatchSampler,
DataLoader,
Sampler,
)
from transformers import (
PreTrainedModel,
PreTrainedTokenizerBase,
Trainer,
TrainerCallback,
)
from transformers.trainer_utils import seed_worker
from accelerate.utils import is_deepspeed_available, is_peft_model
from trl import GRPOTrainer
from trl.data_utils import (
apply_chat_template,
is_conversational,
maybe_apply_chat_template,
)
from trl.extras.profiling import profiling_context
from trl.models import unwrap_model_for_generation
from trl.trainer.grpo_config import GRPOConfig
from trl.trainer.grpo_trainer import RewardFunc, nanstd
from trl.trainer.utils import pad
from trl.extras.profiling import profiling_decorator
from axolotl.core.trainers.grpo.sampler import SequenceParallelRepeatRandomSampler
from axolotl.core.trainers.mixins import RngLoaderMixin, SchedulerMixin
from axolotl.monkeypatch.ring_attn import get_ring_attn_group
if is_peft_available():
# pylint: disable=unused-import
from peft import PeftConfig
if is_deepspeed_available():
import deepspeed
class AxolotlGRPOTrainer(RngLoaderMixin, SchedulerMixin, GRPOTrainer):
"""Extend the base GRPOTrainer for axolotl helpers"""
"""
Extend the base GRPOTrainer for axolotl helpers
"""
_tag_names = ["trl", "grpo", "axolotl"]
class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
"""Extend the base GRPOTrainer for sequence parallelism handling"""
def __init__(
self,
model: str | PreTrainedModel,
reward_funcs: RewardFunc | list[RewardFunc],
args: GRPOConfig | None = None,
train_dataset: Dataset | IterableDataset | None = None,
eval_dataset: (
Dataset | IterableDataset | dict[str, Dataset | IterableDataset] | None
) = None,
processing_class: PreTrainedTokenizerBase | None = None,
reward_processing_classes: (
PreTrainedTokenizerBase | list[PreTrainedTokenizerBase] | None
) = None,
callbacks: list[TrainerCallback] | None = None,
optimizers: tuple[
torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None
] = (None, None),
peft_config: "PeftConfig | None" = None,
):
# First call the superclass constructor with all arguments
super().__init__(
model=model,
reward_funcs=reward_funcs,
args=args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
processing_class=processing_class,
reward_processing_classes=reward_processing_classes,
callbacks=callbacks,
optimizers=optimizers,
peft_config=peft_config,
@profiling_decorator
def _move_model_to_vllm(self):
# For DeepSpeed ZeRO-3, we need to gather all parameters before operations
deepspeed_plugin = self.accelerator.state.deepspeed_plugin
zero_stage_3 = deepspeed_plugin is not None and deepspeed_plugin.zero_stage == 3
gather_if_zero3 = (
deepspeed.zero.GatheredParameters if zero_stage_3 else nullcontext
)
# Get number of SP groups (number of processes divided by SP degree)
num_processes = self.accelerator.num_processes
num_sp_groups = num_processes // self.args.sequence_parallel_degree
if is_peft_model(self.model):
# With PEFT and DeepSpeed ZeRO Stage 3, we must gather the full model at once before merging, as merging
# adapters in a sharded manner is not supported.
with gather_if_zero3(list(self.model.parameters())):
self.model.merge_adapter()
# Calculate batch size per SP group (not per process)
sp_group_batch_size = self.args.per_device_train_batch_size * num_sp_groups
possible_values = [
n_gen
for n_gen in range(2, sp_group_batch_size + 1)
if (sp_group_batch_size) % n_gen == 0
]
# Update vLLM weights while parameters are gathered
for name, param in self.model.named_parameters():
# When using PEFT, we need to recover the original parameter name and discard some parameters
name = (
name.removeprefix("base_model.model.")
.removeprefix("base_model.model.")
.replace(".base_layer", "")
)
if self.model.prefix in name:
continue
# When module to save, remove its prefix and discard the original module
if "original_module" in name:
continue
name = name.replace("modules_to_save.default.", "")
if self.num_generations not in possible_values:
raise ValueError(
f"The batch size per SP group ({num_sp_groups} x "
f"{self.args.per_device_train_batch_size}) must be evenly divisible by "
f"the number of generations per prompt ({self.num_generations}). Given "
"the current configuration, the valid values for the number of "
f"generations are: {possible_values}."
)
if self.accelerator.is_main_process:
self.vllm_client.update_named_param(name, param.data)
if self.args.eval_strategy != "no":
# If sequence parallelism is enabled, calculate batch size per SP group
sp_group_eval_batch_size = args.per_device_eval_batch_size * num_sp_groups # type: ignore[union-attr]
possible_values = [
n_gen
for n_gen in range(2, sp_group_eval_batch_size + 1)
if (sp_group_eval_batch_size) % n_gen == 0
]
if self.num_generations not in possible_values:
raise ValueError(
f"With sequence parallelism (degree {self.args.sequence_parallel_degree}), "
f"the eval batch size per SP group ({num_sp_groups} x {self.args.per_device_eval_batch_size}) "
f"must be evenly divisible by the number of generations per prompt "
f"({self.num_generations}). Given the current eval batch size, "
f"the valid values for the number of generations are: {possible_values}."
)
# Initialize the SP group
self.sp_group = get_ring_attn_group()
self.rank = dist.get_rank()
self.world_size = dist.get_world_size()
self.local_rank = dist.get_rank(group=self.sp_group)
self.local_world_size = dist.get_world_size(group=self.sp_group)
def _get_train_sampler(self) -> Sampler:
effective_batch_size = (
self.args.per_device_train_batch_size
* self.world_size
* self.args.gradient_accumulation_steps
)
return SequenceParallelRepeatRandomSampler(
dataset=self.train_dataset,
mini_repeat_count=self.num_generations,
world_size=self.world_size,
rank=self.rank,
batch_size=effective_batch_size
// self.num_generations
// self.args.sequence_parallel_degree,
repeat_count=self.num_iterations * self.args.gradient_accumulation_steps,
sequence_parallel_degree=self.args.sequence_parallel_degree,
shuffle=True,
seed=self.args.seed,
drop_last=True,
)
def _create_dataloader_params(self, is_eval=False, custom_batch_size=None):
"""Create common dataloader parameters for train or eval."""
batch_size = custom_batch_size or (
self.args.eval_batch_size if is_eval else self._train_batch_size
)
params = {
"batch_size": batch_size,
"collate_fn": self.data_collator,
"num_workers": self.args.dataloader_num_workers,
"pin_memory": self.args.dataloader_pin_memory,
}
# Add persistent workers only for training
if not is_eval and hasattr(self.args, "dataloader_persistent_workers"):
params["persistent_workers"] = self.args.dataloader_persistent_workers
# Add prefetch factor if specified
if self.args.dataloader_prefetch_factor:
params["prefetch_factor"] = self.args.dataloader_prefetch_factor
return params
def _prepare_dataloader(
self, dataset, sampler, is_eval=False, custom_batch_size=None
):
"""Prepare a dataloader with the given dataset and sampler."""
# Get base parameters
dataloader_params = self._create_dataloader_params(is_eval, custom_batch_size)
# Add sampler configuration
if not isinstance(dataset, torch.utils.data.IterableDataset):
if isinstance(sampler, BatchSampler):
# batch_size and batch_sampler are mutually exclusive
dataloader_params["batch_sampler"] = sampler
del dataloader_params["batch_size"]
else:
dataloader_params["sampler"] = sampler
dataloader_params["drop_last"] = self.args.dataloader_drop_last
if not is_eval:
dataloader_params["worker_init_fn"] = seed_worker
# Create the dataloader
dataloader = DataLoader(dataset, **dataloader_params)
if self.args.sample_packing and (
(not is_eval and not self.args.pretraining)
or (is_eval and self.args.eval_sample_packing is not False)
):
self.accelerator.even_batches = False
# Return unprepared dataloader if using sequence parallelism
# TODO(djsaunde): We might be able to use `accelerate`'s dataloader preparation
# if we use `dispatch_batches` and `slice_fn_for_dispatch` properly (i.e.,
# slice each batch along the sequence dimension).
if self.args.sequence_parallel_degree > 1:
return dataloader
# Otherwise prepare with accelerator
return self.accelerator.prepare_data_loader(dataloader)
def get_train_dataloader(self) -> DataLoader:
"""Get dataloader for training"""
train_dataset = self.train_dataset
# pylint: disable=access-member-before-definition
data_collator = self.data_collator # type: ignore
# Handle dataset preprocessing
if isinstance(train_dataset, datasets.Dataset):
# Add debug print before any modifications
if self.args.sample_packing and not self.args.pretraining:
train_dataset = train_dataset.remove_columns(["length"])
if not self.args.sample_packing or self.args.pretraining:
train_dataset = self._remove_unused_columns(
train_dataset, description="training"
)
# Unmerge adapters while parameters are still gathered
self.model.unmerge_adapter()
# Parameters will automatically be repartitioned when exiting the context
else:
self.data_collator = self._get_collator_with_removed_columns( # pylint: disable=attribute-defined-outside-init
data_collator,
description="training",
)
# For non-PEFT models, simply gather and update each parameter individually.
for name, param in self.model.named_parameters():
with gather_if_zero3([param]):
if self.accelerator.is_main_process:
self.vllm_client.update_named_param(name, param.data)
# Get sampler and create dataloader
sampler = self._get_train_sampler()
dataloader = self._prepare_dataloader(train_dataset, sampler, is_eval=False)
return dataloader
def _generate_and_score_completions(
self, inputs: list[dict[str, torch.Tensor | Any]]
) -> dict[str, torch.Tensor | Any]:
device = self.accelerator.device
mode = "eval" if self.control.should_evaluate else "train"
prompts = [x["prompt"] for x in inputs]
prompts_text = [
maybe_apply_chat_template(example, self.processing_class)["prompt"]
for example in inputs
]
prompt_inputs = self.processing_class(
text=prompts_text,
return_tensors="pt",
padding=True,
padding_side="left",
add_special_tokens=False,
)
prompt_inputs = Trainer._prepare_inputs(self, prompt_inputs)
prompt_ids, prompt_mask = (
prompt_inputs["input_ids"],
prompt_inputs["attention_mask"],
)
if self.max_prompt_length is not None:
prompt_ids = prompt_ids[:, -self.max_prompt_length :]
prompt_mask = prompt_mask[:, -self.max_prompt_length :]
# Generate completions using either vLLM or regular generation
if self.args.use_vllm:
# First, have main process load weights if needed
# pylint: disable=access-member-before-definition
if self.state.global_step != self._last_loaded_step: # type: ignore[has-type]
self._move_model_to_vllm()
# pylint: disable=attribute-defined-outside-init
self._last_loaded_step = self.state.global_step
# Generate completions using vLLM: gather all prompts and use them in a single call in the main process
all_prompts_text = gather_object(prompts_text)
if self.accelerator.is_main_process:
if self.args.sequence_parallel_degree > 1:
# Calculate sequence parallel group information
world_size = self.accelerator.num_processes
sequence_parallel_degree = self.args.sequence_parallel_degree
num_sp_groups = world_size // sequence_parallel_degree
# Since processes in the same SP group have the same prompts, we need to ensure
# we only take one copy of each prompt from each SP group
ordered_set_of_prompts = []
for sp_group_id in range(num_sp_groups):
# Get the first process from each SP group (typically the group leader)
group_leader_rank = sp_group_id * sequence_parallel_degree
# Extract prompts from this SP group, accounting for num_generations duplicates
# We only need prompts from one rank in each SP group
group_prompts = all_prompts_text[
group_leader_rank
* len(prompts_text) : (group_leader_rank + 1)
* len(prompts_text) : self.num_generations
]
ordered_set_of_prompts.extend(group_prompts)
else:
# Since 'prompts' contains 'num_generations' duplicates, we first take unique prompts, and generate
# num_generations outputs for each one. This is faster than generating outputs for each duplicate
# prompt individually.
ordered_set_of_prompts = all_prompts_text[
:: self.num_generations * self.args.sequence_parallel_degree
]
with profiling_context(self, "vLLM.generate"):
completion_ids = self.vllm_client.generate(
prompts=ordered_set_of_prompts,
n=self.num_generations,
repetition_penalty=self.repetition_penalty,
temperature=self.temperature,
top_p=self.top_p,
top_k=-1 if self.top_k is None else self.top_k,
min_p=0.0 if self.min_p is None else self.min_p,
max_tokens=self.max_completion_length,
guided_decoding_regex=self.guided_decoding_regex,
)
else:
completion_ids = [None] * (
len(all_prompts_text) // self.args.sequence_parallel_degree
)
# Broadcast the completions from the main process to all processes
completion_ids = broadcast_object_list(completion_ids, from_process=0)
# Determine the appropriate slice based on sequence parallelism
if self.args.sequence_parallel_degree > 1:
# Calculate SP group ID (which group of ranks this rank belongs to)
sp_group_id = self.accelerator.process_index // self.local_world_size
# Calculate the start index for this SP group
sp_group_start = sp_group_id * len(prompts) * self.local_world_size
# All ranks in the same SP group get the same data slice
process_slice = slice(
sp_group_start,
sp_group_start + len(prompts),
)
completion_ids = completion_ids[process_slice]
else:
# Original behavior for non-sequence parallel case
process_slice = slice(
self.accelerator.process_index * len(prompts),
(self.accelerator.process_index + 1) * len(prompts),
)
completion_ids = completion_ids[process_slice]
# Pad the completions, and concatenate them with the prompts
completion_ids = [
torch.tensor(ids, device=device) for ids in completion_ids
]
completion_ids = pad(
completion_ids, padding_value=self.processing_class.pad_token_id
)
prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1)
else:
# Regular generation path
with unwrap_model_for_generation(
self.model_wrapped,
self.accelerator,
gather_deepspeed3_params=self.args.ds3_gather_for_generation,
) as unwrapped_model:
prompt_completion_ids = unwrapped_model.generate(
prompt_ids,
attention_mask=prompt_mask,
generation_config=self.generation_config,
)
# Compute prompt length and extract completion ids
prompt_length = prompt_ids.size(1)
prompt_ids = prompt_completion_ids[:, :prompt_length]
completion_ids = prompt_completion_ids[:, prompt_length:]
# Mask everything after the first EOS token
is_eos = completion_ids == self.processing_class.eos_token_id
eos_idx = torch.full(
(is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device
)
eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)]
sequence_indices = torch.arange(is_eos.size(1), device=device).expand(
is_eos.size(0), -1
)
completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int()
# If mask_truncated_completions is enabled, zero out truncated completions in completion_mask
if self.args.mask_truncated_completions:
truncated_completions = ~is_eos.any(dim=1)
completion_mask = (
completion_mask * (~truncated_completions).unsqueeze(1).int()
)
# Concatenate prompt_mask with completion_mask for logit computation
attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) # (B, P+C)
logits_to_keep = completion_ids.size(
1
) # we only need to compute the logits for the completion tokens
batch_size = (
self.args.per_device_train_batch_size
if mode == "train"
else self.args.per_device_eval_batch_size
)
with torch.no_grad():
# When using num_iterations == 1, old_per_token_logps == per_token_logps, so we can skip its
# computation here, and use per_token_logps.detach() instead.
if self.num_iterations > 1:
old_per_token_logps = self._get_per_token_logps(
self.model,
prompt_completion_ids,
attention_mask,
logits_to_keep,
batch_size,
)
else:
old_per_token_logps = None
if self.beta == 0.0:
ref_per_token_logps = None
elif self.ref_model is not None:
ref_per_token_logps = self._get_per_token_logps(
self.ref_model,
prompt_completion_ids,
attention_mask,
logits_to_keep,
batch_size,
)
else:
with self.accelerator.unwrap_model(self.model).disable_adapter():
ref_per_token_logps = self._get_per_token_logps(
self.model,
prompt_completion_ids,
attention_mask,
logits_to_keep,
batch_size,
)
# Decode the generated completions
completions_text = self.processing_class.batch_decode(
completion_ids, skip_special_tokens=True
)
if is_conversational(inputs[0]):
completions = []
for prompt, completion in zip(prompts, completions_text):
bootstrap = (
prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
)
completions.append(
[{"role": "assistant", "content": bootstrap + completion}]
)
else:
completions = completions_text
rewards_per_func = torch.zeros(
len(prompts), len(self.reward_funcs), device=device
)
for i, (reward_func, reward_processing_class, reward_func_name) in enumerate(
zip(
self.reward_funcs,
self.reward_processing_classes,
self.reward_func_names,
)
):
with profiling_context(self, reward_func_name):
if isinstance(
reward_func, nn.Module
): # Module instead of PretrainedModel for compat with compiled models
if is_conversational(inputs[0]):
messages = [
{"messages": p + c} for p, c in zip(prompts, completions)
]
texts = [
apply_chat_template(x, reward_processing_class)["text"]
for x in messages
]
else:
texts = [p + c for p, c in zip(prompts, completions)]
reward_inputs = reward_processing_class(
text=texts,
return_tensors="pt",
padding=True,
padding_side="right",
add_special_tokens=False,
)
reward_inputs = Trainer._prepare_inputs(self, reward_inputs)
with torch.inference_mode():
rewards_per_func[:, i] = reward_func(**reward_inputs).logits[
:, 0
] # Shape (B*G,)
else:
# Repeat all input columns (except "prompt" and "completion") to match the number of generations
keys = [
key for key in inputs[0] if key not in ["prompt", "completion"]
]
reward_kwargs = {
key: [example[key] for example in inputs] for key in keys
}
output_reward_func = reward_func(
prompts=prompts, completions=completions, **reward_kwargs
)
# Convert None values to NaN
output_reward_func = [
reward if reward is not None else torch.nan
for reward in output_reward_func
]
rewards_per_func[:, i] = torch.tensor(
output_reward_func, dtype=torch.float32, device=device
)
# If all reward functions return None for a given row, issue a detailed warning
if torch.isnan(rewards_per_func).all(dim=1).any():
nan_row_idx = (
torch.isnan(rewards_per_func).all(dim=1).nonzero(as_tuple=True)[0][0]
)
row_reward_kwargs = {
key: value[nan_row_idx] for key, value in reward_kwargs.items()
}
row_reward_kwargs["prompt"] = prompts[nan_row_idx]
row_reward_kwargs["completion"] = completions[nan_row_idx]
warnings.warn(
f"All reward functions returned None for the following kwargs: {row_reward_kwargs}. "
"Please ensure that at least one reward function returns a valid reward."
)
# Gather the reward per function: this part is crucial, because the rewards are normalized per group and the
# completions may be distributed across processes
rewards_per_func = gather(rewards_per_func)
# Apply weights to each reward function's output and sum
rewards = (
rewards_per_func * self.reward_weights.to(device).unsqueeze(0)
).nansum(dim=1)
# Compute grouped-wise rewards
mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1)
std_grouped_rewards = rewards.view(-1, self.num_generations).std(dim=1)
# Normalize the rewards to compute the advantages
mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(
self.num_generations, dim=0
)
std_grouped_rewards = std_grouped_rewards.repeat_interleave(
self.num_generations, dim=0
)
advantages = rewards - mean_grouped_rewards
if self.args.scale_rewards:
advantages = advantages / (std_grouped_rewards + 1e-4)
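A small worked example of this per-group normalization, with illustrative reward values and num_generations=2 (division by the group std only applies when reward scaling is enabled, as above):
import torch

num_generations = 2                                   # assumed for illustration
rewards = torch.tensor([1.0, 3.0, 0.0, 4.0])          # two prompts x two completions
mean_g = rewards.view(-1, num_generations).mean(dim=1)     # [2.0, 2.0]
std_g = rewards.view(-1, num_generations).std(dim=1)       # [~1.414, ~2.828]
mean_g = mean_g.repeat_interleave(num_generations)
std_g = std_g.repeat_interleave(num_generations)
advantages = (rewards - mean_g) / (std_g + 1e-4)
# Each completion is scored relative to the other generations for the same prompt.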
# Slice to keep only the local part of the data
if self.args.sequence_parallel_degree > 1:
# Calculate SP group ID (which group of ranks this rank belongs to)
sp_group_id = self.accelerator.process_index // self.local_world_size
# Calculate the start index for this SP group
sp_group_start = sp_group_id * len(prompts) * self.local_world_size
# All ranks in the same SP group get the same data slice
process_slice = slice(
sp_group_start,
sp_group_start + len(prompts),
)
else:
# Original behavior for non-sequence parallel case
process_slice = slice(
self.accelerator.process_index * len(prompts),
(self.accelerator.process_index + 1) * len(prompts),
)
advantages = advantages[process_slice]
# Log the metrics
if mode == "train":
self._total_train_tokens += (
self.accelerator.gather_for_metrics(attention_mask.sum()).sum().item()
)
self._metrics[mode]["num_tokens"] = [self._total_train_tokens]
# log completion lengths, mean, min, max
agg_completion_mask = self.accelerator.gather_for_metrics(
completion_mask.sum(1)
)
self._metrics[mode]["completions/mean_length"].append(
agg_completion_mask.float().mean().item()
)
self._metrics[mode]["completions/min_length"].append(
agg_completion_mask.float().min().item()
)
self._metrics[mode]["completions/max_length"].append(
agg_completion_mask.float().max().item()
)
# identify sequences that terminated with EOS and log their lengths
agg_terminated_with_eos = self.accelerator.gather_for_metrics(is_eos.any(dim=1))
term_completion_mask = agg_completion_mask[agg_terminated_with_eos]
clipped_completions_ratio = 1 - len(term_completion_mask) / len(
agg_completion_mask
)
self._metrics[mode]["completions/clipped_ratio"].append(
clipped_completions_ratio
)
if len(term_completion_mask) == 0:
# edge case where no completed sequences are found
term_completion_mask = torch.zeros(1, device=device)
self._metrics[mode]["completions/mean_terminated_length"].append(
term_completion_mask.float().mean().item()
)
self._metrics[mode]["completions/min_terminated_length"].append(
term_completion_mask.float().min().item()
)
self._metrics[mode]["completions/max_terminated_length"].append(
term_completion_mask.float().max().item()
)
# Calculate mean reward per function, but only for samples where the function was applied (non-NaN values)
for i, reward_func_name in enumerate(self.reward_func_names):
mean_rewards = torch.nanmean(rewards_per_func[:, i]).item()
self._metrics[mode][f"rewards/{reward_func_name}/mean"].append(mean_rewards)
std_rewards = nanstd(rewards_per_func[:, i]).item()
self._metrics[mode][f"rewards/{reward_func_name}/std"].append(std_rewards)
self._metrics[mode]["reward"].append(mean_grouped_rewards.mean().item())
self._metrics[mode]["reward_std"].append(std_grouped_rewards.mean().item())
# Log prompt and completion texts
self._textual_logs["prompt"].extend(gather_object(prompts_text))
self._textual_logs["completion"].extend(gather_object(completions_text))
for i, name in enumerate(self.reward_func_names):
self._textual_logs["rewards"][name].extend(rewards_per_func[:, i].tolist())
return {
"prompt_ids": prompt_ids,
"prompt_mask": prompt_mask,
"completion_ids": completion_ids,
"completion_mask": completion_mask,
"advantages": advantages,
"old_per_token_logps": old_per_token_logps,
"ref_per_token_logps": ref_per_token_logps,
}
# Reset cache on main process
if self.accelerator.is_main_process:
self.vllm_client.reset_prefix_cache()

View File

@@ -6,3 +6,4 @@
from .optimizer import OptimizerMixin
from .rng_state_loader import RngLoaderMixin
from .scheduler import SchedulerMixin
from .sequence_parallel import SequenceParallelContextManager, SequenceParallelMixin

View File

@@ -0,0 +1,313 @@
"""
Module for Axolotl trainer sequence parallelism mixin and training context manager
"""
import functools
import logging
import torch
import torch.distributed as dist
from datasets import Dataset
from torch import nn
from torch.utils.data import DistributedSampler, Sampler
from torch.utils.hooks import RemovableHandle
from axolotl.monkeypatch.attention.ring_attn import (
RingAttnFunc,
get_ring_attn_group,
update_ring_attn_params,
)
LOG = logging.getLogger(__name__)
def apply_sequence_parallelism(
batch: dict[str, torch.Tensor],
local_rank: int,
local_world_size: int,
ring_attn_func: RingAttnFunc,
) -> dict[str, torch.Tensor]:
"""
Apply sequence parallelism slicing to a batch.
Args:
batch: Batch dictionary (e.g., input_ids, attention_mask, etc.)
local_rank: Local rank in the sequence parallel group
local_world_size: World size of the sequence parallel group
ring_attn_func: The ring attention function to use
Returns:
Sliced batch dictionary.
"""
# Update ring attention params if needed
if batch.get("position_ids") is not None:
update_ring_attn_params(position_ids=batch["position_ids"])
# Slice batch for sequence parallel processing
total_seq_len = batch["input_ids"].size(1)
for key in batch:
if (
key in batch
and isinstance(batch[key], torch.Tensor)
and batch[key].dim() > 1
and batch[key].size(1) == total_seq_len
):
if ring_attn_func in [
RingAttnFunc.VARLEN_LLAMA3,
RingAttnFunc.BATCH_RING,
]:
# Split in sequential fashion and grab this rank's chunk
batch[key] = (
batch[key].chunk(local_world_size, dim=1)[local_rank].contiguous()
)
elif ring_attn_func is RingAttnFunc.BATCH_ZIGZAG:
chunks = batch[key].chunk(2 * local_world_size, dim=1)
# Take rank's chunk and opposing chunk for zigzag pattern
selected_chunks = [
chunks[local_rank],
chunks[2 * local_world_size - local_rank - 1],
]
batch[key] = torch.cat(selected_chunks, dim=1).contiguous()
elif ring_attn_func is RingAttnFunc.BATCH_STRIPE:
# Split into striped data and stack
tensor = torch.stack(
batch[key].split(local_world_size, dim=1),
dim=1,
).transpose(1, 2)
batch[key] = tensor[:, local_rank].contiguous()
return batch
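To make the slicing patterns above concrete, a small sketch (assumed values, not from the source) of how a length-8 sequence is split for rank 0 of a 2-rank SP group:
import torch

seq = torch.arange(8).unsqueeze(0)        # shape (1, 8): positions 0..7
local_world_size = 2

# VARLEN_LLAMA3 / BATCH_RING: contiguous chunks
ring_rank0 = seq.chunk(local_world_size, dim=1)[0]        # positions [0, 1, 2, 3]

# BATCH_ZIGZAG: one chunk from the front plus its mirror from the back
chunks = seq.chunk(2 * local_world_size, dim=1)           # four chunks of length 2
zigzag_rank0 = torch.cat([chunks[0], chunks[3]], dim=1)   # positions [0, 1, 6, 7]

# BATCH_STRIPE: every local_world_size-th position
striped = torch.stack(seq.split(local_world_size, dim=1), dim=1).transpose(1, 2)
stripe_rank0 = striped[:, 0]                              # positions [0, 2, 4, 6]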
class SequenceParallelMixin:
"""
Mixin class for sequence parallelism support in trainers.
This mixin provides functionality for handling sequence parallelism,
specifically for creating appropriate data samplers.
"""
args = None # type: "AxolotlTrainingArguments" # type: ignore[name-defined]
def _setup_sequence_parallel(self):
"""Set up sequence parallelism environment."""
self.ring_attn_group = get_ring_attn_group()
def _create_sequence_parallel_sampler(
self,
dataset: Dataset,
shuffle: bool = True,
is_eval: bool = False,
) -> DistributedSampler:
"""
Helper method to create sampler for sequence parallelism (SP).
We create a distributed sampler with rank equal to the SP group ID, which
means that all ranks in the SP group receive the same sample / set of samples
per training step. We also set the number of replicas equal to the number of
SP groups, which is a bit of a hack / unintended use, but works!
Args:
dataset: Dataset to sample from.
shuffle: Whether to shuffle the dataset.
is_eval: Whether we are creating a sampler for evaluation or training.
Returns:
Distributed sampler.
"""
num_sp_groups = self.args.world_size // self.args.sequence_parallel_degree
sp_group_id = dist.get_rank() // self.args.sequence_parallel_degree
return DistributedSampler(
dataset,
num_replicas=num_sp_groups,
rank=sp_group_id,
seed=self.args.seed if shuffle else None,
shuffle=shuffle,
drop_last=not is_eval,
)
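A short illustrative mapping of how this rank/num_replicas trick plays out, assuming world_size=4 and sequence_parallel_degree=2 (values chosen only for illustration):
world_size, sequence_parallel_degree = 4, 2               # assumed values
num_sp_groups = world_size // sequence_parallel_degree    # sampler sees 2 "replicas"
for rank in range(world_size):
    sp_group_id = rank // sequence_parallel_degree
    print(rank, sp_group_id)
# rank 0 -> group 0, rank 1 -> group 0, rank 2 -> group 1, rank 3 -> group 1
# Ranks in the same SP group pass the same "rank" to DistributedSampler,
# so they draw identical samples each step.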
def _sp_get_train_sampler(self, dataset) -> Sampler | None:
"""
Get a training sampler configured for sequence parallelism.
Args:
dataset: The training dataset
Returns:
Configured sequence parallel sampler.
"""
return self._create_sequence_parallel_sampler(
dataset,
shuffle=not self.args.curriculum_sampling,
)
def _sp_get_eval_sampler(self, eval_dataset) -> Sampler | None:
"""
Get an evaluation sampler configured for sequence parallelism.
Args:
eval_dataset: The evaluation dataset.
Returns:
Configured sequence parallel sampler.
"""
return self._create_sequence_parallel_sampler(
eval_dataset, shuffle=False, is_eval=True
)
class SequenceParallelContextManager:
"""
Context manager for sequence parallelism operations.
This class provides a context that will automatically apply sequence parallelism
during model forward passes using a pre-forward hook, and gather outputs from
across the sequence parallelism group using a post-forward hook.
"""
def __init__(
self,
model: nn.Module,
sequence_parallel_degree: int,
ring_attn_func: RingAttnFunc,
):
self.model = model
self.sequence_parallel_degree = sequence_parallel_degree
self.ring_attn_func = ring_attn_func
self.process_group = get_ring_attn_group()
# Initialize sequence parallel group details
self.local_rank = dist.get_rank(self.process_group)
self.local_world_size = dist.get_world_size(self.process_group)
# Will store hook handles for removal
self.hook_handles: list[RemovableHandle] = []
# Create a partially applied version of the apply_sequence_parallelism function
# with pre-configured params
self.apply_sequence_parallelism = functools.partial(
apply_sequence_parallelism,
local_rank=self.local_rank,
local_world_size=self.local_world_size,
ring_attn_func=self.ring_attn_func,
)
def __enter__(self):
# Forward pre-hook to apply sequence parallelism
def sequence_parallel_pre_hook(_, args, kwargs):
# Apply sequence parallelism to kwargs
kwargs = self.apply_sequence_parallelism(batch=kwargs)
return args, kwargs
# Forward post-hook to gather outputs
def sequence_parallel_post_hook(_, __, output):
# Gather the sharded outputs
return self.gather_outputs(output)
# Register both hooks
self.hook_handles.append(
self.model.register_forward_pre_hook(
sequence_parallel_pre_hook, with_kwargs=True
)
)
self.hook_handles.append(
self.model.register_forward_hook(sequence_parallel_post_hook)
)
return self
def __exit__(self, exc_type, exc_val, exc_tb):
# Remove all hooks
for handle in self.hook_handles:
handle.remove()
self.hook_handles = []
def gather_outputs(self, output):
"""Gather sharded outputs from all ranks and reconstruct the full tensor."""
# Handle different output formats (dict, tensor, etc.)
if isinstance(output, dict):
gathered_output = {}
for key, value in output.items():
if isinstance(value, torch.Tensor) and value.dim() > 1:
# Gather logits or other sequence-sharded tensors
gathered_value = self.gather_tensor(value)
gathered_output[key] = gathered_value
else:
gathered_value = value.clone()
dist.all_reduce(
gathered_value, op=dist.ReduceOp.SUM, group=self.process_group
)
gathered_output[key] = gathered_value
return gathered_output
if isinstance(output, torch.Tensor):
return self.gather_tensor(output)
return output
def gather_tensor(self, tensor):
"""Gather a sharded tensor from all ranks."""
# Prepare tensors for all_gather
world_size = self.local_world_size
# Create list to store tensors from all ranks
gathered_tensors = [torch.zeros_like(tensor) for _ in range(world_size)]
# All-gather operation
dist.all_gather(gathered_tensors, tensor, group=self.process_group)
# Concatenate along sequence dimension (typically dim=1)
if self.ring_attn_func in [RingAttnFunc.VARLEN_LLAMA3, RingAttnFunc.BATCH_RING]:
# Simple concatenation for standard sharding
return torch.cat(gathered_tensors, dim=1)
if self.ring_attn_func is RingAttnFunc.BATCH_ZIGZAG:
# Each rank has a pattern of (rank, world_size*2-rank-1)
reconstituted_tensors = [None] * (world_size * 2)
# First, split each gathered tensor into its two chunks
for rank, gathered_tensor in enumerate(gathered_tensors):
# Each tensor contains two chunks in the sequence dimension
chunk_size = gathered_tensor.size(1) // 2
chunk1, chunk2 = gathered_tensor.split(chunk_size, dim=1)
# Place chunks in their original positions
reconstituted_tensors[rank] = chunk1
reconstituted_tensors[world_size * 2 - rank - 1] = chunk2
# Concatenate the reconstituted tensors in the correct order
return torch.cat(reconstituted_tensors, dim=1)
# Otherwise, RingAttnFunc.BATCH_STRIPE
# In striping, each rank has every world_size-th slice
batch_size = tensor.size(0)
hidden_dim = tensor.size(-1)
# First, determine the full sequence length
total_seq_len = 0
for t in gathered_tensors:
total_seq_len += t.size(1)
# Create a tensor to hold the unstriped result
result = torch.zeros(
batch_size,
total_seq_len,
hidden_dim,
dtype=tensor.dtype,
device=tensor.device,
)
# For each rank's tensor, distribute its slices to the correct positions
for rank, gathered_tensor in enumerate(gathered_tensors):
# The rank's tensor contains every world_size-th slice
# starting from its rank position
seq_len = gathered_tensor.size(1)
for i in range(seq_len):
# Calculate the position in the full tensor
pos = i * world_size + rank
if pos < total_seq_len:
result[:, pos] = gathered_tensor[:, i]
return result
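# Minimal single-process sketch (not part of the class above) of the BATCH_ZIGZAG
# reconstruction performed in gather_tensor(): rank r holds sequence chunks
# (r, world_size*2 - r - 1), so stitching the gathered halves back in that pattern
# must recover the original ordering. The toy sizes below are illustrative only.
import torch

world_size, chunk_len = 4, 2
full = torch.arange(world_size * 2 * chunk_len).view(1, -1, 1).float()
chunks = list(full.chunk(world_size * 2, dim=1))

# What each rank would hold under zigzag sharding
gathered = [
    torch.cat([chunks[r], chunks[world_size * 2 - r - 1]], dim=1)
    for r in range(world_size)
]

# Reconstruction, mirroring the loop in gather_tensor()
reconstituted = [None] * (world_size * 2)
for r, g in enumerate(gathered):
    first, second = g.split(g.size(1) // 2, dim=1)
    reconstituted[r] = first
    reconstituted[world_size * 2 - r - 1] = second
assert torch.equal(torch.cat(reconstituted, dim=1), full)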

View File

@@ -9,6 +9,8 @@ from PIL.Image import Resampling
from transformers import TrainingArguments
from trl import CPOConfig, KTOConfig, ORPOConfig, PRMConfig, RewardConfig
from axolotl.monkeypatch.attention.ring_attn.patch import RingAttnFunc
@dataclass
class AxolotlTrainingMixins:
@@ -214,16 +216,14 @@ class AxolotlTrainingMixins:
},
)
adam_beta3: Optional[float] = field(
default=None,
metadata={
"help": "The beta3 hyperparameter used in some optimizers such as CAME"
},
sequence_parallel_degree: Optional[int] = field(
default=1,
metadata={"help": "The number of workers to use in sequence parallelism"},
)
adam_epsilon2: Optional[float] = field(
ring_attn_func: Optional[RingAttnFunc] = field(
default=None,
metadata={
"help": "The epsilon2 hyperparameter used in some optimizers such as CAME"
"help": "The ring-flash-attn function to use in sequence parallelism"
},
)

View File

@@ -10,240 +10,227 @@
# License for the specific language governing permissions and limitations under
# the License.
"""Base class for all plugins.
"""
Base class for all plugins.
A plugin is a reusable, modular, and self-contained piece of code that extends the functionality of Axolotl.
Plugins can be used to integrate third-party models, modify the training process, or add new features.
To create a new plugin, you need to inherit from the BasePlugin class and implement the required methods.
"""
from __future__ import annotations
import collections
import importlib
import logging
from typing import TYPE_CHECKING, Callable, OrderedDict, Union
from typing import OrderedDict
from peft import PeftModel
from torch.optim import Optimizer
import torch
from torch.optim.lr_scheduler import LRScheduler
from transformers import PreTrainedModel, Trainer
from axolotl.utils.dict import DictDefault
if TYPE_CHECKING:
from axolotl.common.datasets import TrainDatasetMeta
class BasePlugin:
"""Base class for all plugins. Defines the interface for plugin methods.
"""
Base class for all plugins. Defines the interface for plugin methods.
Attributes:
None
Methods:
register(cfg): Registers the plugin with the given configuration.
load_datasets(cfg): Loads and preprocesses the dataset for training.
pre_model_load(cfg): Performs actions before the model is loaded.
post_model_build(cfg, model): Performs actions after the model is loaded, but
before LoRA adapters are applied.
pre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.
post_lora_load(cfg, model): Performs actions after LoRA weights are loaded.
post_model_load(cfg, model): Performs actions after the model is loaded,
inclusive of any adapters.
post_trainer_create(cfg, trainer): Performs actions after the trainer is
created.
create_optimizer(cfg, trainer): Creates and returns an optimizer for training.
create_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and
returns a learning rate scheduler.
add_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before
training.
add_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after
training.
register(cfg): Registers the plugin with the given configuration.
pre_model_load(cfg): Performs actions before the model is loaded.
post_model_build(cfg, model): Performs actions after the model is loaded, but before LoRA adapters are applied.
pre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.
post_lora_load(cfg, model): Performs actions after LoRA weights are loaded.
post_model_load(cfg, model): Performs actions after the model is loaded, inclusive of any adapters.
create_optimizer(cfg, trainer): Creates and returns an optimizer for training.
create_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and returns a learning rate scheduler.
add_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before training.
add_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after training.
"""
def __init__(self):
"""Initializes the BasePlugin."""
"""
Initializes the BasePlugin.
"""
def register(self, cfg): # pylint: disable=unused-argument
"""Registers the plugin with the given configuration.
Args:
cfg: The configuration for the plugin.
"""
Registers the plugin with the given configuration.
def get_input_args(self) -> str | None:
"""Returns a pydantic model for the plugin's input arguments."""
def load_datasets(
self, cfg: DictDefault, preprocess: bool = False
) -> Union["TrainDatasetMeta", None]:
"""Loads and preprocesses the dataset for training.
Args:
cfg: The configuration for the plugin.
preprocess: Whether this is the preprocess step of the datasets.
Parameters:
cfg (dict): The configuration for the plugin.
Returns:
dataset_meta: The metadata for the training dataset.
None
"""
def pre_model_load(self, cfg: DictDefault): # pylint: disable=unused-argument
"""Performs actions before the model is loaded.
Args:
cfg: The configuration for the plugin.
def get_input_args(self):
"""
Returns a pydantic model for the plugin's input arguments.
"""
# pylint: disable=unused-argument
def post_model_build(self, cfg: DictDefault, model: PreTrainedModel):
"""Performs actions after the model is built/loaded, but before any adapters are applied.
Args:
cfg: The configuration for the plugin.
def pre_model_load(self, cfg): # pylint: disable=unused-argument
"""
Performs actions before the model is loaded.
# pylint: disable=unused-argument
def pre_lora_load(self, cfg: DictDefault, model: PreTrainedModel):
"""Performs actions before LoRA weights are loaded.
Args:
cfg: The configuration for the plugin.
model: The loaded model.
"""
# pylint: disable=unused-argument
def post_lora_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
"""Performs actions after LoRA weights are loaded.
Args:
cfg: The configuration for the plugin.
model: The loaded model.
"""
# pylint: disable=unused-argument
def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
"""Performs actions after the model is loaded.
Args:
cfg: The configuration for the plugin.
model: The loaded model.
"""
# pylint: disable=unused-argument
def get_trainer_cls(self, cfg: DictDefault) -> Trainer | None:
"""Returns a custom class for the trainer.
Args:
cfg: The global axolotl configuration.
Parameters:
cfg (dict): The configuration for the plugin.
Returns:
The first non-`None` trainer class returned by a plugin.
None
"""
# pylint: disable=unused-argument
def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
"""Performs actions after the trainer is created.
def post_model_build(self, cfg, model): # pylint: disable=unused-argument
"""
Performs actions after the model is built/loaded, but before any adapters are applied.
Args:
cfg: The configuration for the plugin.
trainer: The trainer object for training.
cfg (dict): The configuration for the plugin.
"""
# pylint: disable=unused-argument
def create_optimizer(self, cfg: DictDefault, trainer: Trainer) -> Optimizer | None:
"""Creates and returns an optimizer for training.
def post_model_load(self, cfg, model): # pylint: disable=unused-argument
"""
Performs actions after the model is loaded.
Args:
cfg: The configuration for the plugin.
trainer: The trainer object for training.
Parameters:
cfg (dict): The configuration for the plugin.
model (object): The loaded model.
Returns:
The created optimizer.
None
"""
def pre_lora_load(self, cfg, model): # pylint: disable=unused-argument
"""
Performs actions before LoRA weights are loaded.
Parameters:
cfg (dict): The configuration for the plugin.
model (object): The loaded model.
Returns:
None
"""
def post_lora_load(self, cfg, model): # pylint: disable=unused-argument
"""
Performs actions after LoRA weights are loaded.
Parameters:
cfg (dict): The configuration for the plugin.
model (object): The loaded model.
Returns:
None
"""
def get_trainer_cls(self, cfg): # pylint: disable=unused-argument
"""
Returns a custom class for the trainer.
Parameters:
cfg (dict): The global axolotl configuration.
Returns:
class: The class for the trainer.
"""
def create_optimizer(self, cfg, trainer): # pylint: disable=unused-argument
"""
Creates and returns an optimizer for training.
Parameters:
cfg (dict): The configuration for the plugin.
trainer (object): The trainer object for training.
Returns:
object: The created optimizer.
"""
# pylint: disable=unused-argument
def create_lr_scheduler(
self,
cfg: DictDefault,
trainer: Trainer,
optimizer: Optimizer,
num_training_steps: int,
) -> LRScheduler | None:
"""Creates and returns a learning rate scheduler.
self, cfg, trainer, optimizer, num_training_steps
) -> LRScheduler | None: # pylint: disable=unused-argument
"""
Creates and returns a learning rate scheduler.
Args:
cfg: The configuration for the plugin.
trainer: The trainer object for training.
optimizer: The optimizer for training.
num_training_steps: Total number of training steps
Parameters:
cfg (dict): The configuration for the plugin.
trainer (object): The trainer object for training.
optimizer (object): The optimizer for training.
num_training_steps (int): Total number of training steps
Returns:
The created learning rate scheduler.
object (LRScheduler): The created learning rate scheduler.
"""
# pylint: disable=unused-argument
def add_callbacks_pre_trainer(
self, cfg: DictDefault, model: PreTrainedModel
) -> list[Callable]:
"""Set up callbacks before creating the trainer.
def add_callbacks_pre_trainer(self, cfg, model): # pylint: disable=unused-argument
"""
setup callbacks before creating the trainer.
Args:
cfg: The configuration for the plugin.
model: The loaded model.
Parameters:
cfg (dict): The configuration for the plugin.
model (object): The loaded model.
Returns:
A list of callback functions to be added to the `TrainingArgs`.
List[callable]: A list of callback functions to be added to the TrainingArgs
"""
return []
# pylint: disable=unused-argument
def add_callbacks_post_trainer(
self, cfg: DictDefault, trainer: Trainer
) -> list[Callable]:
"""Adds callbacks to the trainer after creating the trainer. This is useful for
callbacks that require access to the model or trainer.
self, cfg, trainer
): # pylint: disable=unused-argument
"""
Adds callbacks to the trainer after creating the trainer.
This is useful for callbacks that require access to the model or trainer.
Args:
cfg: The configuration for the plugin.
trainer: The trainer object for training.
Parameters:
cfg (dict): The configuration for the plugin.
trainer (object): The trainer object for training.
Returns:
A list of callback functions to be added
List[callable]: A list of callback functions to be added
"""
return []
# pylint: disable=unused-argument
def post_train(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
"""Performs actions after training is complete.
def post_train(self, cfg, model): # pylint: disable=unused-argument
"""
Performs actions after training is complete.
Args:
cfg: The axolotl configuration.
model: The loaded model.
Parameters:
cfg (dict): The axolotl configuration
model (object): The loaded model.
Returns:
None
"""
def post_train_unload(self, cfg: DictDefault): # pylint: disable=unused-argument
"""Performs actions after training is complete and the model is unloaded.
def post_train_unload(self, cfg): # pylint: disable=unused-argument
"""
Performs actions after training is complete and the model is unloaded.
Args:
cfg: The configuration for the plugin.
Parameters:
cfg (dict): The configuration for the plugin.
Returns:
None
"""
def load_plugin(plugin_name: str) -> BasePlugin:
"""Loads a plugin based on the given plugin name.
"""
Loads a plugin based on the given plugin name.
The plugin name should be in the format "module_name.class_name". This function
splits the plugin name into module and class, imports the module, retrieves the
class from the module, and creates an instance of the class.
The plugin name should be in the format "module_name.class_name".
This function splits the plugin name into module and class, imports the module,
retrieves the class from the module, and creates an instance of the class.
Args:
plugin_name: The name of the plugin to be loaded. The name should be in the
format "module_name.class_name".
Parameters:
plugin_name (str): The name of the plugin to be loaded. The name should be in the format "module_name.class_name".
Returns:
An instance of the loaded plugin.
BasePlugin: An instance of the loaded plugin.
Raises:
ImportError: If the plugin module cannot be imported.
ImportError: If the plugin module cannot be imported.
"""
# split the plugin name into module and class
module_name, class_name = plugin_name.rsplit(".", 1)
@@ -269,25 +256,28 @@ def load_plugin(plugin_name: str) -> BasePlugin:
class PluginManager:
"""The `PluginManager` class is responsible for loading and managing plugins. It
should be a singleton so it can be accessed from anywhere in the codebase.
"""
The PluginManager class is responsible for loading and managing plugins.
It should be a singleton so it can be accessed from anywhere in the codebase.
Attributes:
plugins: A list of loaded plugins.
plugins (List[BasePlugin]): A list of loaded plugins.
Methods:
get_instance(): Static method to get the singleton instance of `PluginManager`.
register(plugin_name: str): Registers a new plugin by its name.
pre_model_load(cfg): Calls the pre_model_load method of all registered plugins.
get_instance(): Static method to get the singleton instance of PluginManager.
register(plugin_name: str): Registers a new plugin by its name.
pre_model_load(cfg): Calls the pre_model_load method of all registered plugins.
"""
plugins: OrderedDict[str, BasePlugin] = collections.OrderedDict()
_instance: PluginManager | None = None
_cfg: DictDefault | None = None
_instance = None
_cfg = None
def __new__(cls):
"""Creates a new instance of PluginManager if it doesn't exist yet."""
"""
Creates a new instance of PluginManager if it doesn't exist yet.
"""
if cls._instance is None:
cls._instance = super(PluginManager, cls).__new__(cls)
cls._instance.plugins: OrderedDict[str, BasePlugin] = (
@@ -297,8 +287,9 @@ class PluginManager:
@staticmethod
def get_instance() -> "PluginManager":
"""Returns the singleton instance of PluginManager. If the instance doesn't
exist, it creates a new one.
"""
Returns the singleton instance of PluginManager.
If the instance doesn't exist, it creates a new one.
"""
if PluginManager._instance is None:
PluginManager()
@@ -313,13 +304,17 @@ class PluginManager:
self._cfg = cfg
def register(self, plugin_name: str):
"""Registers a new plugin by its name.
"""
Registers a new plugin by its name.
Args:
plugin_name: The name of the plugin to be registered.
Parameters:
plugin_name (str): The name of the plugin to be registered.
Returns:
None
Raises:
ImportError: If the plugin module cannot be imported.
ImportError: If the plugin module cannot be imported.
"""
try:
logging.info(f"Attempting to load plugin: {plugin_name}")
@@ -329,11 +324,12 @@ class PluginManager:
except ImportError:
logging.error(f"Failed to load plugin: {plugin_name}")
def get_input_args(self) -> list[str]:
"""Returns a list of Pydantic classes for all registered plugins' input arguments.'
def get_input_args(self):
"""
Returns a list of Pydantic classes for all registered plugins' input arguments.'
Returns:
A list of Pydantic classes for all registered plugins' input arguments.'
list[str]: A list of Pydantic classes for all registered plugins' input arguments.'
"""
input_args = []
for plugin in self.plugins.values():
@@ -342,88 +338,83 @@ class PluginManager:
input_args.append(input_args_from_plugin)
return input_args
def load_datasets(
self, cfg: DictDefault, preprocess: bool = False
) -> Union["TrainDatasetMeta", None]:
"""Calls the load_datasets method of each registered plugin.
def pre_model_load(self, cfg):
"""
Calls the pre_model_load method of all registered plugins.
Args:
cfg: The configuration for the plugins.
preprocess: Whether this is preprocess step of the datasets.
Parameters:
cfg (dict): The configuration for the plugins.
Returns:
The dataset metadata loaded from all registered plugins.
"""
return_ds_meta = None
for plugin in self.plugins.values():
dataset_meta = plugin.load_datasets(cfg, preprocess)
if dataset_meta is not None:
if return_ds_meta is None:
return_ds_meta = dataset_meta
else:
raise RuntimeError("Multiple plugins loaded datasets")
return return_ds_meta
def pre_model_load(self, cfg: DictDefault):
"""Calls the pre_model_load method of all registered plugins.
Args:
cfg: The configuration for the plugins.
None
"""
for plugin in self.plugins.values():
plugin.pre_model_load(cfg)
def post_model_build(self, cfg: DictDefault, model: PreTrainedModel):
"""Calls the `post_model_build` method of all registered plugins after the
model has been built / loaded, but before any adapters have been applied.
def post_model_build(self, cfg, model):
"""
Calls the post_model_build method of all registered plugins after the model has been built/loaded,
but before any adapters have been applied.
Args:
cfg: The configuration for the plugins.
model: The loaded model.
cfg (dict): The configuration for the plugins.
model (object): The loaded model.
"""
for plugin in self.plugins.values():
plugin.post_model_build(cfg, model)
def pre_lora_load(self, cfg: DictDefault, model: PreTrainedModel):
"""Calls the `pre_lora_load` method of all registered plugins.
Args:
cfg: The configuration for the plugins.
model: The loaded model.
def post_model_load(self, cfg, model):
"""
for plugin in self.plugins.values():
plugin.pre_lora_load(cfg, model)
Calls the post_model_load method of all registered plugins after the model has been loaded
inclusive of any adapters
def post_lora_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
"""Calls the `post_lora_load` method of all registered plugins.
Parameters:
cfg (dict): The configuration for the plugins.
model (object): The loaded model.
Args:
cfg: The configuration for the plugins.
model: The loaded model.
"""
for plugin in self.plugins.values():
plugin.post_lora_load(cfg, model)
def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
"""Calls the `post_model_load` method of all registered plugins after the model
has been loaded inclusive of any adapters.
Args:
cfg: The configuration for the plugins.
model: The loaded model.
Returns:
None
"""
for plugin in self.plugins.values():
plugin.post_model_load(cfg, model)
def get_trainer_cls(self, cfg: DictDefault) -> Trainer | None:
"""Calls the `get_trainer_cls` method of all registered plugins and returns the
first non-`None` trainer class.
def pre_lora_load(self, cfg, model):
"""
Calls the pre_lora_load method of all registered plugins.
Args:
cfg: The configuration for the plugins.
Parameters:
cfg (dict): The configuration for the plugins.
model (object): The loaded model.
Returns:
The first non-`None` trainer class returned by a plugin.
None
"""
for plugin in self.plugins.values():
plugin.pre_lora_load(cfg, model)
def post_lora_load(self, cfg, model):
"""
Calls the post_lora_load method of all registered plugins.
Parameters:
cfg (dict): The configuration for the plugins.
model (object): The loaded model.
Returns:
None
"""
for plugin in self.plugins.values():
plugin.post_lora_load(cfg, model)
def get_trainer_cls(self, cfg):
"""
Calls the get_trainer_cls method of all registered plugins and returns the first non-None trainer class.
Parameters:
cfg (dict): The configuration for the plugins.
Returns:
object: The trainer class, or None if none was found.
"""
for plugin in self.plugins.values():
trainer_cls = plugin.get_trainer_cls(cfg)
@@ -431,25 +422,15 @@ class PluginManager:
return trainer_cls
return None
def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
"""Calls the `post_trainer_create` method of all registered plugins.
Args:
cfg: The configuration for the plugins.
trainer: The trainer object for training.
def create_optimizer(self, trainer):
"""
for plugin in self.plugins.values():
plugin.post_trainer_create(cfg, trainer)
Calls the create_optimizer method of all registered plugins and returns the first non-None optimizer.
def create_optimizer(self, trainer: Trainer) -> Optimizer | None:
"""Calls the `create_optimizer` method of all registered plugins and returns
the first non-`None` optimizer.
Args:
trainer: The trainer object for training.
Parameters:
trainer (object): The trainer object for training.
Returns:
The created optimizer, or `None` if none was found.
object: The created optimizer, or None if none was found.
"""
for plugin in self.plugins.values():
optimizer = plugin.create_optimizer(self.cfg, trainer)
@@ -458,17 +439,17 @@ class PluginManager:
return None
def create_lr_scheduler(
self, trainer: Trainer, optimizer: Optimizer, num_training_steps: int
self, trainer, optimizer, num_training_steps
) -> LRScheduler | None:
"""Calls the `create_lr_scheduler` method of all registered plugins and returns
the first non-`None` scheduler.
"""
Calls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.
Args:
trainer: The trainer object for training.
optimizer: The optimizer for training.
Parameters:
trainer (object): The trainer object for training.
optimizer (object): The optimizer for training.
Returns:
The created learning rate scheduler, or `None` if not found.
object: The created learning rate scheduler, or None if none was found.
"""
for plugin in self.plugins.values():
scheduler: LRScheduler | None = plugin.create_lr_scheduler(
@@ -481,17 +462,16 @@ class PluginManager:
return scheduler
return None
def add_callbacks_pre_trainer(
self, cfg: DictDefault, model: PreTrainedModel
) -> list[Callable]:
"""Calls the add_callbacks_pre_trainer method of all registered plugins.
def add_callbacks_pre_trainer(self, cfg, model):
"""
Calls the add_callbacks_pre_trainer method of all registered plugins.
Args:
cfg: The configuration for the plugins.
model: The loaded model.
Parameters:
cfg (dict): The configuration for the plugins.
model (object): The loaded model.
Returns:
A list of callback functions to be added to the `TrainingArgs`.
List[callable]: A list of callback functions to be added to the TrainingArgs.
"""
callbacks = []
for plugin in self.plugins.values():
@@ -500,17 +480,16 @@ class PluginManager:
callbacks.extend(plugin_callbacks)
return callbacks
def add_callbacks_post_trainer(
self, cfg: DictDefault, trainer: Trainer
) -> list[Callable]:
"""Calls the `add_callbacks_post_trainer` method of all registered plugins.
def add_callbacks_post_trainer(self, cfg, trainer):
"""
Calls the add_callbacks_post_trainer method of all registered plugins.
Args:
cfg: The configuration for the plugins.
trainer: The trainer object for training.
Parameters:
cfg (dict): The configuration for the plugins.
trainer (object): The trainer object for training.
Returns:
A list of callback functions to be added to the `TrainingArgs`.
List[callable]: A list of callback functions to be added to the TrainingArgs.
"""
callbacks = []
for plugin in self.plugins.values():
@@ -519,31 +498,41 @@ class PluginManager:
callbacks.extend(plugin_callbacks)
return callbacks
def post_train(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
"""Calls the post_train method of all registered plugins.
def post_train(self, cfg, model):
"""
Calls the post_train method of all registered plugins.
Args:
cfg: The configuration for the plugins.
model: The loaded model.
Parameters:
cfg (dict): The configuration for the plugins.
model (object): The loaded model.
Returns:
None
"""
for plugin in self.plugins.values():
plugin.post_train(cfg, model)
def post_train_unload(self, cfg: DictDefault):
"""Calls the post_train_unload method of all registered plugins.
def post_train_unload(self, cfg):
"""
Calls the post_train_unload method of all registered plugins.
Args:
cfg: The configuration for the plugins.
Parameters:
cfg (dict): The configuration for the plugins.
Returns:
None
"""
for plugin in self.plugins.values():
plugin.post_train_unload(cfg)
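# Usage sketch (illustrative): the manager is a singleton, and plugins are registered
# by dotted path in the "module_name.class_name" format expected by load_plugin()
# above. The Liger path mirrors the integration shown later in this diff.
plugin_manager = PluginManager.get_instance()
plugin_manager.register("axolotl.integrations.liger.LigerPlugin")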
class BaseOptimizerFactory:
"""Base class for factories to create custom optimizers"""
"""
Base class for factories to create custom optimizers
"""
def __call__(
self, opt_model, training_args, **optimizer_kwargs
) -> Optimizer | None:
) -> "torch.optim.Optimizer":
pass

View File

@@ -20,15 +20,25 @@ from cut_cross_entropy.transformers.utils import (
from transformers.cache_utils import Cache
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.models.cohere.modeling_cohere import (
_CONFIG_FOR_DOC,
COHERE_INPUTS_DOCSTRING,
KwargsForCausalLM,
)
from transformers.processing_utils import Unpack
from transformers.utils import (
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from transformers.utils.deprecation import deprecate_kwarg
_PATCH_OPTS: PatchOptions | None = None
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
@add_start_docstrings_to_model_forward(COHERE_INPUTS_DOCSTRING)
@replace_return_docstrings(
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
)
def cce_forward(
self,
input_ids: torch.LongTensor | None = None,

View File

@@ -17,15 +17,25 @@ from cut_cross_entropy.transformers.utils import (
from transformers.cache_utils import Cache
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.models.gemma.modeling_gemma import (
_CONFIG_FOR_DOC,
GEMMA_INPUTS_DOCSTRING,
KwargsForCausalLM,
)
from transformers.processing_utils import Unpack
from transformers.utils import (
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from transformers.utils.deprecation import deprecate_kwarg
_PATCH_OPTS: PatchOptions | None = None
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
@add_start_docstrings_to_model_forward(GEMMA_INPUTS_DOCSTRING)
@replace_return_docstrings(
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
)
def cce_forward(
self,
input_ids: torch.LongTensor | None = None,

View File

@@ -20,11 +20,15 @@ from torch import nn
from transformers.cache_utils import Cache, HybridCache
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.models.gemma3.modeling_gemma3 import (
_CONFIG_FOR_DOC,
GEMMA3_INPUTS_DOCSTRING,
Gemma3CausalLMOutputWithPast,
logger,
)
from transformers.utils import (
add_start_docstrings_to_model_forward,
is_torchdynamo_compiling,
replace_return_docstrings,
)
from transformers.utils.deprecation import deprecate_kwarg
@@ -34,6 +38,10 @@ _PATCH_OPTS: PatchOptions | None = None
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
@add_start_docstrings_to_model_forward(GEMMA3_INPUTS_DOCSTRING)
@replace_return_docstrings(
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
)
def cce_forward(
self,
input_ids: torch.LongTensor | None = None,
@@ -162,6 +170,10 @@ def cce_forward(
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
@add_start_docstrings_to_model_forward(GEMMA3_INPUTS_DOCSTRING)
@replace_return_docstrings(
output_type=Gemma3CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
)
def cce_forward_multimodal(
self,
input_ids: torch.LongTensor | None = None,

View File

@@ -19,9 +19,15 @@ from transformers.modeling_outputs import (
CausalLMOutputWithPast,
)
from transformers.models.llama.modeling_llama import (
_CONFIG_FOR_DOC,
LLAMA_INPUTS_DOCSTRING,
KwargsForCausalLM,
)
from transformers.processing_utils import Unpack
from transformers.utils import (
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from transformers.utils.deprecation import deprecate_kwarg
from transformers.utils.generic import can_return_tuple
@@ -30,6 +36,10 @@ _PATCH_OPTS: PatchOptions | None = None
@can_return_tuple
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
@replace_return_docstrings(
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
)
def cce_forward(
self,
input_ids: Optional[torch.LongTensor] = None,

View File

@@ -16,12 +16,22 @@ from torch import nn
from transformers.cache_utils import Cache
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.models.llama4.modeling_llama4 import (
_CONFIG_FOR_DOC,
LLAMA4_INPUTS_DOCSTRING,
Llama4CausalLMOutputWithPast,
)
from transformers.utils import (
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
_PATCH_OPTS: PatchOptions | None = None
@add_start_docstrings_to_model_forward(LLAMA4_INPUTS_DOCSTRING)
@replace_return_docstrings(
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
)
def cce_forward(
self,
input_ids: torch.LongTensor | None = None,
@@ -150,6 +160,9 @@ def cce_forward(
)
@replace_return_docstrings(
output_type=Llama4CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
)
def cce_forward_multimodal(
self,
input_ids: torch.LongTensor | None = None, # type: ignore

View File

@@ -19,11 +19,15 @@ from transformers.models.mistral3.modeling_mistral3 import (
Mistral3CausalLMOutputWithPast,
)
from transformers.models.mistral.modeling_mistral import (
_CONFIG_FOR_DOC,
MISTRAL_INPUTS_DOCSTRING,
KwargsForCausalLM,
)
from transformers.processing_utils import Unpack
from transformers.utils import (
add_start_docstrings_to_model_forward,
is_torchdynamo_compiling,
replace_return_docstrings,
)
from transformers.utils.deprecation import deprecate_kwarg
@@ -31,6 +35,10 @@ _PATCH_OPTS: PatchOptions | None = None
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
@add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
@replace_return_docstrings(
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
)
def cce_forward(
self,
input_ids: torch.LongTensor | None = None,

View File

@@ -13,10 +13,16 @@ from cut_cross_entropy.transformers.utils import (
apply_lce,
)
from transformers.models.qwen2_moe.modeling_qwen2_moe import (
_CONFIG_FOR_DOC,
QWEN2MOE_INPUTS_DOCSTRING,
MoeCausalLMOutputWithPast,
MoeModelOutputWithPast,
load_balancing_loss_func,
)
from transformers.utils import (
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from transformers.utils.deprecation import deprecate_kwarg
from transformers.utils.generic import can_return_tuple
@@ -25,6 +31,10 @@ _PATCH_OPTS: PatchOptions | None = None
@can_return_tuple
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
@add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
@replace_return_docstrings(
output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,

View File

@@ -14,12 +14,22 @@ from cut_cross_entropy.transformers.utils import (
)
from torch.nn import CrossEntropyLoss
from transformers.models.qwen2_vl.modeling_qwen2_vl import (
_CONFIG_FOR_DOC,
QWEN2_VL_INPUTS_DOCSTRING,
Qwen2VLCausalLMOutputWithPast,
)
from transformers.utils import (
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
_PATCH_OPTS: PatchOptions | None = None
@add_start_docstrings_to_model_forward(QWEN2_VL_INPUTS_DOCSTRING)
@replace_return_docstrings(
output_type=Qwen2VLCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
)
def cce_forward_multimodal(
self,
input_ids: Optional[torch.LongTensor] = None,

View File

@@ -12,13 +12,20 @@ from cut_cross_entropy.transformers.utils import (
TransformersModelT,
apply_lce,
)
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.models.qwen3_moe.modeling_qwen3_moe import (
_CONFIG_FOR_DOC,
QWEN3_MOE_INPUTS_DOCSTRING,
KwargsForCausalLM,
MoeCausalLMOutputWithPast,
MoeModelOutputWithPast,
load_balancing_loss_func,
)
from transformers.processing_utils import Unpack
from transformers.utils import (
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
from transformers.utils.deprecation import deprecate_kwarg
from transformers.utils.generic import can_return_tuple
@@ -27,6 +34,10 @@ _PATCH_OPTS: PatchOptions | None = None
@can_return_tuple
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
@add_start_docstrings_to_model_forward(QWEN3_MOE_INPUTS_DOCSTRING)
@replace_return_docstrings(
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,

View File

@@ -151,30 +151,6 @@ class LigerPlugin(BasePlugin):
rms_norm=cfg.liger_rms_norm,
layer_norm=cfg.liger_layer_norm,
)
elif cfg.model_config_type == "qwen3":
from axolotl.integrations.liger.models.qwen3 import (
apply_liger_kernel_to_qwen3,
)
apply_liger_kernel_to_qwen3(
cross_entropy=cfg.liger_cross_entropy,
fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
glu_activation=cfg.liger_glu_activation,
rms_norm=cfg.liger_rms_norm,
layer_norm=cfg.liger_layer_norm,
)
elif cfg.model_config_type == "qwen3_moe":
from axolotl.integrations.liger.models.qwen3_moe import (
apply_liger_kernel_to_qwen3_moe,
)
apply_liger_kernel_to_qwen3_moe(
cross_entropy=cfg.liger_cross_entropy,
fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
glu_activation=cfg.liger_glu_activation,
rms_norm=cfg.liger_rms_norm,
layer_norm=cfg.liger_layer_norm,
)
else:
logging.warning(
f"Unsupported model config type: {cfg.model_config_type}. Liger not applied."

View File

@@ -14,6 +14,10 @@ from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import CausalLMOutputWithPast
# @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
# @replace_return_docstrings(
# output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
# )
def lce_forward(
self,
input_ids: torch.LongTensor = None,

View File

@@ -13,11 +13,21 @@ from liger_kernel.transformers.fused_linear_cross_entropy import (
from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import MoeCausalLMOutputWithPast
from transformers.models.jamba.modeling_jamba import (
_CONFIG_FOR_DOC,
JAMBA_INPUTS_DOCSTRING,
HybridMambaAttentionDynamicCache,
load_balancing_loss_func,
)
from transformers.utils import (
add_start_docstrings_to_model_forward,
replace_return_docstrings,
)
@add_start_docstrings_to_model_forward(JAMBA_INPUTS_DOCSTRING)
@replace_return_docstrings(
output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
)
def lce_forward(
self,
input_ids: torch.LongTensor = None,

View File

@@ -1,160 +0,0 @@
"""
Liger FLCE for Qwen3. Based on transformers v4.51.3.
"""
import sys
from typing import Optional, Tuple, Union
import torch
from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
from transformers.cache_utils import Cache
from transformers.modeling_outputs import CausalLMOutputWithPast
def lce_forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Cache] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
**kwargs,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
Args:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
logits_to_keep (`int` or `torch.Tensor`, *optional*):
If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
`input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
This is useful when using packed tensor format (single dimension for batch and sequence length).
Returns:
"""
# pylint: disable=duplicate-code
output_attentions = (
output_attentions
if output_attentions is not None
else self.config.output_attentions
)
output_hidden_states = (
output_hidden_states
if output_hidden_states is not None
else self.config.output_hidden_states
)
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
cache_position=cache_position,
**kwargs,
)
hidden_states = outputs[0]
logits = None
loss = None
# if in training mode, don't materialize logits
if self.training and (labels is not None):
loss = LigerForCausalLMLoss(
hidden_states=hidden_states,
lm_head_weight=self.lm_head.weight,
labels=labels,
hidden_size=self.config.hidden_size,
**kwargs,
)
else: # if in inference mode materialize logits
slice_indices = (
slice(-logits_to_keep, None)
if isinstance(logits_to_keep, int)
else logits_to_keep
)
logits = self.lm_head(hidden_states[:, slice_indices, :])
if labels is not None:
loss = self.loss_function(
logits=logits,
labels=labels,
vocab_size=self.config.vocab_size,
**kwargs,
)
return CausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def apply_liger_kernel_to_qwen3(
cross_entropy: bool = False,
fused_linear_cross_entropy: bool = False,
rms_norm: bool = False,
glu_activation: bool = False,
layer_norm: bool = False,
**kwargs, # pylint: disable=unused-argument
) -> None:
# pylint: disable=duplicate-code
"""
Apply Liger kernels to replace original implementation in HuggingFace Qwen3 models
Args:
cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
fused_linear_cross_entropy (bool):
Whether to apply Liger's fused linear cross entropy loss. Default is False.
`cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
If `fused_linear_cross_entropy` is True, the logits will not be materialized, which is more memory efficient.
rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False.
glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
"""
import transformers.models.qwen3.modeling_qwen3 # noqa: F401 # pylint: disable=unused-import
from liger_kernel.transformers.functional import liger_cross_entropy
from liger_kernel.transformers.layer_norm import LigerLayerNorm
from liger_kernel.transformers.rms_norm import LigerRMSNorm
from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
assert not (
cross_entropy and fused_linear_cross_entropy
), "cross_entropy and fused_linear_cross_entropy cannot both be True."
modeling_qwen3 = sys.modules["transformers.models.qwen3.modeling_qwen3"]
if rms_norm:
modeling_qwen3.Qwen3RMSNorm = LigerRMSNorm
if glu_activation:
modeling_qwen3.Qwen3MLP = LigerSwiGLUMLP
if layer_norm:
modeling_qwen3.nn.LayerNorm = LigerLayerNorm
if cross_entropy:
from transformers.loss.loss_utils import nn
nn.functional.cross_entropy = liger_cross_entropy
if fused_linear_cross_entropy:
modeling_qwen3.Qwen3ForCausalLM.forward = lce_forward
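# Usage sketch for the helper above (illustrative): the patch must run before the
# Qwen3 model classes are instantiated so the replaced modules take effect. The
# flag values below are assumptions, not recommended defaults.
apply_liger_kernel_to_qwen3(
    fused_linear_cross_entropy=True,
    rms_norm=True,
    glu_activation=True,
)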

View File

@@ -1,191 +0,0 @@
"""
Liger FLCE for Qwen3 MoE. Based on transformers v4.51.3.
"""
import sys
from copy import deepcopy
from typing import List, Optional, Union
import torch
from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
from transformers.modeling_outputs import MoeCausalLMOutputWithPast
from transformers.models.qwen3_moe.modeling_qwen3_moe import load_balancing_loss_func
def lce_forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_router_logits: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
logits_to_keep: Union[int, torch.Tensor] = 0,
**kwargs,
) -> MoeCausalLMOutputWithPast:
r"""
Args:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
logits_to_keep (`int` or `torch.Tensor`, *optional*):
If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
`input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
This is useful when using packed tensor format (single dimension for batch and sequence length).
Returns:
"""
# pylint: disable=duplicate-code
output_attentions = (
output_attentions
if output_attentions is not None
else self.config.output_attentions
)
output_router_logits = (
output_router_logits
if output_router_logits is not None
else self.config.output_router_logits
)
output_hidden_states = (
output_hidden_states
if output_hidden_states is not None
else self.config.output_hidden_states
)
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
output_router_logits=output_router_logits,
cache_position=cache_position,
**kwargs,
)
hidden_states = outputs[0]
logits = None
loss = None
# if in training mode, don't materialize logits
if self.training and (labels is not None):
loss = LigerForCausalLMLoss(
hidden_states=hidden_states,
lm_head_weight=self.lm_head.weight,
labels=labels,
hidden_size=self.config.hidden_size,
**kwargs,
)
else: # if in inference mode materialize logits
slice_indices = (
slice(-logits_to_keep, None)
if isinstance(logits_to_keep, int)
else logits_to_keep
)
logits = self.lm_head(hidden_states[:, slice_indices, :])
if labels is not None:
loss = self.loss_function(
logits=logits,
labels=labels,
vocab_size=self.config.vocab_size,
**kwargs,
)
aux_loss = None
if output_router_logits:
aux_loss = load_balancing_loss_func(
outputs.router_logits,
self.num_experts,
self.num_experts_per_tok,
attention_mask,
)
if labels is not None:
loss += self.router_aux_loss_coef * aux_loss.to(
loss.device
) # make sure to reside in the same device
return MoeCausalLMOutputWithPast(
loss=loss,
aux_loss=aux_loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def apply_liger_kernel_to_qwen3_moe(
cross_entropy: bool = False,
fused_linear_cross_entropy: bool = False,
rms_norm: bool = False,
glu_activation: bool = False,
layer_norm: bool = False,
**kwargs, # pylint: disable=unused-argument
) -> None:
# pylint: disable=duplicate-code
"""
Apply Liger kernels to replace original implementation in HuggingFace Qwen3 MoE models
Args:
cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
fused_linear_cross_entropy (bool):
Whether to apply Liger's fused linear cross entropy loss. Default is False.
`cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
If `fused_linear_cross_entropy` is True, the logits will not be materialized, which is more memory efficient.
rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False.
glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
"""
import transformers.models.qwen3_moe.modeling_qwen3_moe # noqa: F401 # pylint: disable=unused-import
from liger_kernel.transformers.functional import liger_cross_entropy
from liger_kernel.transformers.layer_norm import LigerLayerNorm
from liger_kernel.transformers.rms_norm import LigerRMSNorm
from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
assert not (
cross_entropy and fused_linear_cross_entropy
), "cross_entropy and fused_linear_cross_entropy cannot both be True."
modeling_qwen3_moe = sys.modules["transformers.models.qwen3_moe.modeling_qwen3_moe"]
if rms_norm:
modeling_qwen3_moe.Qwen3MoeRMSNorm = LigerRMSNorm
if glu_activation:
def _liger_swiglu_mlp_wrapper(config, intermediate_size=None, **kwargs):
"Accepts intermediate_size to pass to LigerSwiGLUMLP"
# clone config to avoid modifying the original
config = deepcopy(config)
if intermediate_size:
setattr(config, "intermediate_size", intermediate_size)
return LigerSwiGLUMLP(config, **kwargs)
modeling_qwen3_moe.Qwen3MoeMLP = _liger_swiglu_mlp_wrapper
if layer_norm:
modeling_qwen3_moe.nn.LayerNorm = LigerLayerNorm
if cross_entropy:
from transformers.loss.loss_utils import nn
nn.functional.cross_entropy = liger_cross_entropy
if fused_linear_cross_entropy:
modeling_qwen3_moe.Qwen3MoeForCausalLM.forward = lce_forward

View File

@@ -1,4 +1,5 @@
"""Module for definition of GEGLU Triton kernels.
"""
Module for definition of GEGLU Triton kernels.
See "GLU Variants Improve Transformer" (https://arxiv.org/abs/2002.05202).
@@ -11,6 +12,8 @@ import torch
import triton
import triton.language as tl
SQRT_2_PI: tl.constexpr = 0.7978845608028654 # sqrt(2/π)
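# Standalone sanity check of the constant above (illustrative, not part of the kernel):
# math.sqrt(2.0 / math.pi) equals 0.7978845608028654 to double precision.
import math
assert abs(math.sqrt(2.0 / math.pi) - 0.7978845608028654) < 1e-15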
@triton.jit
def _geglu_fwd_kernel(

View File

@@ -55,13 +55,16 @@ def dequantize(
target_device = W.device
# Extract quantization state
nested = False
if not isinstance(quant_state, list):
# New style quant_state class
absmax = quant_state.absmax.to(target_device)
shape = quant_state.shape
dtype = quant_state.dtype
blocksize = quant_state.blocksize
offset = quant_state.offset.to(target_device)
if quant_state.nested:
nested = True
offset = quant_state.offset.to(target_device)
state2 = quant_state.state2
absmax2 = state2.absmax.to(target_device)
code2 = state2.code.to(target_device)
@@ -115,7 +118,8 @@ def dequantize(
ctypes.c_int(n_elements_absmax),
)
out_absmax += offset
if nested:
out_absmax += offset
# Choose appropriate dequantization function
fx = (
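# Standalone toy (not part of dequantize() above) illustrating the change: the absmax
# offset is only applied when the quant state is nested, so quant states without an
# offset no longer fail. ToyQuantState is a stand-in, not the bitsandbytes class.
from dataclasses import dataclass
from typing import Optional

import torch

@dataclass
class ToyQuantState:
    absmax: torch.Tensor
    nested: bool = False
    offset: Optional[torch.Tensor] = None

def restore_absmax(qs: ToyQuantState) -> torch.Tensor:
    out_absmax = qs.absmax.clone()
    if qs.nested:  # guard mirrors the patched code path above
        out_absmax += qs.offset
    return out_absmax

assert torch.equal(restore_absmax(ToyQuantState(absmax=torch.ones(4))), torch.ones(4))
assert torch.equal(
    restore_absmax(ToyQuantState(absmax=torch.ones(4), nested=True, offset=torch.full((4,), 0.5))),
    torch.full((4,), 1.5),
)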

View File

@@ -1,10 +0,0 @@
"""Init for axolotl.loaders module"""
# pylint: disable=unused-import
# flake8: noqa
from .adapter import load_adapter, load_lora
from .constants import MULTIMODAL_AUTO_MODEL_MAPPING
from .model import ModelLoader
from .processor import load_processor
from .tokenizer import load_tokenizer

View File

@@ -1,206 +0,0 @@
"""Adapter loading functionality, including LoRA / QLoRA and associated utils"""
import logging
import os
import types
from typing import Any
import bitsandbytes as bnb
import torch
from bitsandbytes.nn import Params4bit
from peft import (
AdaptionPromptConfig,
LoftQConfig,
LoraConfig,
PeftConfig,
PeftMixedModel,
PeftModel,
get_peft_model,
)
from transformers import PreTrainedModel
from axolotl.loaders.utils import get_linear_embedding_layers
from axolotl.utils.dict import DictDefault
LOG = logging.getLogger(__name__)
def setup_quantized_meta_for_peft(model: torch.nn.Module):
"""Replaces `quant_state.to` with a dummy function to prevent PEFT from moving `quant_state` to meta device"""
def temp_to_method(self, *args, **kwargs): # pylint: disable=unused-argument
return self
for param in model.parameters():
if isinstance(param, Params4bit):
param.quant_state._orig_to = ( # pylint: disable=protected-access
param.quant_state.to
)
param.quant_state.to = types.MethodType(temp_to_method, param.quant_state)
def setup_quantized_peft_meta_for_training(model: torch.nn.Module):
"""Replaces dummy `quant_state.to` method with the original function to allow training to continue"""
for param in model.parameters():
if isinstance(param, Params4bit) and hasattr(param.quant_state, "_orig_to"):
param.quant_state.to = (
param.quant_state._orig_to # pylint: disable=protected-access
)
param.quant_state._orig_to = None # pylint: disable=protected-access
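# Toy illustration (separate from the helpers above) of the `.to` swap they perform:
# a dummy bound method makes `.to(...)` a no-op, and the original is stashed on
# `_orig_to` so it can be restored later. ToyState stands in for a Params4bit quant_state.
import types

class ToyState:
    def to(self, *args, **kwargs):
        return f"moved to {args[0]}"

state = ToyState()
state._orig_to = state.to
state.to = types.MethodType(lambda self, *a, **kw: self, state)  # as in setup_quantized_meta_for_peft
assert state.to("meta") is state  # PEFT can no longer move the quant state
state.to = state._orig_to  # restore, as in setup_quantized_peft_meta_for_training
assert state.to("cuda") == "moved to cuda"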
def find_all_linear_names(model):
cls = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt, torch.nn.Linear)
lora_module_names = set()
for name, module in model.named_modules():
if (
isinstance(module, cls)
or "Linear" in module.__class__.__name__
and module.__class__.__name__ not in ("LlamaLinearScalingRotaryEmbedding",)
):
names = name.split(".")
lora_module_names.add(names[0] if len(names) == 1 else names[-1])
embedding_modules = get_linear_embedding_layers(model.config.model_type)
output_embedding = embedding_modules[1]
if output_embedding in lora_module_names: # needed for 16-bit
lora_module_names.remove(output_embedding)
return list(lora_module_names)
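# Illustrative, torch-only sketch of what find_all_linear_names() collects on a tiny
# toy model; the real function also matches bitsandbytes Linear4bit/Linear8bitLt and
# drops the output embedding. The module names below are made up.
import torch

class _ToyBlock(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.q_proj = torch.nn.Linear(8, 8)
        self.k_proj = torch.nn.Linear(8, 8)
        self.down_proj = torch.nn.Linear(8, 8)

class _ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = torch.nn.ModuleList([_ToyBlock(), _ToyBlock()])

_names = set()
for _name, _module in _ToyModel().named_modules():
    if isinstance(_module, torch.nn.Linear):
        _parts = _name.split(".")
        _names.add(_parts[0] if len(_parts) == 1 else _parts[-1])
assert sorted(_names) == ["down_proj", "k_proj", "q_proj"]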
def load_lora(
model: PreTrainedModel,
cfg: DictDefault,
inference: bool = False,
config_only: bool = False,
) -> tuple[PreTrainedModel | PeftModel | PeftMixedModel | None, PeftConfig | None]:
lora_target_modules = cfg.lora_target_modules or []
if cfg.lora_target_linear:
linear_names = find_all_linear_names(model)
LOG.info(f"found linear modules: {repr(sorted(linear_names))}")
lora_target_modules_as_list = (
lora_target_modules
if isinstance(lora_target_modules, list)
else [lora_target_modules]
)
lora_target_modules = list(set(lora_target_modules_as_list + linear_names))
lora_config_kwargs = {}
loftq_bits = cfg.peft and cfg.peft.loftq_config and cfg.peft.loftq_config.loftq_bits
if loftq_bits:
lora_config_kwargs["loftq_config"] = LoftQConfig(loftq_bits=loftq_bits)
lora_config_kwargs["init_lora_weights"] = "loftq"
if cfg.peft_init_lora_weights:
lora_config_kwargs["init_lora_weights"] = cfg.peft_init_lora_weights
if cfg.peft_use_dora:
lora_config_kwargs["use_dora"] = cfg.peft_use_dora
LOG.info("Initializing LoRA weights using dora. This might take longer.")
if cfg.peft_use_rslora:
lora_config_kwargs["use_rslora"] = cfg.peft_use_rslora
if cfg.peft_layer_replication:
lora_config_kwargs["layer_replication"] = cfg.peft_layer_replication
lora_config = LoraConfig(
r=cfg.lora_r,
lora_alpha=cfg.lora_alpha,
target_modules=lora_target_modules,
layers_to_transform=cfg.peft_layers_to_transform,
layers_pattern=cfg.peft_layers_pattern,
lora_dropout=cfg.lora_dropout,
fan_in_fan_out=cfg.lora_fan_in_fan_out,
modules_to_save=cfg.lora_modules_to_save if cfg.lora_modules_to_save else None,
bias="none",
task_type="CAUSAL_LM",
**lora_config_kwargs,
)
if config_only:
return None, lora_config
rank = int(os.environ.get("LOCAL_RANK", 0))
if (
cfg.fsdp
and cfg.adapter
and cfg.fsdp_config.fsdp_cpu_ram_efficient_loading
and rank != 0
):
setup_quantized_meta_for_peft(model)
if cfg.lora_model_dir:
LOG.debug("Loading pretrained PEFT - LoRA")
model_kwargs: Any = {}
if cfg.lora_on_cpu:
model_kwargs["max_memory"] = {"cpu": "256GiB"}
model_kwargs["device_map"] = {"": "cpu"}
model = PeftModel.from_pretrained(
model,
cfg.lora_model_dir,
is_trainable=(not inference),
**model_kwargs,
)
else:
model = get_peft_model(model, lora_config)
if rank == 0:
try:
model.print_trainable_parameters()
except AttributeError as exc:
LOG.warning(
"Exception caught during model.print_trainable_parameters(): %s", exc
)
elif (
cfg.fsdp
and cfg.adapter
and cfg.fsdp_config.fsdp_cpu_ram_efficient_loading
and rank != 0
):
setup_quantized_peft_meta_for_training(model)
return model, lora_config
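# Hedged sketch of the config fields load_lora() reads when building the LoraConfig
# above; the values are illustrative, not recommendations. DictDefault is the config
# wrapper imported at the top of this module; missing keys simply resolve to None.
example_cfg = DictDefault(
    {
        "adapter": "qlora",
        "lora_r": 16,
        "lora_alpha": 32,
        "lora_dropout": 0.05,
        "lora_target_linear": True,
        "lora_target_modules": ["q_proj", "v_proj"],
        "lora_fan_in_fan_out": False,
    }
)
# load_lora(model, example_cfg, config_only=True) would then return (None, LoraConfig(...)).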
def load_adapter(
model: PreTrainedModel,
cfg: DictDefault,
adapter: str | None,
inference: bool = False,
) -> tuple[PreTrainedModel | PeftModel | PeftMixedModel, PeftConfig | None]:
if adapter is None:
return model, None
if hasattr(model, "enable_input_require_grads"):
model.enable_input_require_grads()
if adapter in ["lora", "qlora"]:
peft_model, lora_config = load_lora(model, cfg, inference=inference)
return peft_model, lora_config
if adapter == "llama-adapter":
peft_model, lora_config = load_llama_adapter(model, cfg)
return peft_model, lora_config
raise NotImplementedError(f"{adapter} PEFT adapter not available")
def load_llama_adapter(
model: PreTrainedModel, cfg: DictDefault
) -> tuple[PeftModel | PeftMixedModel, PeftConfig]:
peft_config = AdaptionPromptConfig(
adapter_layers=cfg.peft_adapter.layers, # layers (L)
adapter_len=cfg.peft_adapter.len, # prompt length (K)
task_type="CAUSAL_LM",
)
if cfg.lora_model_dir:
LOG.debug("Loading pretrained PEFT - llama_adapter")
peft_model = PeftModel.from_pretrained(
model,
cfg.lora_model_dir,
torch_dtype=torch.float16,
)
else:
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()
return peft_model, peft_config

View File

@@ -1,21 +0,0 @@
"""Shared constants for axolotl.loaders module"""
from transformers import (
Gemma3ForConditionalGeneration,
Llama4ForConditionalGeneration,
LlavaForConditionalGeneration,
Mistral3ForConditionalGeneration,
MllamaForConditionalGeneration,
Qwen2_5_VLForConditionalGeneration,
Qwen2VLForConditionalGeneration,
)
MULTIMODAL_AUTO_MODEL_MAPPING = {
"mllama": MllamaForConditionalGeneration,
"llama4": Llama4ForConditionalGeneration,
"llava": LlavaForConditionalGeneration,
"qwen2_vl": Qwen2VLForConditionalGeneration,
"qwen2_5_vl": Qwen2_5_VLForConditionalGeneration,
"mistral3": Mistral3ForConditionalGeneration,
"gemma3": Gemma3ForConditionalGeneration,
}
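# Hedged usage sketch: how a loader might pick the auto-model class from the mapping
# above, falling back to AutoModelForCausalLM for text-only models. The fallback and
# the model_type lookup are assumptions about how this constant is consumed.
from transformers import AutoModelForCausalLM

def pick_auto_model_cls(model_type: str):
    return MULTIMODAL_AUTO_MODEL_MAPPING.get(model_type, AutoModelForCausalLM)

assert pick_auto_model_cls("gemma3") is Gemma3ForConditionalGeneration
assert pick_auto_model_cls("llama") is AutoModelForCausalLM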

View File

@@ -1,754 +0,0 @@
"""Model loader class implementation for loading, configuring, and patching various
models.
"""
import gc
import logging
import math
import os
from functools import cached_property
from importlib.util import find_spec
from typing import Any
import peft
import torch
import transformers
import transformers.modeling_utils
from accelerate import init_empty_weights
from peft import PeftConfig, PeftMixedModel, PeftModel, prepare_model_for_kbit_training
from transformers import (
AutoModelForCausalLM,
AutoModelForVision2Seq,
AwqConfig,
BitsAndBytesConfig,
GPTQConfig,
PreTrainedModel,
PreTrainedTokenizerBase,
)
from transformers.integrations.deepspeed import (
HfTrainerDeepSpeedConfig,
is_deepspeed_zero3_enabled,
)
from axolotl.common.architectures import MOE_ARCH_BLOCK
from axolotl.integrations.base import PluginManager
from axolotl.loaders.adapter import load_adapter, load_lora
from axolotl.loaders.constants import MULTIMODAL_AUTO_MODEL_MAPPING
from axolotl.loaders.patch_manager import PatchManager
from axolotl.loaders.utils import (
get_linear_embedding_layers,
get_module_class_from_name,
load_model_config,
)
from axolotl.models.mamba import fix_mamba_attn_for_loss
from axolotl.utils.bench import log_gpu_memory_usage
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import (
get_device_count,
get_device_type,
)
from axolotl.utils.model_shard_quant import load_sharded_model_quant
from axolotl.utils.schemas.enums import RLType
LOG = logging.getLogger(__name__)
PLUGIN_MANAGER = PluginManager.get_instance()
class ModelLoader:
"""Manages model configuration, initialization and application of patches during
model loading.
This class orchestrates the entire process of loading a model from configuration to
final preparation. It handles device mapping, quantization, attention mechanisms,
adapter integration, and various optimizations.
The loading process includes:
- Loading and validating model configuration
- Applying monkey patches for optimizations / fixes
- Setting up device mapping (including multi-GPU configurations)
- Configuring quantization
- Setting attention mechanisms (Flash Attention, SDPA, etc.)
- Loading and initializing the model
- Applying adapters (LoRA, QLoRA, etc.)
Attributes:
model: The loaded model instance (available after load() is called).
model_kwargs: Dictionary of keyword arguments passed to model initialization.
base_model: Name or path of the base model to load.
model_type: Type of model to load (e.g., `AutoModelForCausalLM`).
model_config: Configuration object for the model.
auto_model_loader: class used for loading the model (default:
`AutoModelForCausalLM`).
"""
def __init__(
self,
cfg: DictDefault,
tokenizer: PreTrainedTokenizerBase,
*,
inference: bool = False,
reference_model: bool = False,
**kwargs, # pylint: disable=unused-argument
):
"""Initializes the ModelLoader.
Args:
cfg: Configuration dictionary with model and training settings.
tokenizer: Tokenizer instance associated with the model.
inference: Whether the model is being loaded for inference mode. Defaults
to False.
reference_model: Whether this is a reference model (used in setups like DPO
training). Defaults to False.
**kwargs: Additional keyword arguments (ignored).
"""
self.cfg = cfg
self.tokenizer = tokenizer
self.inference: bool = inference
self.reference_model: bool = reference_model
# Init model kwargs
self.model_kwargs: dict[str, Any] = {}
if cfg.overrides_of_model_kwargs:
for key, val in cfg.overrides_of_model_kwargs.items():
self.model_kwargs[key] = val
# Init model
self.model: PreTrainedModel | PeftModel | PeftMixedModel
self.base_model = cfg.base_model
self.model_type = cfg.type_of_model
# Init model config
self.model_config = load_model_config(cfg)
self.auto_model_loader = AutoModelForCausalLM # pylint: disable=invalid-name
# Initialize the patch manager
self.patch_manager = PatchManager(
cfg=cfg,
model_config=self.model_config,
inference=inference,
)
@cached_property
def has_flash_attn(self) -> bool:
"""Check if flash attention is installed."""
return find_spec("flash_attn") is not None
@cached_property
def qlora_fsdp(self):
"""Property that determines if FSDP with QLoRA is enabled."""
return self.cfg.fsdp and self.cfg.adapter == "qlora"
def load(self) -> tuple[PreTrainedModel, PeftConfig | None]:
"""Load and prepare the model with all configurations and patches.
Returns:
A tuple with the loaded model and its LoRA configuration (if applicable).
"""
# Initial setup and patches
self.patch_manager.apply_pre_model_load_patches()
self._apply_pre_model_load_setup()
# Build the model
PLUGIN_MANAGER.pre_model_load(self.cfg)
skip_move_to_device = self._build_model()
PLUGIN_MANAGER.post_model_build(self.cfg, self.model)
# Post-build model configuration
self._apply_post_model_load_setup()
# Load adapters (LoRA, etc.)
PLUGIN_MANAGER.pre_lora_load(self.cfg, self.model)
lora_config = self._load_adapters()
PLUGIN_MANAGER.post_lora_load(self.cfg, self.model)
# Apply remaining patches and finalize
self._apply_post_lora_load_setup(skip_move_to_device)
self.patch_manager.apply_post_model_load_patches(self.model)
PLUGIN_MANAGER.post_model_load(self.cfg, self.model)
return self.model, lora_config
def _apply_pre_model_load_setup(self):
"""Apply patches and setup configurations before model loading."""
self._set_auto_model_loader()
self._set_device_map_config()
if self.cfg.revision_of_model:
self.model_kwargs["revision"] = self.cfg.revision_of_model
self._set_quantization_config()
self._set_attention_config()
def _apply_post_model_load_setup(self):
"""Configure the model after it has been loaded."""
# Handle PeftModel if needed
if (
isinstance(self.model, (peft.PeftModel, peft.PeftModelForCausalLM))
and not self.qlora_fsdp
):
self.model = self.model.merge_and_unload()
self._resize_token_embeddings()
self._adjust_model_config()
self._log_memory_usage()
self._configure_embedding_dtypes()
def _resize_token_embeddings(self):
"""Resize token embeddings if needed."""
embeddings_len = (
math.ceil(len(self.tokenizer) / 32) * 32
if self.cfg.resize_token_embeddings_to_32x
else len(self.tokenizer)
)
if hasattr(self.model, "get_input_embeddings") and (
self.model.get_input_embeddings().num_embeddings < embeddings_len
or (
self.model.get_input_embeddings().num_embeddings > embeddings_len
and self.cfg.shrink_embeddings
)
):
resize_kwargs = {}
if self.cfg.mean_resizing_embeddings is not None and (
self.model_config.model_type != "llava"
):
resize_kwargs["mean_resizing"] = self.cfg.mean_resizing_embeddings
self.model.resize_token_embeddings(embeddings_len, **resize_kwargs)
else:
self.model.tie_weights()
def _adjust_model_config(self):
if (
hasattr(self.model, "config")
and hasattr(self.model.config, "max_position_embeddings")
and self.model.config.max_position_embeddings
and self.cfg.sequence_len > self.model.config.max_position_embeddings
):
LOG.warning(
"increasing model.config.max_position_embeddings from "
f"{self.model.config.max_position_embeddings} to {self.cfg.sequence_len}"
)
self.model.config.max_position_embeddings = self.cfg.sequence_len
if (
hasattr(self.model, "config")
and hasattr(self.model.config, "bos_token_id")
and self.model.config.bos_token_id
and self.model.config.bos_token_id != self.tokenizer.bos_token_id
):
self.model.config.bos_token_id = self.tokenizer.bos_token_id
if (
hasattr(self.model, "config")
and hasattr(self.model.config, "eos_token_id")
and self.model.config.eos_token_id
and self.model.config.eos_token_id != self.tokenizer.eos_token_id
):
self.model.config.eos_token_id = self.tokenizer.eos_token_id
def _log_memory_usage(self):
"""Log device memory usage after model load."""
if hasattr(self.model, "device") and self.model.device.type in (
"cuda",
"mps",
"npu",
):
log_gpu_memory_usage(LOG, "after model load", self.model.device)
def _configure_embedding_dtypes(self):
"""Configure embedding module dtypes."""
# Get embedding modules
embedding_modules = get_linear_embedding_layers(self.cfg.model_config_type)
# Initial dtype conversion
if not self.cfg.fsdp:
# We skip this under FSDP because it would leave a mix of fp32 and bfloat16
# dtypes in the model, which FSDP doesn't handle well
if self.cfg.load_in_4bit and self.cfg.embeddings_skip_upcast:
embedding_modules = []
self._convert_embedding_modules_dtype(
embedding_modules,
dist_dtype=torch.float32,
before_kbit_train_or_finetune=True,
)
# Handle DeepSpeed Zero3
if is_deepspeed_zero3_enabled():
self._set_z3_leaf_modules()
# Apply gradient checkpointing if needed
needs_fa2_dtype = self.cfg.adapter or self.cfg.fsdp
if self.cfg.adapter in ["lora", "qlora"]:
needs_fa2_dtype = True
if self.cfg.gradient_checkpointing:
self.model.gradient_checkpointing_enable(
gradient_checkpointing_kwargs=self.cfg.gradient_checkpointing_kwargs
)
self._prepare_model_for_quantization()
# Convert dtypes if needed
should_convert = (
# LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so
# we need to convert them back to fp16/bf16 for flash-attn compatibility.
(
(needs_fa2_dtype or self.cfg.flash_attention or self.cfg.flex_attention)
and not self.qlora_fsdp
)
# CCE requires embedding layers to be in fp16/bf16 for backward pass
or self.cfg.cut_cross_entropy
)
if should_convert:
LOG.info("Converting modules to %s", self.cfg.torch_dtype)
self._convert_embedding_modules_dtype(
embedding_modules=embedding_modules,
dist_dtype=self.cfg.torch_dtype,
before_kbit_train_or_finetune=False,
)
def _load_adapters(self) -> PeftConfig | None:
"""Load LoRA or other adapters."""
# Load LoRA or adapter
lora_config = None
if not self.reference_model or self.cfg.lora_model_dir:
# If we're not loading the reference model, then we're loading the model
# for training. In that case, the DPO trainer doesn't want the PEFT model
# loaded over it; it just wants the LoRA / PEFT config.
if (
self.cfg.adapter
and self.cfg.rl in [RLType.DPO, RLType.IPO, RLType.KTO]
and not self.cfg.merge_lora
):
_, lora_config = load_lora(
self.model, self.cfg, inference=False, config_only=True
)
else:
self.model, lora_config = load_adapter(
self.model, self.cfg, self.cfg.adapter
)
return lora_config
def _apply_post_lora_load_setup(self, skip_move_to_device: bool):
"""Apply final optimizations and patches."""
# Place model on accelerator
if (
self.cfg.ddp
and not self.cfg.load_in_8bit
and not (self.cfg.rl and self.cfg.load_in_4bit)
and not skip_move_to_device
):
# TODO: validate this conditional
self.model.to(f"{str(get_device_type())}:{self.cfg.local_rank}")
if get_device_count() > 1 and int(os.getenv("WORLD_SIZE", "1")) == 1:
self.model.is_parallelizable = True
self.model.model_parallel = True
if not any(
param.requires_grad
for _, param in self.model.named_parameters(recurse=True)
):
LOG.warning("There are no parameters that require gradient updates")
if self.cfg.flash_optimum:
from optimum.bettertransformer import BetterTransformer
self.model = BetterTransformer.transform(self.model)
if self.cfg.adapter is not None:
log_gpu_memory_usage(LOG, "after adapters", self.model.device)
for _ in range(3):
gc.collect()
torch.cuda.empty_cache()
def _set_auto_model_loader(self):
"""Set `self.auto_model_loader`. Defaults to `transformers.AutoModelForCausalLM`
(set at `__init__`). When using a multimodal model, `self.auto_model_loader`
should be set according to the type of the model.
"""
if self.cfg.is_multimodal:
self.auto_model_loader = MULTIMODAL_AUTO_MODEL_MAPPING.get(
self.model_config.model_type, AutoModelForVision2Seq
)
def _set_device_map_config(self):
"""Setup `device_map` according to config"""
device_map = self.cfg.device_map
max_memory = self.cfg.max_memory
if self.cfg.gpu_memory_limit:
gpu_memory_limit = (
str(self.cfg.gpu_memory_limit) + "GiB"
if isinstance(self.cfg.gpu_memory_limit, int)
else self.cfg.gpu_memory_limit
)
max_memory = {}
num_device = get_device_count()
for i in range(num_device):
max_memory[i] = gpu_memory_limit
max_memory["cpu"] = "256GiB" # something sufficiently large to fit anything
if max_memory is not None:
# Based on https://github.com/togethercomputer/OpenChatKit/blob/main/inference/bot.py
from accelerate import infer_auto_device_map
with init_empty_weights():
model_canvas = self.auto_model_loader.from_config(
self.model_config,
trust_remote_code=self.cfg.trust_remote_code or False,
)
model_canvas.tie_weights()
device_map = infer_auto_device_map(
model_canvas,
max_memory=max_memory,
dtype=self.cfg.torch_dtype,
)
# We can discard max_memory now as we have a device map set up
max_memory = None
self.model_kwargs["torch_dtype"] = self.cfg.torch_dtype
if not is_deepspeed_zero3_enabled():
self.model_kwargs["device_map"] = device_map
cur_device = get_device_type()
if "mps" in str(cur_device):
self.model_kwargs["device_map"] = "mps:0"
elif "npu" in str(cur_device):
self.model_kwargs["device_map"] = "npu:0"
# TODO: can we put the reference model on its own GPU? I think we have to move
# logits around to calculate loss
# if cfg.rl:
# if torch.cuda.device_count() > 1:
# if reference_model:
# model_kwargs["device_map"] = "cuda:" + str(
# torch.cuda.current_device() + 1
# )
# else:
# model_kwargs["device_map"] = "cuda:" + str(torch.cuda.current_device())
def _set_quantization_config(self):
"""Set up quantization config (bitsandbytes, awq, gptq, etc.)"""
self.model_kwargs["load_in_8bit"] = self.cfg.load_in_8bit
self.model_kwargs["load_in_4bit"] = self.cfg.load_in_4bit
if self.cfg.gptq:
if not hasattr(self.model_config, "quantization_config"):
LOG.warning(
"model config does not contain quantization_config information"
)
else:
if self.cfg.gptq_disable_exllama is not None:
self.model_config.quantization_config["disable_exllama"] = (
self.cfg.gptq_disable_exllama
)
self.model_kwargs["quantization_config"] = GPTQConfig(
**self.model_config.quantization_config
)
if (
self.cfg.adapter in ["qlora", "lora"]
and hasattr(self.model_config, "quantization_config")
and self.model_config.quantization_config["quant_method"]
in ["gptq", "awq", "bitsandbytes"]
):
if self.model_config.quantization_config["quant_method"] == "gptq":
self.model_kwargs["quantization_config"] = GPTQConfig(
**self.model_config.quantization_config
)
elif self.model_config.quantization_config["quant_method"] == "awq":
self.model_kwargs["quantization_config"] = AwqConfig(
**self.model_config.quantization_config
)
elif (
self.model_config.quantization_config["quant_method"] == "bitsandbytes"
):
self.model_kwargs["quantization_config"] = BitsAndBytesConfig(
**self.model_config.quantization_config
)
elif self.cfg.adapter == "qlora" and self.model_kwargs["load_in_4bit"]:
bnb_config = {
"load_in_4bit": True,
"llm_int8_threshold": 6.0,
"llm_int8_has_fp16_weight": False,
"bnb_4bit_compute_dtype": self.cfg.torch_dtype,
"bnb_4bit_use_double_quant": True,
"bnb_4bit_quant_type": "nf4",
"bnb_4bit_quant_storage": torch.bfloat16,
}
if self.cfg.model_config_type in ["jamba", "qwen2_moe"] and not (
self.cfg.deepspeed or self.cfg.fsdp
):
# for some reason, bfloat16 quant storage causes the loss to be off by an
# order of magnitude here, but deepspeed still needs this in bfloat16
bnb_config["bnb_4bit_quant_storage"] = torch.float32
if self.cfg.bnb_config_kwargs:
bnb_config.update(self.cfg.bnb_config_kwargs)
self.model_kwargs["quantization_config"] = BitsAndBytesConfig(
**bnb_config,
)
elif self.cfg.adapter == "lora" and self.model_kwargs["load_in_8bit"]:
bnb_config = {
"load_in_8bit": True,
}
# Exclude mamba blocks from int8 quantization for jamba
if self.cfg.model_config_type == "jamba":
bnb_config["llm_int8_skip_modules"] = ["mamba"]
self.model_kwargs["quantization_config"] = BitsAndBytesConfig(
**bnb_config,
)
# no longer needed per https://github.com/huggingface/transformers/pull/26610
if "quantization_config" in self.model_kwargs or self.cfg.gptq:
self.model_kwargs.pop("load_in_8bit", None)
self.model_kwargs.pop("load_in_4bit", None)
def _set_attention_config(self):
"""Sample packing uses custom FA2 patch"""
if self.cfg.flex_attention:
self.model_kwargs["attn_implementation"] = "flex_attention"
self.model_config._attn_implementation = ( # pylint: disable=protected-access
"flex_attention"
)
elif self.cfg.flash_attention:
if not self.cfg.sample_packing and self.cfg.s2_attention:
pass
self.model_kwargs["attn_implementation"] = "flash_attention_2"
self.model_config._attn_implementation = ( # pylint: disable=protected-access
"flash_attention_2"
)
elif self.cfg.sdp_attention:
self.model_kwargs["attn_implementation"] = "sdpa"
self.model_config._attn_implementation = ( # pylint: disable=protected-access
"sdpa"
)
elif self.cfg.eager_attention:
self.model_kwargs["attn_implementation"] = "eager"
self.model_config._attn_implementation = ( # pylint: disable=protected-access
"eager"
)
if self.cfg.low_cpu_mem_usage:
self.model_kwargs["low_cpu_mem_usage"] = True
def _configure_zero3_memory_efficient_loading(self):
"""Set the deepspeed config to load the model into RAM first before moving
to VRAM.
We need to return `hf_ds_cfg` as it needs to exist before model loading.
"""
hf_ds_cfg = None
if os.getenv("ACCELERATE_DEEPSPEED_ZERO_STAGE") == "3":
hf_ds_cfg = HfTrainerDeepSpeedConfig(self.cfg.deepspeed)
hf_ds_cfg.fill_match(
"train_micro_batch_size_per_gpu", self.cfg.micro_batch_size
)
hf_ds_cfg.fill_match(
"gradient_accumulation_steps", self.cfg.gradient_accumulation_steps
)
hf_ds_cfg.fill_match(
"train_batch_size",
int(os.getenv("WORLD_SIZE", "1"))
* self.cfg.micro_batch_size
* self.cfg.gradient_accumulation_steps,
)
if "device_map" in self.model_kwargs:
del self.model_kwargs["device_map"]
transformers.modeling_utils.is_deepspeed_zero3_enabled = lambda: True
transformers.integrations.deepspeed.is_deepspeed_zero3_enabled = (
lambda: True
)
return hf_ds_cfg
def _build_model(self) -> bool:
"""Load model, with load strategy depending on config."""
skip_move_to_device = False
if (
self.qlora_fsdp
and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading
and (
self.cfg.model_config_type == "dbrx"
or self.cfg.qlora_sharded_model_loading
)
):
quant_storage = self.cfg.torch_dtype
quantization_config = getattr(
self.model_config, "quantization_config", None
)
quantization_config = (
quantization_config or self.model_kwargs["quantization_config"]
)
self.model = load_sharded_model_quant(
self.base_model,
self.model_config,
self.cfg,
quant_storage=quant_storage,
quantization_config=quantization_config,
)
skip_move_to_device = True
elif (
self.model_config.model_type in ["llama", "llama4"]
and not self.cfg.trust_remote_code
and not self.cfg.gptq
):
# TODO: Do we need to open this up for all models?
if self.cfg.fsdp and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading:
skip_move_to_device = True
if "device_map" in self.model_kwargs:
del self.model_kwargs["device_map"]
self._configure_zero3_memory_efficient_loading()
# Load model with random initialization if specified
if self.cfg.random_init_weights:
# AutoModel classes support the from_config method
if self.auto_model_loader in [
AutoModelForCausalLM,
AutoModelForVision2Seq,
]:
self.model = self.auto_model_loader.from_config(
config=self.model_config,
)
else:
self.model = self.auto_model_loader(config=self.model_config)
else:
self.model = self.auto_model_loader.from_pretrained(
self.base_model,
config=self.model_config,
**self.model_kwargs,
)
elif self.model_type == "MambaLMHeadModel":
# FIXME this is janky at best and hacked together to make it work
MambaLMHeadModel = fix_mamba_attn_for_loss() # pylint: disable=invalid-name
self.model_kwargs["dtype"] = self.model_kwargs["torch_dtype"]
self.model_kwargs["device"] = torch.cuda.current_device()
self.model_kwargs.pop("torch_dtype", None)
self.model_kwargs.pop("device_map", None)
self.model = MambaLMHeadModel.from_pretrained(
self.base_model,
**self.model_kwargs,
)
elif (
self.model_type
and self.model_type != "AutoModelForCausalLM"
and not self.cfg.trust_remote_code
):
if self.cfg.gptq:
self.model = self.auto_model_loader.from_pretrained(
self.base_model,
config=self.model_config,
trust_remote_code=self.cfg.trust_remote_code or False,
**self.model_kwargs,
)
else:
self.model = getattr(transformers, self.model_type).from_pretrained(
self.base_model,
config=self.model_config,
trust_remote_code=self.cfg.trust_remote_code or False,
**self.model_kwargs,
)
else:
if self.cfg.gptq:
self.model = self.auto_model_loader.from_pretrained(
self.base_model,
config=self.model_config,
trust_remote_code=self.cfg.trust_remote_code or False,
**self.model_kwargs,
)
else:
if (
self.cfg.fsdp
and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading
):
# disabling either of these two still leads to a VRAM spike before settling back down
skip_move_to_device = True
if "device_map" in self.model_kwargs:
del self.model_kwargs["device_map"]
self._configure_zero3_memory_efficient_loading()
self.model = self.auto_model_loader.from_pretrained(
self.base_model,
config=self.model_config,
trust_remote_code=self.cfg.trust_remote_code or False,
**self.model_kwargs,
)
if is_deepspeed_zero3_enabled():
skip_move_to_device = True
return skip_move_to_device
def _set_z3_leaf_modules(self):
from deepspeed.utils import set_z3_leaf_modules
if self.cfg.model_config_type in MOE_ARCH_BLOCK:
moe_blocks = MOE_ARCH_BLOCK[self.cfg.model_config_type]
moe_blocks = [moe_blocks] if isinstance(moe_blocks, str) else moe_blocks
set_z3_leaf_modules(
self.model,
[
get_module_class_from_name(self.model, module_name)
for module_name in moe_blocks
],
)
def _prepare_model_for_quantization(self):
"""Prepare loaded model for quantization."""
skip_prepare_model_for_kbit_training = False
if self.cfg.model_config_type == "qwen" and self.cfg.adapter == "lora":
# Qwen doesn't play nicely with LoRA if this is enabled
skip_prepare_model_for_kbit_training = True
loftq_bits = (
self.cfg.peft
and self.cfg.peft.loftq_config
and self.cfg.peft.loftq_config.loftq_bits
)
if self.cfg.adapter == "lora" and loftq_bits:
skip_prepare_model_for_kbit_training = True
if (
self.qlora_fsdp
or (self.cfg.fsdp and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading)
or is_deepspeed_zero3_enabled()
):
# Make sure everything is in the same dtype
skip_prepare_model_for_kbit_training = True
if (
not skip_prepare_model_for_kbit_training
and self.cfg.adapter in ["lora", "qlora"]
and (self.cfg.load_in_8bit or self.cfg.load_in_4bit)
):
LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
self.model = prepare_model_for_kbit_training(
self.model, use_gradient_checkpointing=self.cfg.gradient_checkpointing
)
def _convert_embedding_modules_dtype(
self,
embedding_modules: list[str],
dist_dtype: torch.dtype,
before_kbit_train_or_finetune: bool,
):
for name, module in self.model.named_modules():
if "norm" in name:
module.to(dist_dtype)
if before_kbit_train_or_finetune:
if name.endswith(".gate"):
module.to(dist_dtype)
if self.model_config.model_type == "btlm":
# don't upcast lm_head for btlm
continue
if any(m in name for m in embedding_modules) and hasattr(module, "weight"):
module.to(dist_dtype)
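A minimal sketch of driving the loader end to end, assuming `cfg` is a validated axolotl config and `tokenizer` was produced by the tokenizer loader shown later in this diff:

loader = ModelLoader(cfg, tokenizer, inference=False)
model, lora_config = loader.load()  # patches, quantization and adapters applied in order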

View File

@@ -1,380 +0,0 @@
"""Patch manager class implementation to complement `axolotl.loaders.ModelLoader`.
Applies pre- and post-model load patches for various fixes and optimizations.
"""
import importlib.util
import logging
from functools import cached_property
import addict
import transformers
from transformers import PretrainedConfig, PreTrainedModel
from axolotl.integrations.base import PluginManager
from axolotl.monkeypatch.multipack import (
SUPPORTED_MULTIPACK_MODEL_TYPES,
patch_for_multipack,
)
from axolotl.utils.dict import DictDefault
LOG = logging.getLogger(__name__)
PLUGIN_MANAGER = PluginManager.get_instance()
class PatchManager:
"""Manages the application of patches during the model loading process."""
def __init__(
self,
cfg: DictDefault,
model_config: PretrainedConfig | addict.Dict,
inference: bool = False,
):
"""Initialize the `PatchManager`.
Args:
cfg: Configuration dictionary with model and training settings.
model_config: Configuration object for the model.
inference: Whether the model is being loaded for inference mode.
"""
self.cfg = cfg
self.model_config = model_config
self.inference = inference
@cached_property
def has_flash_attn(self) -> bool:
"""Check if flash attention is installed."""
return importlib.util.find_spec("flash_attn") is not None
def apply_pre_model_load_patches(self):
"""Apply pre-model load patches based on config."""
self._apply_flash_attention_patches()
self._apply_fsdp_patches()
self._apply_adapter_patches()
self._apply_flex_attention_patches()
self._apply_model_specific_patches()
self._apply_fp8_patches()
self._apply_flash_attention_peft_patches()
self._apply_gradient_checkpointing_patches()
self._patch_attention()
self._apply_multipack_patches()
self._patch_llama_derived_model()
self._apply_mistral_cross_entropy_patch()
self._apply_unsloth_self_attention_patch()
def apply_post_model_load_patches(self, model: PreTrainedModel):
"""Apply patches that require the model instance."""
self._apply_llama_flash_attn_patches(model)
self._apply_unsloth_patches(model)
self._apply_lora_kernel_patch(model)
def _apply_flash_attention_patches(self):
"""Apply patches related to Flash Attention."""
if self.cfg.xformers_attention and self.cfg.sample_packing:
from axolotl.monkeypatch.attention import patch_xformers_attn_over_fa2
patch_xformers_attn_over_fa2()
self.cfg.flash_attention = True
def _apply_fsdp_patches(self):
"""Apply patches for FSDP configurations."""
if self.cfg.fsdp_config and str(self.cfg.fsdp_config.fsdp_version) == "2":
from axolotl.monkeypatch.accelerate.fsdp2 import patch_accelerate_fsdp_utils
patch_accelerate_fsdp_utils()
def _apply_adapter_patches(self):
"""Apply patches for adapter configurations."""
if self.cfg.adapter and self.cfg.embeddings_skip_upcast:
from axolotl.monkeypatch.peft.utils import patch_peft_prep_code
patch_peft_prep_code()
def _apply_flex_attention_patches(self):
"""Apply patches for flexible attention."""
if self.cfg.flex_attention:
from axolotl.monkeypatch.attention.flex_attn import (
patch_flex_make_mask,
patch_flex_wrapper,
)
flex_attn_compile_kwargs = self.cfg.flex_attn_compile_kwargs or {}
patch_flex_wrapper(**flex_attn_compile_kwargs)
patch_flex_make_mask()
def _apply_model_specific_patches(self):
"""Apply patches specific to model architectures."""
if (
self.cfg.model_config_type == "llama4"
and self.cfg.llama4_linearized_experts
):
from axolotl.monkeypatch.models.llama4.modeling import (
patch_llama4_linearized_modeling,
)
patch_llama4_linearized_modeling()
if self.cfg.model_config_type == "gemma3":
from axolotl.monkeypatch.gemma3 import (
patch_gemma3conditionalgeneration_forward,
)
patch_gemma3conditionalgeneration_forward()
def _apply_fp8_patches(self):
"""Apply patches for FP8 support."""
if self.cfg.fp8:
from axolotl.monkeypatch.trainer_accelerator_args import (
patch_create_accelerate_code_for_fp8,
)
patch_create_accelerate_code_for_fp8()
def _apply_flash_attention_peft_patches(self):
"""Apply patches for Flash Attention with PEFT."""
if self.cfg.adapter:
from axolotl.monkeypatch.transformers_fa_utils import (
patch_fa_peft_integration,
)
patch_fa_peft_integration()
def _apply_gradient_checkpointing_patches(self):
"""Apply patches for gradient checkpointing."""
if self.cfg.gradient_checkpointing in ["unsloth", "offload"]:
from axolotl.monkeypatch.gradient_checkpointing import (
hf_grad_checkpoint_offload_wrapper,
)
transformers.modeling_utils.checkpoint = hf_grad_checkpoint_offload_wrapper
if self.cfg.gradient_checkpointing == "offload_disk":
from axolotl.monkeypatch.gradient_checkpointing import (
hf_grad_checkpoint_disk_offload_wrapper,
)
transformers.modeling_utils.checkpoint = (
hf_grad_checkpoint_disk_offload_wrapper
)
def _apply_mistral_cross_entropy_patch(self):
"""Apply Mistral cross entropy patch if configured."""
if (
self.cfg.model_config_type == "mistral"
and self.cfg.flash_attn_cross_entropy_loss
):
from axolotl.monkeypatch.mistral_attn_hijack_flash import (
patch_mistral_cross_entropy,
)
patch_mistral_cross_entropy()
def _apply_unsloth_self_attention_patch(self):
"""Apply Unsloth self-attention patches if configured."""
if self.cfg.unsloth_lora_qkv or self.cfg.unsloth_lora_o:
from axolotl.monkeypatch.lora_kernels import patch_self_attn_lora
patch_self_attn_lora(self.cfg)
def _apply_multipack_patches(self):
"""Apply multipack patches if necessary."""
if (
self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES
and (self.cfg.flash_attention or self.cfg.flex_attention)
and self.cfg.sample_packing
):
# Get automap config if it exists
auto_map_config = None
if isinstance(self.model_config, dict) and "auto_map" in self.model_config:
auto_map_config = self.model_config["auto_map"]
elif hasattr(self.model_config, "auto_map"):
auto_map_config = self.model_config.auto_map
# Determine if the model has remote code
if auto_map_config is not None:
has_remote_code = "AutoModelForCausalLM" in auto_map_config
else:
has_remote_code = False
if has_remote_code and self.cfg.trust_remote_code is False:
# If explicitly set in YAML, prefer that
has_remote_code = self.cfg.trust_remote_code
patch_for_multipack(
self.cfg.model_config_type,
model_name=self.cfg.base_model,
has_remote_code=has_remote_code,
)
if self.cfg.is_llama_derived_model:
self._patch_loss_llama()
def _patch_attention(self):
"""Apply attention-specific patches based on model type."""
if not (self.cfg.flash_attention and hasattr(self.model_config, "model_type")):
return
if self.model_config.model_type == "mllama" and self.cfg.flash_attention:
from axolotl.monkeypatch.attention.mllama import patch_mllama
patch_mllama()
if self.model_config.model_type == "btlm":
from axolotl.monkeypatch.btlm_attn_hijack_flash import (
replace_btlm_attn_with_flash_attn,
)
replace_btlm_attn_with_flash_attn(self.cfg.base_model)
if self.model_config.model_type == "stablelm_epoch" and self.cfg.sample_packing:
from axolotl.monkeypatch.stablelm_attn_hijack_flash import (
replace_stablelm_attn_with_flash_attn,
)
replace_stablelm_attn_with_flash_attn(self.cfg.base_model)
def _patch_loss_llama(self):
"""Patch loss functions and other optimizations for LLaMA models."""
if self.cfg.flash_attn_cross_entropy and self.has_flash_attn:
from axolotl.monkeypatch.llama_attn_hijack_flash import (
patch_fa_llama_cross_entropy,
)
patch_fa_llama_cross_entropy()
elif self.cfg.unsloth_cross_entropy_loss:
from axolotl.monkeypatch.unsloth_ import integrate_cross_entropy_loss_patch
integrate_cross_entropy_loss_patch(model_type="llama")
if self.cfg.flash_attn_rms_norm and self.has_flash_attn:
from axolotl.monkeypatch.llama_attn_hijack_flash import patch_llama_rms_norm
patch_llama_rms_norm()
elif self.cfg.unsloth_rms_norm:
from axolotl.monkeypatch.unsloth_ import patch_unsloth_layernorm
patch_unsloth_layernorm()
if self.cfg.unsloth_lora_qkv or self.cfg.unsloth_lora_o:
from axolotl.monkeypatch.unsloth_ import patch_self_attn_lora
patch_self_attn_lora()
def _patch_llama_flash_attention(self, packed=False):
"""Apply Flash Attention patches for LLaMA models."""
from axolotl.monkeypatch.llama_attn_hijack_flash import (
replace_llama_attn_with_flash_attn,
)
if packed:
if self.cfg.device not in ["mps", "cpu"] and not self.inference:
LOG.info("patching with flash attention for sample packing")
replace_llama_attn_with_flash_attn(
packed=True,
cross_entropy=self.cfg.flash_attn_cross_entropy,
rms_norm=self.cfg.flash_attn_rms_norm,
)
elif self.cfg.s2_attention:
LOG.info("patching w/ flash-enabled, shifted-sparse attention")
replace_llama_attn_with_flash_attn(
packed=False,
cross_entropy=self.cfg.flash_attn_cross_entropy,
rms_norm=self.cfg.flash_attn_rms_norm,
use_shifted_sparse_attn=True,
)
elif self.cfg.flash_attn_cross_entropy or self.cfg.flash_attn_rms_norm:
replace_llama_attn_with_flash_attn(
packed=False,
cross_entropy=self.cfg.flash_attn_cross_entropy,
rms_norm=self.cfg.flash_attn_rms_norm,
)
def _patch_llama_xformers_attention(self):
"""Apply xformers attention patches for LLaMA models."""
from axolotl.monkeypatch.llama_attn_hijack_xformers import (
hijack_llama_attention,
)
LOG.info("Patching with xformers attention...")
hijack_llama_attention()
def _patch_llama_sample_packing(self):
"""Apply sample packing patches for LLaMA models."""
from axolotl.monkeypatch.llama_patch_multipack import (
hijack_llama_prepare_4d_mask,
)
LOG.info("Patching llama _prepare_4d_causal_attention_mask*...")
hijack_llama_prepare_4d_mask()
def _patch_llama_derived_model(self):
"""Modify all llama derived models in one block."""
if self.cfg.is_llama_derived_model and not (
self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES
and (self.cfg.flash_attention or self.cfg.flex_attention)
and self.cfg.sample_packing
):
self._patch_loss_llama()
if self.cfg.flash_attention:
self._patch_llama_flash_attention(packed=self.cfg.sample_packing)
elif self.cfg.xformers_attention:
self._patch_llama_xformers_attention()
elif self.cfg.sample_packing:
self._patch_llama_sample_packing()
elif self.cfg.s2_attention:
raise NotImplementedError(
"Shifted-sparse attention not currently implemented without flash attention."
)
def _apply_llama_flash_attn_patches(self, model):
"""Apply LLaMA-specific flash attention patches."""
if (
self.model_config.model_type in ["llama", "llama4"]
and not self.cfg.trust_remote_code
and not self.cfg.gptq
and self.cfg.flash_attention
and not self.inference
):
# TODO(MengqingCao): split these patches separately
from axolotl.monkeypatch.llama_attn_hijack_flash import (
is_xformers_swiglu_available,
replace_llama_mlp_with_swiglu,
replace_llama_qkv_with_fused,
)
if self.cfg.flash_attn_fuse_mlp and is_xformers_swiglu_available():
LOG.info("Patching with SwiGLU...")
replace_llama_mlp_with_swiglu(model)
if self.cfg.flash_attn_fuse_qkv:
LOG.info("Patching with fused QKV...")
replace_llama_qkv_with_fused(model)
def _apply_unsloth_patches(self, model):
"""Apply unsloth optimization patches."""
if self.cfg.unsloth_lora_mlp:
from axolotl.monkeypatch.unsloth_ import integrate_lora_mlp_patch
integrate_lora_mlp_patch(peft_model=model)
if self.cfg.unsloth_lora_qkv or self.cfg.unsloth_lora_o:
from axolotl.monkeypatch.unsloth_ import integrate_lora_patch
integrate_lora_patch(peft_model=model, cfg=self.cfg)
if self.cfg.unsloth_rope:
from axolotl.monkeypatch.unsloth_ import integrate_rope_embeddings
integrate_rope_embeddings()
def _apply_lora_kernel_patch(self, model):
"""Apply LoRA kernel patches."""
if (
self.cfg.lora_mlp_kernel
or self.cfg.lora_qkv_kernel
or self.cfg.lora_o_kernel
):
from axolotl.monkeypatch.lora_kernels import apply_lora_kernel_patches
apply_lora_kernel_patches(model=model, cfg=self.cfg)
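A sketch of the patch lifecycle around a bare transformers load, assuming `cfg` and `model_config` mirror what `ModelLoader` passes in above:

from transformers import AutoModelForCausalLM

patch_manager = PatchManager(cfg=cfg, model_config=model_config, inference=False)
patch_manager.apply_pre_model_load_patches()  # monkeypatches that must land before load
model = AutoModelForCausalLM.from_pretrained(cfg.base_model, config=model_config)
patch_manager.apply_post_model_load_patches(model)  # patches that need the model instance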

View File

@@ -1,56 +0,0 @@
"""Processor loading functionality for multi-modal models"""
import logging
from typing import Any
import transformers
from transformers import (
AutoProcessor,
PreTrainedTokenizerBase,
)
from axolotl.utils.dict import DictDefault
LOG = logging.getLogger(__name__)
def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase):
processor_kwargs: dict[str, Any] = {} # Do we actually need this?
processor_cls = AutoProcessor
if cfg.processor_type:
processor_cls = getattr(transformers, cfg.processor_type)
processor = processor_cls.from_pretrained(
cfg.processor_config,
trust_remote_code=cfg.trust_remote_code or False,
tokenizer=tokenizer,
**processor_kwargs,
)
# Attempt to load image size from processor if available
if (
cfg.image_size is None
and hasattr(processor, "size")
and any(dim in processor.size for dim in ["width", "height"])
):
im_width = None
im_height = None
if "width" in processor.size:
im_width = processor.size["width"]
if "height" in processor.size:
im_height = processor.size["height"]
# If both width and height are set, use a tuple
if im_width is not None and im_height is not None:
cfg.image_size = (im_width, im_height)
# If only width is set, use as integer
elif im_width is not None:
cfg.image_size = im_width
# If only height is set, use as integer
elif im_height is not None:
cfg.image_size = im_height
LOG.debug(f"Loaded image size: {cfg.image_size} from processor")
return processor

View File

@@ -1,281 +0,0 @@
"""Tokenizer loading functionality and associated utils"""
import json
import logging
import os
import transformers
from transformers import (
AddedToken,
AutoTokenizer,
)
from axolotl.integrations.base import PluginManager
from axolotl.loaders.utils import get_linear_embedding_layers, load_model_config
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
from axolotl.utils.chat_templates import get_chat_template_from_config
from axolotl.utils.distributed import (
barrier,
is_local_main_process,
is_main_process,
)
LOG = logging.getLogger(__name__)
PLUGIN_MANAGER = PluginManager.get_instance()
def modify_tokenizer_files(
tokenizer_path: str, token_mappings: dict[int, str], output_dir: str
) -> str:
"""
Modify tokenizer files to replace added_tokens strings, save to output directory,
and return the path to the modified tokenizer.
This only works with reserved tokens that were added to the tokenizer, not tokens
already part of the vocab.
Args:
tokenizer_path: Path or name of the original tokenizer
token_mappings: Dict mapping {token_id (int): new_token_string}
output_dir: Directory to save the modified tokenizer
Returns:
Path to the modified tokenizer directory
Ref: https://github.com/huggingface/transformers/issues/27974#issuecomment-1854188941
"""
# Create the tokenizer directory in output_dir if it doesn't exist
tokenizer_dir = os.path.join(output_dir, "tokenizer")
os.makedirs(tokenizer_dir, exist_ok=True)
if is_local_main_process(): # pylint: disable=too-many-nested-blocks
# Load the tokenizer
temp_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
# Save the tokenizer to the output directory
temp_tokenizer.save_pretrained(tokenizer_dir)
# Get the token IDs and map them to their new values
token_id_mappings = {
int(token_id): new_value for token_id, new_value in token_mappings.items()
}
# 1. Update tokenizer_config.json - added_tokens_decoder
config_path = os.path.join(tokenizer_dir, "tokenizer_config.json")
if os.path.exists(config_path):
with open(config_path, "r", encoding="utf-8") as f:
config_data = json.load(f)
# Update added_tokens_decoder
if "added_tokens_decoder" in config_data:
for token_id, new_value in token_id_mappings.items():
token_id_str = str(token_id)
if token_id_str in config_data["added_tokens_decoder"]:
config_data["added_tokens_decoder"][token_id_str][
"content"
] = new_value
else:
raise ValueError(
f"Token ID {token_id_str} not found in added_tokens_decoder"
)
# Write the updated config back
with open(config_path, "w", encoding="utf-8") as f:
json.dump(config_data, f, indent=2)
# 2. Update tokenizer.json - added_tokens
tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json")
if os.path.exists(tokenizer_path):
with open(tokenizer_path, "r", encoding="utf-8") as f:
tokenizer_data = json.load(f)
# Update added_tokens
if "added_tokens" in tokenizer_data:
for token_id, new_value in token_id_mappings.items():
for i, token_entry in enumerate(tokenizer_data["added_tokens"]):
if token_entry["id"] == token_id:
tokenizer_data["added_tokens"][i]["content"] = new_value
break
else:
# Reaching this section means the token_id was not found in tokenizer.json added_tokens
raise ValueError(
f"Token ID {token_id} not found in added_tokens"
)
if "model" in tokenizer_data and "vocab" in tokenizer_data["model"]:
for token_id, new_value in token_id_mappings.items():
for entry_val, entry_id in tokenizer_data["model"]["vocab"].items():
if entry_id == token_id:
del tokenizer_data["model"]["vocab"][entry_val]
tokenizer_data["model"]["vocab"][new_value] = token_id
break
# Write the updated tokenizer data back
with open(tokenizer_path, "w", encoding="utf-8") as f:
json.dump(tokenizer_data, f, indent=2)
barrier()
return tokenizer_dir
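A hypothetical example of the mapping format; the token ids and replacement strings below are illustrative only:

token_mappings = {128011: "<|tool_call|>", 128012: "<|tool_response|>"}
tokenizer_dir = modify_tokenizer_files(
    "meta-llama/Llama-3.1-8B-Instruct",  # assumed tokenizer path
    token_mappings,
    output_dir="./outputs",
)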
def load_tokenizer(cfg):
"""Load and configure the tokenizer based on the provided config."""
model_config = load_model_config(cfg)
tokenizer_kwargs = {}
use_fast = True # this is the default
if cfg.tokenizer_use_fast is not None:
use_fast = cfg.tokenizer_use_fast
if cfg.tokenizer_legacy is not None:
# True is the default w/ https://github.com/huggingface/transformers/pull/25224
tokenizer_kwargs["legacy"] = cfg.tokenizer_legacy
tokenizer_cls = AutoTokenizer
if cfg.tokenizer_type:
tokenizer_cls = getattr(transformers, cfg.tokenizer_type)
# Set base tokenizer path
tokenizer_path = cfg.tokenizer_config
# Apply token string overrides if specified
if cfg.added_tokens_overrides:
# Modify tokenizer files and get path to modified tokenizer
tokenizer_path = modify_tokenizer_files(
tokenizer_path, cfg.added_tokens_overrides, output_dir=cfg.output_dir
)
tokenizer = tokenizer_cls.from_pretrained(
tokenizer_path,
trust_remote_code=cfg.trust_remote_code or False,
use_fast=use_fast,
**tokenizer_kwargs,
)
if (
tokenizer.__class__.__name__
in [
"LlamaTokenizer",
"LlamaTokenizerFast",
"CodeLlamaTokenizer",
"CodeLlamaTokenizerFast",
]
and hasattr(tokenizer, "pad_token")
and not tokenizer.pad_token
):
# set a pad_token, but use eos_token so we don't add a new token
tokenizer.pad_token = LLAMA_DEFAULT_EOS_TOKEN
if tokenizer.__class__.__name__ == "GPTNeoXTokenizerFast":
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Mistral's official FA implementation requires left padding
if cfg.is_mistral_derived_model and cfg.flash_attention and not cfg.sample_packing:
tokenizer.padding_side = "left"
# Qwen base only has a single special token, so we need to set the remaining special tokens
if cfg.is_qwen_derived_model:
token_ids = ["bos_token_id", "eos_token_id", "pad_token_id", "unk_token_id"]
for attr_name in token_ids:
if getattr(tokenizer, attr_name) is None:
setattr(tokenizer, attr_name, tokenizer.eod_id)
token_names = ["bos_token", "eos_token", "pad_token", "unk_token"]
for attr_name in token_names:
if getattr(tokenizer, attr_name) is None:
setattr(tokenizer, attr_name, "<|endoftext|>")
additional_special_tokens = None
if cfg.special_tokens:
special_tokens = cfg.special_tokens.to_dict()
additional_special_tokens = special_tokens.pop(
"additional_special_tokens", None
)
lora_modules_to_save = get_linear_embedding_layers(model_config.model_type)
for k, val in special_tokens.items():
# check that the new special token is not already in the tokenizer and,
# when adapter training, make sure lora_modules_to_save is set
# pylint: disable=too-many-boolean-expressions
if (
(getattr(tokenizer, k) is None or getattr(tokenizer, k) != val)
and (len(tokenizer.encode(val, add_special_tokens=False)) > 2)
and cfg.adapter
and (
not cfg.lora_modules_to_save
or not all(
x in cfg.lora_modules_to_save for x in lora_modules_to_save
)
)
and k != "pad_token"
):
lora_modules_to_save = ", ".join(
[f"`{x}`" for x in lora_modules_to_save]
)
raise ValueError(
f"Please set lora_modules_to_save to [{lora_modules_to_save}] when using an adapter and changing the special tokens."
)
tokenizer.add_special_tokens(
{k: AddedToken(val, rstrip=False, lstrip=False, normalized=False)}
)
# If we add bos_token and eos_token, we need to update the post processor to
# handle them correctly.
# https://github.com/huggingface/transformers/pull/24132
bos_or_eos_in_special_tokens = (
"bos_token" in cfg.special_tokens and "eos_token" in cfg.special_tokens
)
if (
tokenizer.__class__.__name__
in (
"LlamaTokenizerFast",
"CodeLlamaTokenizerFast",
)
and bos_or_eos_in_special_tokens
):
tokenizer.update_post_processor()
if cfg.tokens:
tokenizer.add_tokens(
[
AddedToken(token, rstrip=False, lstrip=False, normalized=False)
for token in cfg.tokens
]
)
# Additional special tokens are a List, and need to be treated differently than regular special
# tokens. We add them after we have called `add_tokens` in case these additional special tokens
# are new tokens.
#
# Usage:
#
# ```py
# special_tokens:
# additional_special_tokens: ["<|im_start|>", "<|im_end|>"]
# ```
if additional_special_tokens is not None:
tokenizer.add_special_tokens(
{"additional_special_tokens": additional_special_tokens}
)
if is_main_process(use_environ=True):
LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
if cfg.chat_template:
chat_template_string = get_chat_template_from_config(
cfg=cfg,
tokenizer=tokenizer,
)
if cfg.default_system_message and cfg.chat_template == "chatml":
chat_template_string = chat_template_string.replace(
"You are a helpful assistant.", cfg.default_system_message
)
tokenizer.chat_template = chat_template_string
else:
LOG.info(
"No Chat template selected. Consider adding a chat template for easier inference."
)
return tokenizer
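A minimal sketch of calling load_tokenizer directly; the config keys mirror the ones read above and the values are illustrative:

from axolotl.utils.dict import DictDefault

cfg = DictDefault(
    {
        "tokenizer_config": "NousResearch/Llama-2-7b-hf",  # assumed tokenizer/model id
        "special_tokens": {"pad_token": "</s>"},
        "chat_template": "chatml",
    }
)
tokenizer = load_tokenizer(cfg)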

View File

@@ -1,211 +0,0 @@
"""Utilities for axolotl.loaders module"""
import contextlib
import logging
from typing import Type
import addict
import torch
from transformers import AutoConfig, PretrainedConfig, PreTrainedModel
from axolotl.utils.dict import DictDefault
LOG = logging.getLogger(__name__)
def get_module_class_from_name(
module: torch.nn.Module, name: str
) -> Type[torch.nn.Module] | None:
"""Gets a class from a module by its name. Copied from `accelerate.utils.dataclasses`
(https://github.com/huggingface/accelerate/blob/main/src/accelerate/utils/dataclasses.py#L2805).
Args:
module: The module to get the class from.
name: The name of the class.
Returns:
The class type of the matching module, or `None` if no match is found.
"""
modules_children = list(module.children())
if module.__class__.__name__ == name:
return module.__class__
if len(modules_children) == 0:
return None
for child_module in modules_children:
module_class = get_module_class_from_name(child_module, name)
if module_class is not None:
return module_class
return None
def check_model_config(cfg: DictDefault, model_config: PretrainedConfig):
"""Validates and adjusts model config based on `axolotl` config.
This function performs several important checks and adjustments:
- Disables model caching for better memory efficiency
- Handles multimodal model-specific configurations
- Validates quantization settings
- Ensures proper LoRA configuration when using adapters with new tokens
Args:
cfg: Dictionary mapping `axolotl` config keys to values.
model_config: The model's configuration object from `transformers`.
Raises:
ValueError: If a multimodal model lacks text configuration, if GPTQ settings
are inconsistent, or if LoRA `modules_to_save` is improperly configured
with new tokens.
"""
if hasattr(model_config, "use_cache"):
model_config.use_cache = False
if cfg.is_multimodal:
# For multimodal configs, use_cache is set in the text_config
if hasattr(model_config, "get_text_config"):
text_config = model_config.get_text_config()
if hasattr(text_config, "use_cache"):
text_config.use_cache = False
else:
raise ValueError(
"No text config found for multimodal model. Please raise an Issue with model details."
)
# Check if image_size is not set and load image size from model config if available
if (
cfg.image_size is None
and hasattr(model_config, "vision_config")
and hasattr(model_config.vision_config, "image_size")
):
cfg.image_size = model_config.vision_config.image_size
LOG.debug(f"Loaded image size: {cfg.image_size} from model config")
quant_config_exists = (
hasattr(model_config, "quantization_config")
and model_config.quantization_config
)
# Detect compressed-tensors config
is_compressed_tensors_config = (
quant_config_exists
and model_config.quantization_config.get("quant_method") == "compressed-tensors"
)
if is_compressed_tensors_config:
if model_config.quantization_config.get("config_groups"):
LOG.warning(
"Found `config_groups` in a compressed-tensors config. "
"QAT integration with llmcompressor is not tested."
)
# Skip further quant checks for compressed-tensors
return
quant_config_method_is_gptq = (
quant_config_exists
and "quant_method" in model_config.quantization_config
and model_config.quantization_config["quant_method"] == "gptq"
)
if cfg.gptq and not quant_config_method_is_gptq:
raise ValueError(
"model_config.quantization_config is not set or quant_method is not set to gptq. "
"Please make sure to point to a GPTQ model."
)
lora_modules_to_save = get_linear_embedding_layers(model_config.model_type)
if (
cfg.adapter
and cfg.tokens
and (
not cfg.lora_modules_to_save
or not all(x in cfg.lora_modules_to_save for x in lora_modules_to_save)
)
):
lora_modules_to_save_joined = ", ".join(
map(lambda x: f"`{x}`", lora_modules_to_save)
)
raise ValueError(
"`lora_modules_to_save` not properly set when adding new tokens. "
f"Please include [{lora_modules_to_save_joined}] in `lora_modules_to_save`."
)
def load_model_config(cfg: DictDefault) -> PretrainedConfig | addict.Dict:
"""Loads and configures a model configuration from HuggingFace or local sources.
This function determines the appropriate model config source, loads it, applies any
necessary overrides, and validates it for compatibility with the `axolotl` config.
Args:
cfg: Dictionary mapping `axolotl` config keys to values.
Returns:
A configured model configuration object (`AutoConfig` instance), or a simple
dictionary configuration for special cases like Mamba models.
Raises:
ValueError: If configuration loading fails for reasons other than special cases
that are handled (e.g., Mamba models).
"""
model_config_name = cfg.base_model_config or cfg.base_model
if not model_config_name and cfg.tokenizer_config:
model_config_name = cfg.tokenizer_config
trust_remote_code = cfg.trust_remote_code is True
config_kwargs = {}
if cfg.revision_of_model:
config_kwargs["revision"] = cfg.revision_of_model
if cfg.num_labels:
# num_labels is used to initialize classifier models
config_kwargs["num_labels"] = cfg.num_labels
try:
model_config = AutoConfig.from_pretrained(
model_config_name,
trust_remote_code=trust_remote_code,
**config_kwargs,
)
except ValueError as error:
if "mamba" in model_config_name:
return addict.Dict(
{
"model_type": "mamba",
}
)
raise error
if cfg.overrides_of_model_config:
for key, val in cfg.overrides_of_model_config.items():
setattr(model_config, key, val)
check_model_config(cfg, model_config)
return model_config
def ensure_dtype(model: PreTrainedModel, dtype: torch.dtype = torch.bfloat16):
"""Ensures all modules in the model are converted to the specified data type."""
for name, module in model.named_modules():
weight_mismatch = False
with contextlib.suppress(AttributeError):
weight_mismatch = module.weight.dtype != dtype
bias_mismatch = False
with contextlib.suppress(AttributeError):
bias_mismatch = module.bias.dtype != dtype
if weight_mismatch:
print(f"Converting module {name}.weight: {module.weight.dtype} -> {dtype}")
if bias_mismatch:
print(f"Converting module {name}.bias: {module.bias.dtype} -> {dtype}")
if weight_mismatch or bias_mismatch:
module.to(dtype)
def get_linear_embedding_layers(model_type: str) -> list[str]:
"""Returns layer names of linear embeddings needed for LoRA based on model type."""
if model_type == "gpt_neox":
return ["embed_in", "embed_out"]
if model_type == "falcon":
return ["word_embeddings", "lm_head"]
return ["embed_tokens", "lm_head"]

View File

@@ -1,19 +0,0 @@
"""
attention module for attention monkeypatches
"""
from transformers.integrations.flash_attention import flash_attention_forward
def patch_xformers_attn_over_fa2():
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
from .xformers import xformers_attention_forward
ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = xformers_attention_forward
def unpatch_xformers_attn_over_fa2():
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
# restore the stock transformers implementation (assign the function itself, not a call)
ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = flash_attention_forward

View File

@@ -0,0 +1,12 @@
"""Init for ring attention monkeypatch module"""
# pylint: disable=unused-import
# flake8: noqa
from .patch import (
RingAttnFunc,
get_ring_attn_group,
register_ring_attn,
set_ring_attn_group,
update_ring_attn_params,
)

View File

@@ -16,7 +16,11 @@ import torch
import torch.distributed as dist
import transformers
import transformers.modeling_flash_attention_utils
from ring_flash_attn import ring_flash_attn_func
from ring_flash_attn import (
ring_flash_attn_func,
stripe_flash_attn_func,
zigzag_ring_flash_attn_func,
)
from ring_flash_attn.adapters.hf_adapter import check_params
from transformers.modeling_flash_attention_utils import (
_flash_supports_window_size,
@@ -24,12 +28,12 @@ from transformers.modeling_flash_attention_utils import (
)
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
from axolotl.utils.schemas.enums import RingAttnFunc
from axolotl.monkeypatch.attention.ring_attn.patch import RingAttnFunc
RING_ATTN_FUNC_MAPPING = {
RingAttnFunc.BATCH_RING: torch.compile(ring_flash_attn_func),
# RingAttnFunc.BATCH_ZIGZAG: torch.compile(zigzag_ring_flash_attn_func),
# RingAttnFunc.BATCH_STRIPE: torch.compile(stripe_flash_attn_func),
RingAttnFunc.BATCH_RING: ring_flash_attn_func,
RingAttnFunc.BATCH_ZIGZAG: zigzag_ring_flash_attn_func,
RingAttnFunc.BATCH_STRIPE: stripe_flash_attn_func,
}

View File

@@ -0,0 +1,147 @@
"""
Ring attention group registration and flash attention patching.
Make use of the `ring-flash-attn` (https://github.com/zhuzilin/ring-flash-attention)
package, specifically the `hf_adapter.substitute_hf_flash_attn` function to patch in
their sequence parallel version of Flash Attention 2.
"""
from enum import Enum
import torch
import torch.distributed as dist
from accelerate.logging import get_logger
from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
LOG = get_logger(__name__)
RING_ATTN_GROUP = None
def get_ring_attn_group() -> dist.ProcessGroup:
"""
Getter for ring attention group on this rank.
Returns:
The process group for ring attention for this rank.
"""
return RING_ATTN_GROUP
def set_ring_attn_group(ring_attn_group: dist.ProcessGroup | None):
"""
Setter for ring attention group on this rank.
Args:
ring_attn_group: Process group for ring attention.
"""
global RING_ATTN_GROUP # pylint: disable=global-statement
RING_ATTN_GROUP = ring_attn_group
class RingAttnFunc(str, Enum):
"""Enum class for supported `ring-flash-attn` implementations"""
# VARLEN_RING = "varlen_ring"
# VARLEN_ZIGZAG = "varlen_zigzag"
VARLEN_LLAMA3 = "varlen_llama3"
BATCH_RING = "batch_ring"
BATCH_ZIGZAG = "batch_zigzag"
BATCH_STRIPE = "batch_stripe"
def register_ring_attn(
sequence_parallel_degree: int,
heads_k_stride: int | None,
ring_attn_func: RingAttnFunc | None,
):
"""
Create ring attention group and substitute flash attn with ring flash attn.
Args:
sequence_parallel_degree: Sequence parallelism factor.
heads_k_stride: Sequence parallelism K head stride size. Passed
through to `ring_flash_attn.substitute_hf_flash_attn`.
ring_attn_func: `ring_flash_attn` ring attention implementation. If sample
packing is enabled, it must be a `varlen` function; otherwise, it must be a
`batch` function.
"""
if get_ring_attn_group() is not None:
LOG.info("Ring attention already registered, exiting early...")
return
LOG.info(
"Enabling ring attention sequence parallelism: "
f"each sequence will be processed across {sequence_parallel_degree} GPUs"
)
rank = dist.get_rank()
world_size = dist.get_world_size()
assert sequence_parallel_degree <= world_size, (
f"sequence_parallel_degree ({sequence_parallel_degree}) "
f"must be less than or equal to world_size ({world_size})"
)
assert world_size % sequence_parallel_degree == 0, (
f"sequence_parallel_degree ({sequence_parallel_degree}) "
f"must evenly divide world_size ({world_size})"
)
# Assign ranks to sequence parallel groups
group_assignments = {}
for i in range(world_size // sequence_parallel_degree):
ring_attn_ranks = list(
range(
i * sequence_parallel_degree,
(i + 1) * sequence_parallel_degree,
)
)
group = dist.new_group(ranks=ring_attn_ranks, backend="nccl")
# Track which GPUs are in which groups
for r in ring_attn_ranks:
group_assignments[r] = i
if rank in ring_attn_ranks:
set_ring_attn_group(group)
# Log the GPU group assignments
if rank == 0:
LOG.info(f"Sequence parallel group assignments: {group_assignments}")
if ring_attn_func is RingAttnFunc.VARLEN_LLAMA3:
from ring_flash_attn import substitute_hf_flash_attn
substitute_hf_flash_attn(
process_group=get_ring_attn_group(), heads_k_stride=heads_k_stride or 1
)
elif ring_attn_func in [
RingAttnFunc.BATCH_RING,
RingAttnFunc.BATCH_ZIGZAG,
RingAttnFunc.BATCH_STRIPE,
]:
from axolotl.monkeypatch.attention.ring_attn.adapters.batch import (
substitute_hf_flash_attn,
)
substitute_hf_flash_attn(
process_group=get_ring_attn_group(),
ring_attn_func=ring_attn_func,
)
def update_ring_attn_params(position_ids: torch.Tensor | None):
"""
Calculate the cumulative sequence lengths for the current forward pass and pass the
value to the substituted `ring_flash_attn`.
Args:
position_ids: Optional tensor of position IDs (for sample packed data).
"""
from ring_flash_attn import update_ring_flash_attn_params
cu_seqlens, _ = get_cu_seqlens_from_pos_ids(position_ids)
cu_seqlens = cu_seqlens.squeeze().to(device=torch.cuda.current_device())
update_ring_flash_attn_params(cu_seqlens, get_ring_attn_group())
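A sketch of registering ring attention on an 8-GPU job split into groups of 4, assuming torch.distributed was already initialised (e.g. via torchrun); the degree and stride values are illustrative:

import torch.distributed as dist

if not dist.is_initialized():
    dist.init_process_group(backend="nccl")
register_ring_attn(
    sequence_parallel_degree=4,  # must evenly divide world_size
    heads_k_stride=1,
    ring_attn_func=RingAttnFunc.VARLEN_LLAMA3,  # varlen variant for sample packing
)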

View File

@@ -1,160 +0,0 @@
"""
xformers attention implementation for packing
"""
from typing import Optional
import torch
import xformers
import xformers.ops.fmha
from transformers.modeling_flash_attention_utils import (
_upad_input,
)
from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
xformers_attention = xformers.ops.fmha.memory_efficient_attention
def xformers_attention_forward(
module: torch.nn.Module,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
dropout: float = 0.0, # pylint: disable=unused-argument
scaling: Optional[float] = None, # pylint: disable=unused-argument
sliding_window: Optional[int] = None, # pylint: disable=unused-argument
softcap: Optional[float] = None, # pylint: disable=unused-argument
cu_seq_lens_q: Optional[torch.LongTensor] = None,
cu_seq_lens_k: Optional[torch.LongTensor] = None,
max_length_q: Optional[int] = None,
max_length_k: Optional[int] = None, # pylint: disable=unused-argument
**kwargs, # pylint: disable=unused-argument
):
# Get dimensions
# query: [batch, heads, seq_len, hidden_dim]
batch_size = query.size(0)
query_length = query.shape[2]
key_length = key.shape[2]
# Default causal mask
attn_bias = xformers.ops.LowerTriangularMask()
# Check if we have sliding window attention
has_sliding_window = sliding_window is not None and sliding_window < query_length
# Transpose dimensions for xformers (Q: [b, h, s, d] -> [b, s, h, d])
query = query.transpose(1, 2)
key = key.transpose(1, 2)
value = value.transpose(1, 2)
# Get GQA parameters
num_attention_heads = module.config.num_attention_heads
num_key_value_heads = module.config.num_key_value_heads
head_dim = query.size(-1)
is_gqa = num_attention_heads != num_key_value_heads
n_groups = num_attention_heads // num_key_value_heads if is_gqa else 1
# If position_ids is provided, check whether the batch really contains packed sequences: a monotonically
# increasing position_ids tensor indicates a single sequence, otherwise the examples are packed. Also check
# that we are in the pre-fill/training stage. Use a block-diagonal causal bias to prevent cross-example
# attention and to allow a padding-free approach.
if position_ids is not None and (
max_length_q is not None
or (query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all())
):
if cu_seq_lens_q is None or cu_seq_lens_k is None:
cu_seq_lens_q = get_cu_seqlens_from_pos_ids(position_ids)[0]
cu_seq_lens_q = cu_seq_lens_q.squeeze()
seq_lengths = cu_seq_lens_q[1:] - cu_seq_lens_q[:-1]
attn_bias = (
xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask.from_seqlens(
q_seqlen=seq_lengths.tolist(),
)
)
else:
query = query.reshape(-1, query.size(-2), query.size(-1))
key = key.reshape(-1, key.size(-2), key.size(-1))
value = value.reshape(-1, value.size(-2), value.size(-1))
# Handle GQA
if is_gqa:
key = key.repeat_interleave(n_groups, dim=2)
value = value.repeat_interleave(n_groups, dim=2)
elif attention_mask is not None:
query, key, value, _, cu_seq_lens, _ = _upad_input(
query, key, value, attention_mask, query_length
)
cu_seq_lens_q, cu_seq_lens_k = cu_seq_lens
seq_lengths = []
for i in range(len(cu_seq_lens_q) - 1):
seq_lengths.append(cu_seq_lens_q[i + 1] - cu_seq_lens_q[i])
attn_bias = xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask.from_seqlens(
q_seqlen=seq_lengths,
kv_seqlen=seq_lengths,
)
# Handle GQA
if is_gqa:
key = key.repeat_interleave(n_groups, dim=2)
value = value.repeat_interleave(n_groups, dim=2)
else:
# Handle Group Query Attention (GQA) using view/expand approach from reference
key = key.view(batch_size, key_length, num_key_value_heads, 1, head_dim)
value = value.view(batch_size, key_length, num_key_value_heads, 1, head_dim)
key = key.expand(
batch_size, key_length, num_key_value_heads, n_groups, head_dim
)
value = value.expand(
batch_size, key_length, num_key_value_heads, n_groups, head_dim
)
if module.training:
key = key.reshape(batch_size, key_length, num_attention_heads, head_dim)
value = value.reshape(batch_size, key_length, num_attention_heads, head_dim)
if has_sliding_window:
query = query.view(
1, batch_size * query_length, num_attention_heads, head_dim
)
key = key.view(
1, batch_size * key_length, num_attention_heads, head_dim
)
value = value.view(
1, batch_size * key_length, num_attention_heads, head_dim
)
else:
query = query.view(
batch_size, query_length, num_key_value_heads, n_groups, head_dim
)
# If we need a sliding window attention
if has_sliding_window:
query = query.view(
1,
batch_size * query_length,
num_key_value_heads,
n_groups,
head_dim,
)
key = key.view(
1, batch_size * key_length, num_key_value_heads, n_groups, head_dim
)
value = value.view(
1, batch_size * key_length, num_key_value_heads, n_groups, head_dim
)
# Run the xformers attention
attn_output = xformers_attention(
query,
key,
value,
attn_bias=attn_bias,
)
attn_output = attn_output.view(
batch_size, -1, attn_output.size(-2), attn_output.size(-1)
)
return attn_output, None
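As a standalone illustration of the packing path above, the following sketch builds the same kind of block-diagonal causal bias from per-sample lengths and runs xformers memory-efficient attention over a packed batch of two sequences (shapes and lengths are made up; requires `xformers` and a CUDA device):
import torch
import xformers.ops.fmha as fmha

# two packed sequences of lengths 3 and 5; 4 heads, head_dim 8
seq_lengths = [3, 5]
total_tokens = sum(seq_lengths)
q = torch.randn(1, total_tokens, 4, 8, device="cuda", dtype=torch.float16)
k = torch.randn_like(q)
v = torch.randn_like(q)

# tokens attend causally within their own sequence and never across the packing boundary
attn_bias = fmha.attn_bias.BlockDiagonalCausalMask.from_seqlens(q_seqlen=seq_lengths)
out = fmha.memory_efficient_attention(q, k, v, attn_bias=attn_bias)
print(out.shape)  # torch.Size([1, 8, 4, 8])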

View File

@@ -7,16 +7,24 @@ from typing import Optional, Tuple, Union
import torch
from transformers.cache_utils import Cache
from transformers.models.gemma3.modeling_gemma3 import (
_CONFIG_FOR_DOC,
GEMMA3_INPUTS_DOCSTRING,
Gemma3CausalLMOutputWithPast,
logger,
)
from transformers.utils import (
add_start_docstrings_to_model_forward,
is_torchdynamo_compiling,
replace_return_docstrings,
)
from transformers.utils.deprecation import deprecate_kwarg
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
@add_start_docstrings_to_model_forward(GEMMA3_INPUTS_DOCSTRING)
@replace_return_docstrings(
output_type=Gemma3CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
)
def new_forward(
self,
input_ids: torch.LongTensor = None,

View File

@@ -1,63 +0,0 @@
"""custom checkpointing utils"""
import importlib
from functools import partial
from packaging import version
from axolotl.monkeypatch.gradient_checkpointing.offload_cpu import (
CPU_Offloaded_Gradient_Checkpointer,
)
from axolotl.monkeypatch.gradient_checkpointing.offload_disk import (
Disco,
)
transformers_version = version.parse(importlib.metadata.version("transformers"))
if transformers_version > version.parse("4.51.3"):
from transformers.modeling_layers import GradientCheckpointingLayer
def uses_gc_layers(decoder_layer):
return isinstance(decoder_layer.func.__self__, GradientCheckpointingLayer)
else:
def uses_gc_layers(_):
return False
def hf_grad_checkpoint_offload_wrapper(
decoder_layer, *args, use_reentrant=None
): # pylint: disable=unused-argument
if uses_gc_layers(decoder_layer):
return CPU_Offloaded_Gradient_Checkpointer.apply(
decoder_layer,
*args,
)
return CPU_Offloaded_Gradient_Checkpointer.apply(
(
decoder_layer.func.__self__
if isinstance(decoder_layer, partial)
else decoder_layer.__self__
),
*args,
)
def hf_grad_checkpoint_disk_offload_wrapper(
decoder_layer, *args, use_reentrant=None
): # pylint: disable=unused-argument
if uses_gc_layers(decoder_layer):
return Disco.apply(
decoder_layer,
*args,
)
return Disco.apply(
(
decoder_layer.func.__self__
if isinstance(decoder_layer, partial)
else decoder_layer.__self__
),
*args,
)
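The `CPU_Offloaded_Gradient_Checkpointer` referenced above is not shown in this diff. As a rough sketch of the pattern it presumably implements (offload the checkpointed activation to CPU in forward, restore it and recompute under `enable_grad` in backward), something like the following could work; the class name and details here are illustrative assumptions, not the actual implementation:
import torch


class CpuOffloadCheckpoint(torch.autograd.Function):
    """Recompute-in-backward checkpoint that parks the saved activation on CPU."""

    @staticmethod
    def forward(ctx, run_fn, hidden_states, *args):
        # keep only a CPU copy of the activation and free the GPU memory
        ctx.run_fn = run_fn
        ctx.args = args
        ctx.saved_cpu = hidden_states.detach().to("cpu", non_blocking=True)
        with torch.no_grad():
            return run_fn(hidden_states, *args)

    @staticmethod
    def backward(ctx, *grad_outputs):
        # move the activation back to GPU and recompute the forward with grad enabled
        hidden_states = ctx.saved_cpu.to("cuda", non_blocking=True).requires_grad_(True)
        with torch.enable_grad():
            output = ctx.run_fn(hidden_states, *ctx.args)
        # simplification: only the first output's gradient is propagated
        output = output[0] if isinstance(output, tuple) else output
        torch.autograd.backward(output, grad_outputs[0])
        return (None, hidden_states.grad) + (None,) * len(ctx.args)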

View File

@@ -1,531 +0,0 @@
"""
DISCO - DIsk-based Storage and Checkpointing with Optimized prefetching
"""
# Copyright 2025 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import atexit
import concurrent.futures
import logging
import os
import queue
import shutil
import tempfile
import threading
import time
import uuid
from collections import deque
from concurrent.futures import Future
from typing import Dict
import torch
torch_cuda_amp_custom_fwd = torch.amp.custom_fwd(device_type="cuda")
torch_cuda_amp_custom_bwd = torch.amp.custom_bwd(device_type="cuda")
# Setup logger
logger = logging.getLogger(__name__)
class DiskOffloadManager:
"""
Manages offloaded tensors and handles prefetching in a separate thread.
Includes synchronization to prevent race conditions.
"""
def __init__(
self,
prefetch_size: int = 3,
prefetch_to_gpu: bool = True,
save_workers: int = 4,
):
"""
Args:
prefetch_size: Maximum number of tensors to prefetch in the background.
prefetch_to_gpu: Whether to prefetch tensors directly to GPU memory.
save_workers: Maximum number of concurrent save operations.
"""
self.temp_dir = tempfile.mkdtemp(prefix="disco_")
# Track tensor paths and their status
self.tensor_paths: deque = deque() # Ordered history of tensor paths (LIFO)
self.file_locks: Dict[str, threading.Lock] = (
{}
) # Maps file_path -> threading.Lock()
# Maps file_path -> status ("saving", "ready", "prefetching", "loaded", "deleted")
self.file_status: Dict[str, str] = {}
self.max_prefetch = prefetch_size
self.prefetch_to_gpu = prefetch_to_gpu
# Thread synchronization
self.manager_lock = threading.RLock() # Used for thread-safe operations
# Prefetch queue and cache
self.prefetch_queue: queue.Queue = queue.Queue()
self.prefetch_cache: Dict[str, torch.Tensor] = {} # Maps file_path -> tensor
# Save queue and thread pool
self.save_queue: queue.Queue = queue.Queue()
self.save_pool = concurrent.futures.ThreadPoolExecutor(max_workers=save_workers)
self.save_futures: Dict[str, Future] = {}
self.save_semaphore = threading.Semaphore(
save_workers * 2
) # Limit concurrent save operations
# Start prefetch worker thread
self.stop_event = threading.Event()
# start multiple threads for prefetching
self.prefetch_worker_count = 2
self.prefetch_workers = []
for _ in range(self.prefetch_worker_count):
worker = threading.Thread(target=self._prefetch_worker, daemon=True)
worker.start()
self.prefetch_workers.append(worker)
# Start save worker thread
self.save_worker = threading.Thread(target=self._save_worker, daemon=True)
self.save_worker.start()
self.idx = 0
atexit.register(self.cleanup)
def _save_worker(self):
"""Background thread that processes the save queue"""
while not self.stop_event.is_set():
try:
save_item = self.save_queue.get(timeout=0.5)
if save_item is None:
continue
tensor, file_path = save_item
# Submit the save task to the thread pool
future = self.save_pool.submit(
self._save_tensor_to_disk, tensor, file_path
)
with self.manager_lock:
self.save_futures[file_path] = future
self.save_queue.task_done()
except queue.Empty:
time.sleep(0.01) # Small sleep to prevent CPU spinning
continue
def _save_tensor_to_disk(self, tensor: torch.Tensor, file_path: str):
"""Actually save the tensor to disk"""
try:
# Save tensor to disk
cpu_tensor = tensor.detach().cpu()
torch.save(cpu_tensor, file_path)
del cpu_tensor
with self.manager_lock:
# Mark file as ready
self.file_status[file_path] = "ready"
# Release semaphore
self.save_semaphore.release()
return True
except FileNotFoundError as e:
logger.error(f"Error saving tensor to {file_path}: {e}")
with self.manager_lock:
self.file_status[file_path] = "error"
# Release semaphore
self.save_semaphore.release()
return False
def _prefetch_worker(self):
"""Background thread that loads tensors from disk ahead of time"""
while not self.stop_event.is_set():
try:
file_path = self.prefetch_queue.get(timeout=0.5)
if file_path is None:
continue
# Check if file is available and not already in cache
with self.manager_lock:
if (
file_path not in self.file_status
or self.file_status[file_path] == "deleted"
):
self.prefetch_queue.task_done()
continue
if file_path in self.prefetch_cache:
self.prefetch_queue.task_done()
continue
# If file is still being saved, wait for it
if (
self.file_status[file_path] == "saving"
and file_path in self.save_futures
):
# Re-queue this prefetch request with a little delay
self.prefetch_queue.task_done()
time.sleep(0.1)
self.prefetch_queue.put(file_path)
continue
# Mark file as being prefetched
self.file_status[file_path] = "prefetching"
# Load tensor from disk and store in cache
try:
if os.path.exists(file_path):
if self.prefetch_to_gpu:
tensor = torch.load(
file_path,
map_location=torch.device("cuda"),
weights_only=True,
)
else:
tensor = torch.load(file_path, weights_only=True)
with self.manager_lock:
self.prefetch_cache[file_path] = tensor
self.file_status[file_path] = "ready"
else:
with self.manager_lock:
if self.file_status.get(file_path) != "deleted":
logger.warning(
f"Prefetch error: File not found {file_path}"
)
self.file_status[file_path] = "missing"
except FileNotFoundError as e:
with self.manager_lock:
if self.file_status.get(file_path) != "deleted":
logger.warning(f"Prefetch error for {file_path}: {e}")
self.file_status[file_path] = "error"
self.prefetch_queue.task_done()
except queue.Empty:
time.sleep(0.01) # Small sleep to prevent CPU spinning
continue
def save_tensor(self, tensor: torch.Tensor):
"""Save tensor to disk asynchronously and return file path with thread-safe operations"""
# Generate unique file path
self.idx += 1
file_path: str = os.path.join(
self.temp_dir, f"{self.idx:06d}-{uuid.uuid4()}.pt"
)
with self.manager_lock:
# Mark file as being saved
self.file_locks[file_path] = threading.Lock()
self.file_status[file_path] = "saving"
# Add to history
self.tensor_paths.append(file_path)
# Acquire semaphore to limit concurrent save operations
self.save_semaphore.acquire() # pylint: disable=consider-using-with
# Queue tensor for saving in background
self.save_queue.put((tensor.detach(), file_path))
return file_path
def wait_for_save(self, file_path, timeout=None) -> None:
"""Wait for a tensor to be saved to disk"""
start_time = time.time()
while timeout is None or time.time() - start_time < timeout:
with self.manager_lock:
if self.file_status.get(file_path) == "ready":
return
if self.file_status.get(file_path) in ["error", "missing", "deleted"]:
return
if file_path in self.save_futures:
future = self.save_futures[file_path]
if future.done():
return
# Small sleep to prevent CPU spinning
time.sleep(0.01)
# Timeout
logger.warning(f"Timeout waiting for tensor to be saved: {file_path}")
return
def load_tensor(self, file_path, target_device="cuda"):
"""Load tensor from disk or prefetch cache with proper synchronization"""
# Wait for tensor to be saved if it's still in progress
self.wait_for_save(file_path)
tensor = None
# Try to get from cache first
with self.manager_lock:
# Check if tensor is already in cache
if file_path in self.prefetch_cache:
tensor = self.prefetch_cache[file_path]
del self.prefetch_cache[file_path]
self.file_status[file_path] = "loaded"
if tensor is not None:
# Ensure tensor is on correct device
if target_device != "cpu" and tensor.device.type == "cpu":
tensor = tensor.to(target_device, non_blocking=True)
return tensor
# If not in cache, load directly from disk
try:
if not os.path.exists(file_path):
logger.error(f"File not found for loading: {file_path}")
raise FileNotFoundError(f"File not found: {file_path}")
tensor = torch.load(file_path, weights_only=True)
with self.manager_lock:
self.file_status[file_path] = "loaded"
if target_device != "cpu":
tensor = tensor.to(target_device, non_blocking=True)
return tensor
except Exception as e:
logger.error(f"Error loading tensor from {file_path}: {e}")
raise
def _safe_delete_file(self, file_path):
"""Safely delete a file with proper synchronization"""
with self.manager_lock:
# Make sure any save operation is completed
if file_path in self.save_futures:
future = self.save_futures[file_path]
try:
if not future.done():
future.cancel()
del self.save_futures[file_path]
except FileNotFoundError as e:
logger.warning(
f"Error canceling save operation for {file_path}: {e}"
)
# Only delete if file exists and is not being prefetched
status = self.file_status.get(file_path)
if status in ["ready", "loaded", "error", "missing"]:
try:
if os.path.exists(file_path):
os.remove(file_path)
self.file_status[file_path] = "deleted"
return True
except FileNotFoundError as e:
logger.warning(f"Error deleting file {file_path}: {e}")
return False
def trigger_prefetch(self, n=None):
"""Trigger prefetching of the next N tensors with proper synchronization"""
if n is None:
n = self.max_prefetch
prefetch_paths = []
with self.manager_lock:
# Find files that are ready to be prefetched (not already in cache or being prefetched)
for path in reversed(self.tensor_paths):
if (
path not in self.prefetch_cache
and self.file_status.get(path) == "ready"
):
prefetch_paths.append(path)
if len(prefetch_paths) >= n:
break
# Queue files for prefetching
for path in prefetch_paths:
self.prefetch_queue.put(path)
def cleanup_tensor(self, file_path: str):
"""Clean up a specific tensor file after it's been used"""
with self.manager_lock:
if file_path in self.tensor_paths:
self.tensor_paths.remove(file_path)
# Remove from prefetch cache if present
if file_path in self.prefetch_cache:
del self.prefetch_cache[file_path]
# Remove from save futures if present
if file_path in self.save_futures:
future = self.save_futures[file_path]
if not future.done():
future.cancel()
del self.save_futures[file_path]
# Try to delete the file
self._safe_delete_file(file_path)
def cleanup(self):
"""Clean up all temp files and stop prefetch thread with proper synchronization"""
self.stop_event.set()
# Cancel all pending save operations
with self.manager_lock:
for _, future in self.save_futures.items():
if not future.done():
future.cancel()
self.save_futures.clear()
# Drain the save queue
while not self.save_queue.empty():
try:
self.save_queue.get_nowait()
self.save_queue.task_done()
except queue.Empty:
break
# Shutdown the save pool
self.save_pool.shutdown(wait=False)
# Join the save worker thread
if self.save_worker.is_alive():
self.save_worker.join(timeout=2.0)
# Join the prefetch worker threads
for thread in self.prefetch_workers:
if thread.is_alive():
thread.join(timeout=2.0)
# Clear cache and remove all temporary files
with self.manager_lock:
self.prefetch_cache.clear()
paths_to_delete = list(self.tensor_paths)
self.tensor_paths.clear()
# Delete all temporary files
for path in paths_to_delete:
self._safe_delete_file(path)
# Remove temp directory
try:
if os.path.exists(self.temp_dir):
shutil.rmtree(self.temp_dir, ignore_errors=True)
except FileNotFoundError as e:
logger.warning(f"Error removing temporary directory {self.temp_dir}: {e}")
class Disco(torch.autograd.Function):
"""
Disco: DIsk-based Storage and Checkpointing with Optimized prefetching
Advanced disk-based gradient checkpointer with prefetching.
"""
# Shared manager instance across all checkpointing operations
_manager = None
@staticmethod
def get_instance(prefetch_size=1, prefetch_to_gpu=True, save_workers=4):
"""Get or create the offload manager"""
if Disco._manager is None:
Disco._manager = DiskOffloadManager(
prefetch_size=prefetch_size,
prefetch_to_gpu=prefetch_to_gpu,
save_workers=save_workers,
)
return Disco._manager
@staticmethod
@torch_cuda_amp_custom_fwd
def forward(
ctx,
forward_function,
hidden_states,
*args,
prefetch_size=1,
prefetch_to_gpu=True,
save_workers=4,
):
"""Forward pass that offloads activations to disk asynchronously"""
# Get or create the manager
manager = Disco.get_instance(
prefetch_size=prefetch_size,
prefetch_to_gpu=prefetch_to_gpu,
save_workers=save_workers,
)
# Save tensor to disk asynchronously
file_path = manager.save_tensor(hidden_states)
# Run forward pass immediately without waiting for save to complete
with torch.no_grad():
output = forward_function(hidden_states, *args)
# Store what we need for backward
ctx.save_for_backward(torch.tensor([0])) # Dummy tensor
ctx.file_path = file_path
ctx.forward_function = forward_function
ctx.args = args
return output
@staticmethod
@torch_cuda_amp_custom_bwd
def backward(ctx, *grad_outputs):
"""Backward pass that loads activations from disk with prefetching"""
# Get the manager
manager = Disco._manager
# Trigger prefetching for future tensors
# This happens at the start of backward, so should have time to complete
manager.trigger_prefetch()
# Load hidden states from disk or prefetch cache
file_path = ctx.file_path
try:
# Ensure the file is saved before we try to load it
manager.wait_for_save(file_path)
hidden_states = manager.load_tensor(file_path)
hidden_states.requires_grad = True
# Compute gradients
with torch.enable_grad():
output = ctx.forward_function(hidden_states, *ctx.args)
# Handle tuple outputs properly
if isinstance(output, tuple):
if len(grad_outputs) == len(output):
torch.autograd.backward(output, grad_outputs)
else:
torch.autograd.backward(output, grad_outputs[0])
else:
torch.autograd.backward(output, grad_outputs[0])
# Clean up the file after we're done with it
manager.cleanup_tensor(file_path)
return (
(
None, # forward_function
hidden_states.grad, # hidden_states grad
)
+ (None,) * len(ctx.args) # for each arg
+ (
None, # prefetch_size
None, # prefetch_to_gpu
None, # save_workers
)
)
except Exception as e:
logger.error(f"Error in backward pass: {e}")
# Clean up the file even on error
manager.cleanup_tensor(file_path)
raise
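A small usage sketch for the class above: wrapping a single layer's forward so its input activation lives on disk between forward and backward. The layer, shapes, and device here are assumptions for illustration.
import torch
from torch import nn

from axolotl.monkeypatch.gradient_checkpointing.offload_disk import Disco

# stand-in for a decoder layer; any callable taking hidden_states first works
layer = nn.Linear(1024, 1024).cuda()
hidden_states = torch.randn(2, 512, 1024, device="cuda", requires_grad=True)

# forward queues hidden_states for an async save to disk; backward waits for the
# save, reloads (with prefetching) and recomputes the layer to get gradients
out = Disco.apply(layer.forward, hidden_states)
out.sum().backward()
print(hidden_states.grad.shape)  # torch.Size([2, 512, 1024])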

View File

@@ -18,8 +18,6 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
"mixtral",
"qwen2",
"qwen2_moe",
"qwen3",
"qwen3_moe",
"falcon",
"phi",
"phi3",

View File

@@ -1,78 +0,0 @@
"""
Patch prepare_model_for_kbit_training to not upcast everything
"""
import inspect
import logging
import peft
import axolotl
from axolotl.monkeypatch.utils import detab_code
LOG = logging.getLogger(__name__)
ORIGINAL_PREPARE_CODE = """
for param in model.parameters():
if (
(param.dtype == torch.float16) or (param.dtype == torch.bfloat16)
) and param.__class__.__name__ != "Params4bit":
param.data = param.data.to(torch.float32)
"""
PATCHED_PREPARE_CODE = """
for name, param in model.named_parameters():
if (
(param.dtype == torch.float16) or (param.dtype == torch.bfloat16)
) and param.__class__.__name__ != "Params4bit" and all(embed_name not in name for embed_name in ["embed_tokens", "lm_head"]):
param.data = param.data.to(torch.float32)
"""
def get_peft_prep_code() -> str:
prepare = inspect.getsource(peft.utils.other.prepare_model_for_kbit_training)
return prepare
def check_peft_prep_code_is_patchable() -> bool:
prep_code = get_peft_prep_code()
prep_code, _ = detab_code(prep_code)
return ORIGINAL_PREPARE_CODE in prep_code
def patch_peft_prep_code():
"""
monkeypatch prepare_model_for_kbit_training so it does not upcast the embedding and lm_head parameters
"""
try:
prep_code = get_peft_prep_code()
except OSError:
return
peft.utils.other._original_create_accelerator_and_postprocess = ( # pylint: disable=protected-access
prep_code
)
prep_code, _ = detab_code(prep_code)
if ORIGINAL_PREPARE_CODE not in prep_code:
return
prep_code = prep_code.replace(ORIGINAL_PREPARE_CODE, PATCHED_PREPARE_CODE)
prep_code = prep_code.replace(
"def prepare_model_for_kbit_training(",
"def fixed_prepare_model_for_kbit_training(",
1,
)
items_to_import = []
for item in dir(peft.utils.other):
if item in prep_code:
items_to_import.append(item)
exec( # pylint: disable=exec-used # nosec B102
"from peft.utils.other import (" + ", ".join(x for x in items_to_import) + ")",
globals(),
)
exec(prep_code, globals()) # pylint: disable=exec-used # nosec B102
LOG.info("patching prepare_model_for_kbit_training to allow for overrides")
peft.utils.other.prepare_model_for_kbit_training = fixed_prepare_model_for_kbit_training # pylint: disable=protected-access # pylint: disable=undefined-variable # noqa: F821
axolotl.loaders.model.prepare_model_for_kbit_training = fixed_prepare_model_for_kbit_training # pylint: disable=protected-access # pylint: disable=undefined-variable # noqa: F821
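To make the swapped-in snippet concrete, here is a small self-contained illustration of the dtype rule it patches in: half-precision parameters are upcast to fp32 except for embedding and LM head weights. The toy model and module names below are assumptions.
import torch
from torch import nn

model = nn.ModuleDict(
    {
        "embed_tokens": nn.Embedding(10, 4),
        "proj": nn.Linear(4, 4),
        "lm_head": nn.Linear(4, 10),
    }
).to(torch.bfloat16)

# same condition as PATCHED_PREPARE_CODE above
for name, param in model.named_parameters():
    if (
        param.dtype in (torch.float16, torch.bfloat16)
        and param.__class__.__name__ != "Params4bit"
        and all(skip not in name for skip in ["embed_tokens", "lm_head"])
    ):
        param.data = param.data.to(torch.float32)

for name, param in model.named_parameters():
    print(name, param.dtype)  # embed_tokens/lm_head stay bf16, proj becomes fp32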

View File

@@ -1,22 +0,0 @@
"""Init for ring attention monkeypatch module"""
# pylint: disable=unused-import
# flake8: noqa
from .patch import (
get_ring_attn_group,
patch_prepare_data_loader,
patch_prepare_device_mesh,
register_ring_attn,
set_ring_attn_group,
update_ring_attn_params,
)
__all__ = (
"get_ring_attn_group",
"patch_prepare_data_loader",
"patch_prepare_device_mesh",
"register_ring_attn",
"set_ring_attn_group",
"update_ring_attn_params",
)

View File

@@ -1,225 +0,0 @@
"""Ring attention group registration and flash attention patching.
Make use of the `ring-flash-attn` (https://github.com/zhuzilin/ring-flash-attention)
package, specifically the `hf_adapter.substitute_hf_flash_attn` function to patch in
their sequence parallel version of Flash Attention 2.
We also provide some patches for accelerate functions to prepare the dataloader for
sequence parallelism training.
"""
import inspect
import accelerate
import torch
import torch.distributed as dist
from accelerate.logging import get_logger
from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
from axolotl.utils.schemas.enums import RingAttnFunc
LOG = get_logger(__name__)
RING_ATTN_GROUP = None
ORIGINAL_PREPARE_DATALOADER_CODE = """ submesh_fsdp_size = 1
submesh_dp_size = 1
submesh_tp_size = 1
if "tp" in torch_device_mesh.mesh_dim_names:
submesh_tp_size = torch_device_mesh["tp"].size()
if "dp" in torch_device_mesh.mesh_dim_names:
submesh_dp_size = torch_device_mesh["dp"].size()
if "fsdp" in torch_device_mesh.mesh_dim_names:
submesh_fsdp_size = torch_device_mesh["fsdp"].size()
process_index = process_index // submesh_tp_size"""
NEW_PREPARE_DATALOADER_CODE = """ submesh_fsdp_size = 1
submesh_dp_size = 1
submesh_tp_size = 1
submesh_cp_size = 1
if "cp" in torch_device_mesh.mesh_dim_names:
submesh_cp_size = torch_device_mesh["cp"].size()
if "tp" in torch_device_mesh.mesh_dim_names:
submesh_tp_size = torch_device_mesh["tp"].size()
if "dp" in torch_device_mesh.mesh_dim_names:
submesh_dp_size = torch_device_mesh["dp"].size()
if "fsdp" in torch_device_mesh.mesh_dim_names:
submesh_fsdp_size = torch_device_mesh["fsdp"].size()
process_index = process_index // (submesh_tp_size * submesh_cp_size)"""
def get_ring_attn_group() -> dist.ProcessGroup:
"""Getter for ring attention group on this rank."""
if RING_ATTN_GROUP is None:
raise RuntimeError("register_ring_attn() not yet called")
return RING_ATTN_GROUP
def set_ring_attn_group(ring_attn_group: dist.ProcessGroup | None):
"""Setter for ring attention group on this rank."""
global RING_ATTN_GROUP # pylint: disable=global-statement
RING_ATTN_GROUP = ring_attn_group
def register_ring_attn(
sequence_parallel_degree: int,
heads_k_stride: int | None,
ring_attn_func: RingAttnFunc | None,
):
"""Create ring attention group and substitute flash attn with ring flash attn.
Args:
sequence_parallel_degree: Sequence parallelism factor.
heads_k_stride: Sequence parallelism K head stride size. Passed through to
`varlen_llama3` `ring_flash_attn` implementation.
ring_attn_func: `ring_flash_attn` ring attention implementation. If sample
packing is enabled, it must be a `varlen` function; otherwise, it must be a
`batch` function.
"""
rank = dist.get_rank()
world_size = dist.get_world_size()
if rank == 0:
LOG.info(
"Enabling ring attention sequence parallelism: "
f"each sequence will be processed across {sequence_parallel_degree} GPUs"
)
assert sequence_parallel_degree <= world_size, (
f"sequence_parallel_degree ({sequence_parallel_degree}) "
f"must be less than or equal to world_size ({world_size})"
)
assert world_size % sequence_parallel_degree == 0, (
f"sequence_parallel_degree ({sequence_parallel_degree}) "
f"must evenly divide world_size ({world_size})"
)
# Assign ranks to sequence parallel groups
group_assignments = {}
for i in range(world_size // sequence_parallel_degree):
ring_attn_ranks = list(
range(
i * sequence_parallel_degree,
(i + 1) * sequence_parallel_degree,
)
)
group = dist.new_group(ranks=ring_attn_ranks, backend="nccl")
# Track which GPUs are in which groups
for r in ring_attn_ranks:
group_assignments[r] = i
if rank in ring_attn_ranks:
set_ring_attn_group(group)
# Log the GPU group assignments
if rank == 0:
LOG.info(f"Sequence parallel group assignments: {group_assignments}")
if ring_attn_func is RingAttnFunc.VARLEN_LLAMA3:
from ring_flash_attn import substitute_hf_flash_attn
substitute_hf_flash_attn(
process_group=get_ring_attn_group(), heads_k_stride=heads_k_stride or 1
)
elif ring_attn_func is RingAttnFunc.BATCH_RING:
from axolotl.monkeypatch.ring_attn.adapters.batch import (
substitute_hf_flash_attn,
)
substitute_hf_flash_attn(
process_group=get_ring_attn_group(),
ring_attn_func=ring_attn_func,
)
def update_ring_attn_params(position_ids: torch.Tensor | None):
"""
Calculate the cumulative sequence lengths for the current forward pass and pass the
value to the substituted `ring_flash_attn`.
Args:
position_ids: Optional tensor of position IDs (for sample packed data).
"""
from ring_flash_attn import update_ring_flash_attn_params
cu_seqlens, _ = get_cu_seqlens_from_pos_ids(position_ids)
cu_seqlens = cu_seqlens.squeeze().to(device=torch.cuda.current_device())
update_ring_flash_attn_params(cu_seqlens, get_ring_attn_group())
def patch_prepare_data_loader():
"""Patch `accelerate.data_loader.prepare_data_loader` to respect the SP degree.
Raises:
RuntimeError: If source code to patch does not exist.
"""
original_fn = accelerate.data_loader.prepare_data_loader
original_source = inspect.getsource(original_fn)
if ORIGINAL_PREPARE_DATALOADER_CODE not in original_source:
raise RuntimeError(
"SP patch failed - target snippet not found. "
"Check accelerate's version or update the patch."
)
patched_source = original_source.replace(
ORIGINAL_PREPARE_DATALOADER_CODE, NEW_PREPARE_DATALOADER_CODE
)
# Create a new function from the patched source
namespace = {}
exec( # pylint: disable=exec-used # nosec B102
patched_source, accelerate.data_loader.__dict__, namespace
)
patched_function = namespace["prepare_data_loader"]
accelerate.data_loader.prepare_data_loader = patched_function
LOG.info("Patched accelerate.data_loader.prepare_data_loader for SP support")
def patch_prepare_device_mesh(sequence_parallel_degree: int):
"""Patches the `Accelerator._prepare_device_mesh` method to create a device mesh
that includes sequence parallelism with the specified degree.
Args:
sequence_parallel_degree (int): The degree of sequence parallelism to use.
"""
def _prepare_device_mesh(self):
"""Prepare the device mesh for distributed training. The dataloader will
determine how to load data based on the device mesh.
"""
if self.state.torch_tp_plugin:
return self.state.torch_tp_plugin.torch_device_mesh
if (
self.distributed_type == accelerate.accelerator.DistributedType.DEEPSPEED
and hasattr(self.state, "ds_device_mesh")
):
return self.state.ds_device_mesh
# Create device mesh with sequence parallelism
world_size = dist.get_world_size()
mesh_shape = (
world_size // sequence_parallel_degree,
sequence_parallel_degree,
)
device_ids = list(range(world_size))
# Note that we use "cp" instead of "sp" to match the PyTorch native "context
# parallelism" implementation naming
return dist.DeviceMesh(
"cuda",
torch.tensor(device_ids).reshape(mesh_shape),
mesh_dim_names=("dp", "cp"),
)
# Replace the original method with our new method
# pylint: disable=protected-access
accelerate.accelerator.Accelerator._prepare_device_mesh = _prepare_device_mesh
LOG.info(
"Successfully patched Accelerator._prepare_device_mesh "
f"with sequence_parallel_degree={sequence_parallel_degree}"
)
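For reference, the (dp, cp) layout the patched `_prepare_device_mesh` builds can be previewed without launching a job. With 8 ranks and sequence_parallel_degree=2 (numbers here are assumptions), the mesh pairs consecutive ranks along the context-parallel dimension, matching the groups created in `register_ring_attn`:
import torch

world_size = 8
sequence_parallel_degree = 2
mesh_shape = (world_size // sequence_parallel_degree, sequence_parallel_degree)

# same tensor the patch would hand to dist.DeviceMesh with mesh_dim_names=("dp", "cp")
mesh = torch.tensor(list(range(world_size))).reshape(mesh_shape)
print(mesh)
# tensor([[0, 1],
#         [2, 3],
#         [4, 5],
#         [6, 7]])
# each row is one "cp" group (two ranks sharing a sequence); ranks in the same
# column form a "dp" group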

View File

@@ -424,20 +424,6 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
LOG.debug(f"Should train: {should_train}")
# turn not trainable, skip having to find the turn indices
# unless last turn and train_on_eos/train_on_eot is all
if not should_train and (
self.train_on_eos != "all" and self.train_on_eot != "all"
):
if index == len(turns) - 1:
LOG.warning(
"Last turn is not trainable, skipping having to find the turn indices. "
"This may cause incorrect last EOT/EOS token to be unmasked."
"This is likely a dataset design issue. Please ensure last turn is trainable."
)
continue
turn_start_idx, turn_end_idx = self.find_turn(turns=turns, turn_idx=index)
LOG.debug(f"Turn indices: start={turn_start_idx}, end={turn_end_idx}")

View File

@@ -2,17 +2,17 @@
import importlib
import inspect
import logging
import os
import signal
import sys
import weakref
from contextlib import ExitStack
from contextlib import nullcontext
from pathlib import Path
from typing import Any, Dict
import torch
import transformers.modelcard
from accelerate.logging import get_logger
from accelerate.utils import save_fsdp_model
from datasets import Dataset
from huggingface_hub.errors import OfflineModeIsEnabled
@@ -21,23 +21,19 @@ from transformers import PreTrainedModel, PreTrainedTokenizer, ProcessorMixin
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
from transformers.trainer import Trainer
from axolotl.cli.art import print_axolotl_text_art
from axolotl.common.datasets import TrainDatasetMeta
from axolotl.contribs.lgpl import ( # pylint: disable = no-name-in-module
fix_untrained_tokens,
)
from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder
from axolotl.integrations.base import PluginManager
from axolotl.loaders import (
ModelLoader,
load_processor,
load_tokenizer,
from axolotl.core.trainers.mixins.sequence_parallel import (
SequenceParallelContextManager,
)
from axolotl.utils.ctx_managers.sequence_parallel import SequenceParallelContextManager
from axolotl.integrations.base import PluginManager
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import cleanup_distributed
from axolotl.utils.freeze import freeze_layers_except
from axolotl.utils.schemas.enums import RLType
from axolotl.utils.models import load_model, load_processor, load_tokenizer
from axolotl.utils.trainer import setup_trainer
try:
@@ -45,7 +41,7 @@ try:
except ImportError:
BetterTransformer = None
LOG = logging.getLogger(__name__)
LOG = get_logger(__name__)
def setup_model_and_tokenizer(
@@ -66,6 +62,7 @@ def setup_model_and_tokenizer(
# Load tokenizer
LOG.debug(
f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
main_process_only=True,
)
tokenizer = load_tokenizer(cfg)
@@ -80,8 +77,7 @@ def setup_model_and_tokenizer(
msg += " and peft_config..."
LOG.debug(msg)
model_loader = ModelLoader(cfg, tokenizer, processor=processor)
model, peft_config = model_loader.load()
model, peft_config = load_model(cfg, tokenizer, processor=processor)
if model.generation_config is not None:
model.generation_config.do_sample = True
@@ -111,15 +107,14 @@ def setup_reference_model(
Reference model if needed for RL training, `None` otherwise.
"""
model_ref = None
if cfg.rl and cfg.rl != RLType.ORPO:
if cfg.rl and cfg.rl != "orpo":
if cfg.adapter and not cfg.rl_adapter_ref_model:
# use built-in trl autounwrap
LOG.debug("Passing model_ref: None to RL trainer")
model_ref = None # explicit setting to None
else:
# load the model again for model_ref/baseline
model_loader = ModelLoader(cfg, tokenizer, reference_model=True)
model_ref, _ = model_loader.load()
model_ref, _ = load_model(cfg, tokenizer, reference_model=True)
return model_ref
@@ -193,33 +188,28 @@ def execute_training(
trainer: The configured trainer object.
resume_from_checkpoint: Path to checkpoint to resume from, if applicable.
"""
with ExitStack() as stack:
# Define the context managers to use
if cfg.flash_optimum:
stack.enter_context(
torch.backends.cuda.sdp_kernel(
enable_flash=True,
enable_math=True,
enable_mem_efficient=True,
)
)
# Define the context managers to use
flash_context = (
torch.backends.cuda.sdp_kernel(
enable_flash=True,
enable_math=True,
enable_mem_efficient=True,
)
if cfg.flash_optimum
else nullcontext()
)
sequence_parallel_context = (
SequenceParallelContextManager(
model=trainer.model,
sequence_parallel_degree=cfg.sequence_parallel_degree,
ring_attn_func=cfg.ring_attn_func,
)
if cfg.sequence_parallel_degree > 1
else nullcontext()
)
if cfg.sequence_parallel_degree > 1:
models = [trainer.model]
if hasattr(trainer, "ref_model"):
models.append(trainer.ref_model)
stack.enter_context(
SequenceParallelContextManager(
models=models,
sequence_parallel_degree=cfg.sequence_parallel_degree,
gradient_accumulation_steps=cfg.gradient_accumulation_steps,
ring_attn_func=cfg.ring_attn_func,
heads_k_stride=cfg.heads_k_stride,
)
)
LOG.info("Starting trainer...")
LOG.info("Starting trainer...")
with flash_context, sequence_parallel_context:
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
@@ -526,8 +516,6 @@ def train(
Returns:
Tuple of (model, tokenizer) after training
"""
print_axolotl_text_art()
# Setup model, tokenizer, (causal or RLHF) trainer, etc.
(
trainer,
@@ -537,9 +525,6 @@ def train(
processor,
) = setup_model_and_trainer(cfg, dataset_meta)
plugin_manager = PluginManager.get_instance()
plugin_manager.post_trainer_create(cfg, trainer)
# Handle untrained tokens if configured
safe_serialization = cfg.save_safetensors is True
train_dataset = dataset_meta.train_dataset
@@ -562,6 +547,7 @@ def train(
if not cfg.use_ray:
cleanup_distributed()
plugin_manager = PluginManager.get_instance()
plugin_manager.post_train(cfg, model)
return model, tokenizer, trainer

View File

@@ -43,12 +43,3 @@ def set_pytorch_cuda_alloc_conf():
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = (
"expandable_segments:True,roundup_power2_divisions:16"
)
def patch_optimized_env():
"""
Patch environment variables to improve VRAM usage and increase download speed
"""
if os.getenv("HF_HUB_ENABLE_HF_TRANSFER") is None:
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
set_pytorch_cuda_alloc_conf()

View File

@@ -868,28 +868,3 @@ class GCCallback(TrainerCallback):
):
torch.cuda.empty_cache()
gc.collect()
def colab_inference_post_train_callback(trainer: Trainer):
class ColabCallback(TrainerCallback):
"""Callback to prep model for inference on Google Colab"""
def __init__(self, cfg):
self.gpu_name = torch.cuda.get_device_name(0)
self.cfg = cfg
def on_train_end(
self, args, state, control, **kwargs
): # pylint: disable=unused-argument
"""
handle T4 gpu, we need to convert attention to eager for inference
"""
if "Tesla T4" in self.gpu_name and self.cfg.xformers_attention:
trainer.model.config._attn_implementation = ( # pylint: disable=protected-access
"eager"
)
trainer.model.gradient_checkpointing_disable()
trainer.model.config.use_cache = True
trainer.model.eval()
return ColabCallback

View File

@@ -1,7 +1,6 @@
"""MLFlow module for trainer callbacks"""
import logging
import os
from shutil import copyfile
from tempfile import NamedTemporaryFile
from typing import TYPE_CHECKING
@@ -17,11 +16,6 @@ if TYPE_CHECKING:
LOG = logging.getLogger("axolotl.callbacks")
def should_log_artifacts() -> bool:
truths = ["TRUE", "1", "YES"]
return os.getenv("HF_MLFLOW_LOG_ARTIFACTS", "FALSE").upper() in truths
class SaveAxolotlConfigtoMlflowCallback(TrainerCallback):
# pylint: disable=duplicate-code
"""Callback to save axolotl config to mlflow"""
@@ -38,18 +32,13 @@ class SaveAxolotlConfigtoMlflowCallback(TrainerCallback):
):
if is_main_process():
try:
if should_log_artifacts():
with NamedTemporaryFile(
mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
) as temp_file:
copyfile(self.axolotl_config_path, temp_file.name)
mlflow.log_artifact(temp_file.name, artifact_path="")
LOG.info(
"The Axolotl config has been saved to the MLflow artifacts."
)
else:
with NamedTemporaryFile(
mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
) as temp_file:
copyfile(self.axolotl_config_path, temp_file.name)
mlflow.log_artifact(temp_file.name, artifact_path="")
LOG.info(
"Skipping logging artifacts to MLflow (hf_mlflow_log_artifacts is false)"
"The Axolotl config has been saved to the MLflow artifacts."
)
except (FileNotFoundError, ConnectionError) as err:
LOG.warning(f"Error while saving Axolotl config to MLflow: {err}")

View File

@@ -11,10 +11,9 @@ from transformers.utils.import_utils import is_torch_npu_available
from axolotl.integrations.base import PluginManager
from axolotl.integrations.config import merge_input_args
from axolotl.loaders import MULTIMODAL_AUTO_MODEL_MAPPING
from axolotl.loaders.utils import load_model_config
from axolotl.utils.bench import log_gpu_memory_usage
from axolotl.utils.dict import DictDefault
from axolotl.utils.models import MULTIMODAL_AUTO_MODEL_MAPPING, load_model_config
from axolotl.utils.schemas.config import (
AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase,
)
@@ -60,7 +59,7 @@ def choose_device(cfg):
def resolve_dtype(cfg):
if (
not cfg.fp16 and cfg.bf16 == "auto" and not cfg.use_ray
cfg.bf16 == "auto" and not cfg.use_ray
): # if we use ray we want to defer this check to the worker node
if is_torch_bf16_gpu_available():
LOG.debug("bf16 support detected, enabling for this configuration.")
@@ -71,9 +70,6 @@ def resolve_dtype(cfg):
if cfg.fp16 is None and not cfg.float16:
cfg.fp16 = True
if cfg.fp16 and cfg.bf16 == "auto":
cfg.bf16 = False
if cfg.device == "mps":
cfg.load_in_8bit = False
cfg.tf32 = False

View File

@@ -1,6 +0,0 @@
"""Init for context manager submodule"""
# pylint: disable=unused-import
# flake8: noqa
from .sequence_parallel import SequenceParallelContextManager

View File

@@ -1,376 +0,0 @@
"""Module for Axolotl trainer sequence parallelism manager and utilities"""
import functools
import inspect
import torch
import torch.distributed as dist
from torch import nn
from torch.utils.hooks import RemovableHandle
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.utils import ModelOutput
from axolotl.monkeypatch.ring_attn import (
get_ring_attn_group,
patch_prepare_data_loader,
patch_prepare_device_mesh,
register_ring_attn,
update_ring_attn_params,
)
from axolotl.utils.schemas.enums import RingAttnFunc
# TODO(djsaunde): implement zigzag, stripe patterns here (and elsewhere) in this
# module. Currently, we just focus on batch ring and varlen llama3 for simplicity.
def apply_sequence_parallelism(
batch: dict[str, torch.Tensor],
local_rank: int,
local_world_size: int,
gradient_accumulation_steps: int,
ring_attn_func: RingAttnFunc, # pylint: disable=unused-argument
) -> tuple[dict[str, torch.Tensor], int, int]:
"""
Apply sequence parallelism slicing to a batch.
Special handling is implemented for integer logits_to_keep, which indicates
to only keep the last N tokens in the sequence during generation.
Args:
batch: Batch dictionary (e.g., input_ids, attention_mask, etc.).
local_rank: Local rank in the sequence parallel group.
local_world_size: World size of the sequence parallel group.
gradient_accumulation_steps: Number of steps to accumulate gradients over.
ring_attn_func: Which ring attention function to use. Currently unused, but
related to above TODO.
Returns:
tuple of:
- Batch dictionary with sliced tensors.
- The original sequence length before padding.
- The number of padding tokens added.
"""
original_seq_len = batch["input_ids"].size(1)
# Update ring attention params if needed
if batch.get("position_ids") is not None:
update_ring_attn_params(position_ids=batch["position_ids"])
else:
# If position_ids aren't already in the batch, create them
batch["position_ids"] = torch.arange(
0,
original_seq_len,
dtype=torch.long,
device=batch["input_ids"].device,
).expand(batch["input_ids"].size(0), -1)
if "logits_to_keep" in batch and isinstance(batch["logits_to_keep"], int):
logits_to_keep = batch["logits_to_keep"]
# Calculate which positions in the full sequence contain the last N tokens
start_position = max(0, original_seq_len - logits_to_keep)
chunk_size = original_seq_len // local_world_size
rank_start = local_rank * chunk_size
rank_end = rank_start + chunk_size
# Create a boolean mask tensor for this rank's chunk
mask = torch.zeros(
chunk_size,
dtype=torch.bool,
device=batch["input_ids"].device,
)
if rank_end > start_position:
# Calculate how many of the last N tokens fall within this rank's range
tokens_in_rank = min(rank_end, original_seq_len) - max(
rank_start, start_position
)
# Calculate where these tokens start in the local chunk
local_start_idx = max(0, start_position - rank_start)
# Set the appropriate positions in the mask to True
mask[local_start_idx : local_start_idx + tokens_in_rank] = True
# Replace the integer with the boolean mask
batch["logits_to_keep"] = mask
# Add padding to make sequence length divisible by local_world_size
total_seq_len = original_seq_len
pad_len = 0
divisor = min(local_world_size, 64)
if total_seq_len % divisor != 0:
pad_len = divisor - (total_seq_len % divisor)
# Apply padding to all relevant tensors
for key in batch:
if (
isinstance(batch[key], torch.Tensor)
and batch[key].dim() > 1
and batch[key].size(1) == total_seq_len
):
# Create padding tensor
pad_value = -100 if key == "labels" else 0
padding = torch.full(
(batch[key].size(0), pad_len, *batch[key].shape[2:]),
pad_value,
dtype=batch[key].dtype,
device=batch[key].device,
)
# Concatenate padding to the right side of the tensor
batch[key] = torch.cat([batch[key], padding], dim=1)
if key == "logits_to_keep":
# Create padding tensor
padding = torch.ones(
1,
dtype=batch[key].dtype,
device=batch[key].device,
)
# Concatenate padding to the right side of the tensor
batch[key] = torch.cat([batch[key], padding], dim=0)
# Update the total sequence length after padding
total_seq_len = batch["input_ids"].size(1)
# Slice batch for sequence parallel
for key in batch:
if not isinstance(batch[key], torch.Tensor) or batch[key].dim() <= 1:
continue
# Split in sequential fashion and grab this rank's chunk
if batch[key].size(1) == total_seq_len:
batch[key] = (
batch[key].chunk(local_world_size, dim=1)[local_rank].contiguous()
)
elif key == "logits_to_keep":
batch[key] = (
batch[key].chunk(local_world_size, dim=0)[local_rank].contiguous()
)
# Handle num_items_in_batch
if "num_items_in_batch" in batch:
# Approximation; this is needed since num_items_in_batch may be counted across
# all samples in a gradient accumulated batch, not on a per-step basis.
batch["num_items_in_batch"] = (
batch["labels"] != -100
).sum() * gradient_accumulation_steps
return batch, original_seq_len, pad_len
class SequenceParallelContextManager:
"""Context manager for sequence parallelism operations.
This class provides a context that will automatically apply sequence parallelism
during model forward passes using a pre-forward hook, and gather outputs from
across the sequence parallelism group using a post-forward hook.
Args:
models: List of models to apply sequence parallelism to pre- and post- forward
hooks.
sequence_parallel_degree: Number of processes to split sequences over.
gradient_accumulation_steps: Number of steps to accumulate gradients over.
ring_attn_func: Which ring attention function to use. Currently unused.
heads_k_stride: Sequence parallelism K head stride size. Passed through to
`varlen_llama3` `ring_flash_attn` implementation.
"""
def __init__(
self,
models: list[nn.Module],
sequence_parallel_degree: int,
gradient_accumulation_steps: int,
ring_attn_func: RingAttnFunc,
heads_k_stride: int | None,
):
self.models = models
self.sequence_parallel_degree = sequence_parallel_degree
self.gradient_accumulation_steps = gradient_accumulation_steps
self.ring_attn_func = ring_attn_func
self.heads_k_stride = heads_k_stride
self._register_ring_attn()
# Set distributed info for local rank
self.process_group = get_ring_attn_group()
self.local_rank = dist.get_rank(self.process_group)
self.local_world_size = dist.get_world_size(self.process_group)
# Will store hook handles for removal
self.hook_handles: list[RemovableHandle] = []
# Store original sequence length and padding information
self.original_seq_len = 0
self.pad_len = 0
# Create a partially applied version of the apply_sequence_parallelism function
self.apply_sequence_parallelism = functools.partial(
apply_sequence_parallelism,
local_rank=self.local_rank,
local_world_size=self.local_world_size,
gradient_accumulation_steps=self.gradient_accumulation_steps,
ring_attn_func=self.ring_attn_func,
)
def __enter__(self):
self._register_model_hooks()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
# Remove all hooks
for handle in self.hook_handles:
handle.remove()
self.hook_handles = []
# TODO(djsaunde): Un-patch attention and accelerate functions (low priority)
def _register_ring_attn(self):
# Initialize ring attn for sequence parallelism
register_ring_attn(
sequence_parallel_degree=self.sequence_parallel_degree,
heads_k_stride=self.heads_k_stride,
ring_attn_func=self.ring_attn_func,
)
# Patches for accelerate functionality
patch_prepare_data_loader()
patch_prepare_device_mesh(
sequence_parallel_degree=self.sequence_parallel_degree
)
def _register_model_hooks(self):
# Forward pre-hook to apply sequence parallelism
def sequence_parallel_pre_hook(_, args, kwargs):
# Get parameter names from the model's forward function
forward_params = list(
inspect.signature(self.models[0].forward).parameters.keys()
)
updated_kwargs = kwargs.copy()
for i, arg in enumerate(args):
if i < len(forward_params):
updated_kwargs[forward_params[i]] = arg
# Any excess positional arguments are kept as-is
remaining_args = args[len(forward_params) :]
# Apply sequence parallelism to updated kwargs
updated_kwargs, self.original_seq_len, self.pad_len = (
self.apply_sequence_parallelism(updated_kwargs)
)
return remaining_args, updated_kwargs
# Forward post-hook to gather outputs
def sequence_parallel_post_hook(_, __, output: ModelOutput) -> ModelOutput:
# Gather the sharded outputs
output = self._gather_outputs(output)
# Remove padding if it was added
if self.pad_len > 0:
for key, value in output.items():
if isinstance(value, torch.Tensor) and value.dim() > 1:
if value.size(1) == self.original_seq_len + self.pad_len:
# Slice to remove padding
output[key] = value[:, : self.original_seq_len].contiguous()
return output
# Register both hooks
for model in self.models:
self.hook_handles.append(
model.register_forward_pre_hook(
sequence_parallel_pre_hook, with_kwargs=True
)
)
self.hook_handles.append(
model.register_forward_hook(sequence_parallel_post_hook)
)
def _gather_outputs(self, output: CausalLMOutputWithPast) -> CausalLMOutputWithPast:
"""Gather sharded outputs from all ranks and reconstruct the full tensor."""
for key, value in output.items():
if isinstance(value, torch.Tensor) and value.dim() > 1:
output[key] = AllGatherWithGrad.apply(value, self.process_group)
return output
class AllGatherWithGrad(torch.autograd.Function):
"""Custom autograd function for all-gather to preserve gradients."""
@staticmethod
def forward(
ctx: torch.autograd.function.FunctionCtx,
input_tensor: torch.Tensor,
group: dist.ProcessGroup,
) -> torch.Tensor:
"""
Forward pass of all-gather of data with sequence dimension.
Args:
ctx: `torch.autograd` function context.
input_tensor: Tensor from model output with sequence dimension.
group: `torch.distributed` process group.
Returns:
Tensor from gathering the `input_tensor` from across the process group and
concatenating along the sequence dimension.
"""
ctx.group = group
ctx.rank = dist.get_rank(group)
world_size = dist.get_world_size(group)
# Gather shape metadata
local_shape = torch.tensor(list(input_tensor.shape), device=input_tensor.device)
all_shapes = [torch.zeros_like(local_shape) for _ in range(world_size)]
dist.all_gather(all_shapes, local_shape, group=group)
# Store sequence lengths for backward pass
seq_lens = [int(shape[1].item()) for shape in all_shapes]
ctx.seq_lens = seq_lens
# Perform all_gather operation
gathered = [
torch.zeros(
tuple(shape.tolist()),
dtype=input_tensor.dtype,
device=input_tensor.device,
)
for shape in all_shapes
]
dist.all_gather(gathered, input_tensor, group=group)
# Concatenate tensors along sequence dimension
result = torch.cat(gathered, dim=1)
return result
@staticmethod
def backward(
ctx: torch.autograd.function.FunctionCtx, grad_output: torch.Tensor
) -> tuple[torch.Tensor, None]:
"""
Backward pass for all-gather operation.
Extracts the gradient slice corresponding to this rank's original input
from the full gradient tensor.
Args:
ctx: `torch.autograd` function context.
grad_output: Gradient from subsequent layers with respect to the
concatenated output tensor.
Returns:
Tuple containing the gradient slice for this rank's input tensor and `None`
for the process group parameter which doesn't require gradients.
"""
rank = ctx.rank
seq_lens = ctx.seq_lens
# Extract gradient for this rank's chunk
offset = sum(seq_lens[:rank])
grad_slice = grad_output[:, offset : offset + seq_lens[rank]].contiguous()
return grad_slice, None
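As a toy, single-process illustration of what the pre-forward hook's slicing and the post-forward all-gather do to the sequence dimension (length, degree, and values below are assumptions): each rank keeps one contiguous chunk, and concatenating the chunks in rank order recovers the original tensor.
import torch

local_world_size = 2  # sequence_parallel_degree
input_ids = torch.arange(8).unsqueeze(0)  # [1, 8] toy sequence

# what the pre-forward hook would leave on each rank
shards = [
    input_ids.chunk(local_world_size, dim=1)[rank].contiguous()
    for rank in range(local_world_size)
]
print(shards[0], shards[1])  # tensor([[0, 1, 2, 3]]) tensor([[4, 5, 6, 7]])

# what the post-forward hook's all-gather reconstructs across the group
full = torch.cat(shards, dim=1)
assert torch.equal(full, input_ids)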

View File

@@ -10,7 +10,6 @@ import yaml
from datasets import Dataset, DatasetDict, concatenate_datasets, load_from_disk
from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
from axolotl.loaders import load_tokenizer
from axolotl.prompt_strategies.dpo import load as load_dpo
from axolotl.prompt_strategies.kto import load as load_kto
from axolotl.prompt_strategies.orpo import load as load_orpo
@@ -18,9 +17,9 @@ from axolotl.utils.data.shared import datasets_w_name_generator, load_dataset_w_
from axolotl.utils.data.utils import deduplicate_and_log_datasets, md5
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import is_main_process, zero_first
from axolotl.utils.schemas.enums import RLType
from axolotl.utils.models import load_tokenizer
LOG = logging.getLogger(__name__)
LOG = logging.getLogger("axolotl")
def _get_path(ds_hash, cfg):
@@ -72,7 +71,6 @@ def map_dataset(cfg, data_set, ds_transform_fn, tokenizer, **map_kwargs):
data_set = data_set.map(
ds_transform_fn,
desc="Mapping RL Dataset",
num_proc=cfg.dataset_processes,
**map_kwargs,
)
@@ -82,7 +80,7 @@ def map_dataset(cfg, data_set, ds_transform_fn, tokenizer, **map_kwargs):
def drop_long_rl_seq(
sample, rl, tokenizer, sequence_len # pylint: disable=invalid-name
):
if rl in (RLType.DPO, RLType.IPO, RLType.ORPO, RLType.SIMPO):
if rl in ("dpo", "ipo", "orpo", "simpo"):
if not (
sample.get("prompt") and sample.get("chosen") and sample.get("rejected")
):
@@ -102,7 +100,7 @@ def drop_long_rl_seq(
len_prompt + len_rejected
) <= sequence_len
if rl is RLType.KTO:
if rl == "kto":
if not (sample.get("prompt") and sample.get("completion")):
raise ValueError("Prompt and completion keys are required for KTO datasets")
@@ -116,7 +114,7 @@ def drop_long_rl_seq(
return (len_prompt + len_completion) <= sequence_len
if rl is RLType.GRPO:
if rl == "grpo":
return True
raise ValueError("Unknown RL type")
@@ -139,9 +137,9 @@ def load_prepare_preference_datasets(cfg):
if _type:
if isinstance(_type, DictDefault):
_type = "user_defined.default"
if _cfg.rl is RLType.ORPO:
if _cfg.rl == "orpo":
ds_transform_fn = load_orpo(_type, _cfg, dataset_idx=i)
elif _cfg.rl is RLType.KTO:
elif _cfg.rl == "kto":
ds_transform_fn = load_kto(_type, _cfg, dataset_idx=i)
else:
ds_transform_fn = load_dpo(_type, _cfg, dataset_idx=i)
@@ -152,7 +150,7 @@ def load_prepare_preference_datasets(cfg):
split_datasets[i] = map_dataset(
cfg, data_set, ds_transform_fn, tokenizer, **map_kwargs
)
elif _cfg.rl is RLType.KTO:
elif _cfg.rl == "kto":
ds_transform_fn = load_kto(_type, _cfg, dataset_idx=i)
map_kwargs = {}
if isinstance(ds_transform_fn, tuple):
@@ -187,7 +185,7 @@ def load_prepare_preference_datasets(cfg):
)
combined_datasets = concatenate_datasets(split_datasets)
combined_datasets = combined_datasets.shuffle(seed=cfg.seed or 42)
combined_datasets = combined_datasets.shuffle(seed=cfg.seed)
return combined_datasets
@@ -207,8 +205,6 @@ def load_prepare_preference_datasets(cfg):
eval_dataset = load_split(cfg.test_datasets, cfg)
if not eval_dataset:
if cfg.val_set_size:
seed = cfg.seed if cfg.seed is not None else 42
# ensure we end up with the same fingerprint by doing rank0 first and being able to cache
to_hash_train = (
train_dataset._fingerprint # pylint: disable=protected-access
@@ -217,7 +213,7 @@ def load_prepare_preference_datasets(cfg):
+ "|"
+ "train"
+ "|"
+ str(seed)
+ str(cfg.seed or 42)
)
to_hash_test = (
train_dataset._fingerprint # pylint: disable=protected-access
@@ -226,13 +222,13 @@ def load_prepare_preference_datasets(cfg):
+ "|"
+ "test"
+ "|"
+ str(seed)
+ str(cfg.seed or 42)
)
train_fingerprint = md5(to_hash_train)
test_fingerprint = md5(to_hash_test)
ds_w_test_split = train_dataset.train_test_split(
test_size=cfg.val_set_size,
seed=seed,
seed=cfg.seed,
shuffle=False,
train_new_fingerprint=train_fingerprint,
test_new_fingerprint=test_fingerprint,

Some files were not shown because too many files have changed in this diff.