Compare commits

1 commit: devstral-s... → lora-quant

| Author | SHA1 | Date |
|---|---|---|
|  | 1a22d16842 |  |
.github/workflows/main.yml (vendored, 10 changes)

@@ -31,11 +31,6 @@ jobs:
             python_version: "3.11"
             pytorch: 2.7.0
             axolotl_extras:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.7.0
-            axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout

@@ -99,11 +94,6 @@ jobs:
             python_version: "3.11"
             pytorch: 2.7.0
             axolotl_extras:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.7.0
-            axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
.github/workflows/multi-gpu-e2e.yml (vendored, 2 changes)

@@ -3,7 +3,7 @@ name: docker-multigpu-tests-biweekly
 on:
   pull_request:
     paths:
-      - 'tests/e2e/multigpu/**.py'
+      - 'tests/e2e/multigpu/*.py'
      - 'requirements.txt'
      - 'setup.py'
      - 'pyproject.toml'
.github/workflows/preview-docs.yml (vendored, 6 changes)

@@ -4,12 +4,6 @@ on:
   pull_request:
     types: [opened, synchronize, reopened]
-
-    # Run the workflow only when one of these files changes
-    paths:
-      - '**/*.md'   # any Markdown file
-      - '**/*.qmd'  # any Quarto file
-      - '_quarto.yaml'

 permissions:
   checks: write
   contents: write
.github/workflows/tests-nightly.yml (vendored, 87 changes)

@@ -18,96 +18,9 @@ jobs:
     env:
       SKIP: no-commit-to-branch

-  preload-cache:
-    name: Preload HF cache
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python_version: ["3.11"]
-        pytorch_version: ["2.6.0"]
-    timeout-minutes: 20
-
-    env:
-      AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
-
-    steps:
-      - name: Check out repository code
-        uses: actions/checkout@v4
-
-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python_version }}
-          cache: 'pip' # caching pip dependencies
-
-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
-
-      - name: Install PyTorch
-        run: |
-          pip3 install torch==${{ matrix.pytorch_version }}
-
-      - name: Install dependencies
-        run: |
-          pip3 show torch
-          pip3 install --no-build-isolation -U -e .
-          python scripts/unsloth_install.py | sh
-          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
-
-      - name: Make sure PyTorch version wasn't clobbered
-        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
-
-      - name: Ensure axolotl CLI was installed
-        run: |
-          axolotl --help
-
-      - name: Pre-Download dataset fixture
-        run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-
-      - name: Run tests
-        run: |
-          pytest -v tests/conftest.py
-
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
-        with:
-          token: ${{ secrets.CODECOV_TOKEN }}
-          files: ./coverage.xml
-          flags: unittests,pytorch-${{ matrix.pytorch_version }}
-          fail_ci_if_error: false
-
-      - name: cleanup pip cache
-        run: |
-          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
-
-      - name: Save HF cache
-        id: hf-cache
-        uses: actions/cache/save@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
-
   pytest:
     name: PyTest
     runs-on: ubuntu-latest
-    needs: [preload-cache]
     strategy:
       fail-fast: false
       max-parallel: 2
.github/workflows/tests.yml (vendored, 213 changes)

@@ -44,104 +44,12 @@ jobs:
     env:
       SKIP: no-commit-to-branch

-  # preload-cache:
-  #   name: Preload HF cache
-  #   runs-on: ubuntu-latest
-  #   strategy:
-  #     fail-fast: false
-  #     matrix:
-  #       python_version: ["3.11"]
-  #       pytorch_version: ["2.6.0"]
-  #   timeout-minutes: 20
-  #
-  #   env:
-  #     AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
-  #
-  #   steps:
-  #     - name: Check out repository code
-  #       uses: actions/checkout@v4
-  #
-  #     - name: Restore HF cache
-  #       id: hf-cache-restore
-  #       uses: actions/cache/restore@v4
-  #       with:
-  #         path: |
-  #           /home/runner/.cache/huggingface/hub/datasets--*
-  #           /home/runner/.cache/huggingface/hub/models--*
-  #         key: ${{ runner.os }}-hf-hub-cache-v2
-  #
-  #     - name: Restore Cache from S3
-  #       id: hf-cache-restore-s3
-  #       run: |
-  #         mkdir -p /home/runner/.cache/huggingface/hub
-  #         curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
-  #
-  #     - name: Setup Python
-  #       uses: actions/setup-python@v5
-  #       with:
-  #         python-version: ${{ matrix.python_version }}
-  #         cache: 'pip' # caching pip dependencies
-  #
-  #     - name: upgrade pip
-  #       run: |
-  #         pip3 install --upgrade pip
-  #         pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
-  #
-  #     - name: Install PyTorch
-  #       run: |
-  #         pip3 install torch==${{ matrix.pytorch_version }}
-  #
-  #     - name: Install dependencies
-  #       run: |
-  #         pip3 show torch
-  #         pip3 install --no-build-isolation -U -e .
-  #         python scripts/unsloth_install.py | sh
-  #         python scripts/cutcrossentropy_install.py | sh
-  #         pip3 install -r requirements-dev.txt -r requirements-tests.txt
-  #
-  #     - name: Make sure PyTorch version wasn't clobbered
-  #       run: |
-  #         python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
-  #
-  #     - name: Ensure axolotl CLI was installed
-  #       run: |
-  #         axolotl --help
-  #
-  #     - name: Pre-Download dataset fixture
-  #       run: |
-  #         huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-  #
-  #     - name: Run tests
-  #       run: |
-  #         pytest -v tests/conftest.py
-  #
-  #     - name: Upload coverage to Codecov
-  #       uses: codecov/codecov-action@v5
-  #       with:
-  #         token: ${{ secrets.CODECOV_TOKEN }}
-  #         files: ./coverage.xml
-  #         flags: unittests,pytorch-${{ matrix.pytorch_version }}
-  #         fail_ci_if_error: false
-  #
-  #     - name: cleanup pip cache
-  #       run: |
-  #         find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
-  #
-  #     - name: Save HF cache
-  #       id: hf-cache
-  #       uses: actions/cache/save@v4
-  #       with:
-  #         path: |
-  #           /home/runner/.cache/huggingface/hub/datasets--*
-  #           /home/runner/.cache/huggingface/hub/models--*
-  #         key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
-
   pytest:
     name: PyTest
     runs-on: ubuntu-latest
-    # needs: [preload-cache]
     strategy:
       fail-fast: false
+      max-parallel: 2
       matrix:
         python_version: ["3.11"]
         pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]

@@ -151,20 +59,14 @@ jobs:
       - name: Check out repository code
         uses: actions/checkout@v4

-      # - name: Restore HF cache
-      #   id: hf-cache-restore
-      #   uses: actions/cache/restore@v4
-      #   with:
-      #     path: |
-      #       /home/runner/.cache/huggingface/hub/datasets--*
-      #       /home/runner/.cache/huggingface/hub/models--*
-      #     key: ${{ runner.os }}-hf-hub-cache-v2
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-v2

-      - name: Restore Cache from S3
-        id: hf-cache-restore-s3
-        run: |
-          mkdir -p /home/runner/.cache/huggingface/hub
-          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
-
       - name: Setup Python
         uses: actions/setup-python@v5

@@ -219,12 +121,21 @@ jobs:
         run: |
           find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

+      - name: Save HF cache
+        id: hf-cache
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}

   pytest-sdist:
     name: PyTest from Source Dist
     runs-on: ubuntu-latest
-    # needs: [preload-cache]
     strategy:
       fail-fast: false
+      max-parallel: 1
       matrix:
         python_version: ["3.11"]
         pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]

@@ -234,20 +145,14 @@ jobs:
       - name: Check out repository code
         uses: actions/checkout@v4

-      # - name: Restore HF cache
-      #   id: hf-cache-restore
-      #   uses: actions/cache/restore@v4
-      #   with:
-      #     path: |
-      #       /home/runner/.cache/huggingface/hub/datasets--*
-      #       /home/runner/.cache/huggingface/hub/models--*
-      #     key: ${{ runner.os }}-hf-hub-cache-v2
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-v2

-      - name: Restore Cache from S3
-        id: hf-cache-restore-s3
-        run: |
-          mkdir -p /home/runner/.cache/huggingface/hub
-          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
-
       - name: Setup Python
         uses: actions/setup-python@v5

@@ -294,8 +199,16 @@ jobs:
         run: |
           find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

+      - name: Save HF cache
+        id: hf-cache
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}

   docker-e2e-tests-1st:
-    # Run this job first as a gate for running the remainder of the test matrix
     if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
     # this job needs to be run on self-hosted GPU runners...
     runs-on: [self-hosted, modal]

@@ -342,8 +255,6 @@ jobs:
     # this job needs to be run on self-hosted GPU runners...
     runs-on: [self-hosted, modal]
     timeout-minutes: 90
-    # Only run the remainder of the matrix if the first e2e check passed;
-    # this is to save on wasted compute costs for known failures that get caught in the first run
     needs: [pre-commit, pytest, docker-e2e-tests-1st]

     strategy:

@@ -356,6 +267,12 @@ jobs:
             pytorch: 2.6.0
             num_gpus: 1
             axolotl_extras: llmcompressor
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.4.1
+            num_gpus: 1
+            axolotl_extras:
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"

@@ -368,12 +285,6 @@ jobs:
             pytorch: 2.7.0
             num_gpus: 1
             axolotl_extras:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.7.0
-            num_gpus: 1
-            axolotl_extras:
     steps:
       - name: Checkout
         uses: actions/checkout@v4

@@ -398,43 +309,3 @@ jobs:
       - name: Run tests job on Modal
         run: |
           modal run cicd.e2e_tests
-
-  docker-e2e-cleanup:
-    runs-on: [self-hosted, modal]
-    timeout-minutes: 90
-    needs: [docker-e2e-tests]
-
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            num_gpus: 1
-            axolotl_extras: vllm
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: Install Modal
-        run: |
-          python -m pip install --upgrade pip
-          pip install modal==0.71.8 jinja2
-      - name: Update env vars
-        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
-          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
-          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
-          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
-          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
-          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
-          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
-      - name: Run tests job on Modal
-        run: |
-          modal run cicd.cleanup
@@ -57,9 +57,7 @@ async def handler(job):
     logger.info("Training Complete.")

     # Cleanup
-    if "WANDB_API_KEY" in os.environ:
-        del os.environ["WANDB_API_KEY"]
-    if "HF_TOKEN" in os.environ:
-        del os.environ["HF_TOKEN"]
+    del os.environ["WANDB_API_KEY"]
+    del os.environ["HF_TOKEN"]

_quarto.yml (17 changes)

@@ -48,22 +48,8 @@ quartodoc:
      contents:
        - core.trainers.base
        - core.trainers.trl
-       - core.trainers.mamba
-       - core.trainers.relora
        - core.trainers.dpo.trainer
        - core.trainers.grpo.trainer
-       - core.trainers.grpo.sampler
-       - core.trainers.utils
-   - title: Mixins
-     desc: Mixin classes for augmenting trainers
-     contents:
-       - core.trainers.mixins.optimizer
-       - core.trainers.mixins.rng_state_loader
-       - core.trainers.mixins.scheduler
-   - title: Context Managers
-     desc: Context managers for altering trainer behaviors
-     contents:
-       - utils.ctx_managers.sequence_parallel
    - title: Prompt Strategies
      desc: Prompt formatting strategies
      contents:

@@ -138,8 +124,7 @@ quartodoc:
        - utils.optimizers.adopt
        - utils.data.pretraining
        - utils.data.sft
-       - utils.gradient_checkpointing.offload_cpu
-       - utils.gradient_checkpointing.offload_disk
+       - utils.gradient_checkpointing.unsloth
    - title: Schemas
      desc: Pydantic data models for Axolotl config
      contents:
@@ -18,7 +18,7 @@ pytest -v --durations=10 \
   --cov-append

 # Run patched tests excluding lora kernels with coverage append
-pytest --full-trace -vvv --durations=10 \
+pytest -v --durations=10 \
   --ignore=tests/e2e/patched/lora_kernels \
   /workspace/axolotl/tests/e2e/patched \
   --cov=axolotl \
@@ -1,19 +0,0 @@
-"""Modal app to run axolotl GPU cleanup"""
-
-from .single_gpu import VOLUME_CONFIG, app, cicd_image, run_cmd
-
-
-@app.function(
-    image=cicd_image,
-    timeout=60 * 60,
-    cpu=8.0,
-    memory=131072,
-    volumes=VOLUME_CONFIG,
-)
-def cleanup():
-    run_cmd("./cicd/cleanup.sh", "/workspace/axolotl")
-
-
-@app.local_entrypoint()
-def main():
-    cleanup.remote()
@@ -1,6 +0,0 @@
-#!/bin/bash
-set -e
-
-# cleanup old cache files for datasets processing and intermediate mappings
-find /workspace/data/huggingface-cache/hub/datasets -name "cache-*" -type f -mtime +1 -exec rm {} \;
-find /workspace/data/huggingface-cache/hub/datasets -name "*.lock" -type f -mtime +1 -exec rm {} \;
@@ -1,12 +1,75 @@
 """Modal app to run axolotl GPU tests"""

-from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd
+# pylint: disable=duplicate-code
+
+import os
+import pathlib
+import tempfile
+
+import jinja2
+import modal
+from jinja2 import select_autoescape
+from modal import App, Image
+
+cicd_path = pathlib.Path(__file__).parent.resolve()
+
+template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
+template_env = jinja2.Environment(
+    loader=template_loader, autoescape=select_autoescape()
+)
+df_template = template_env.get_template("Dockerfile.jinja")
+
+df_args = {
+    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
+    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
+    "CUDA": os.environ.get("CUDA", "121"),
+    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
+    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
+    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
+    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
+    "HF_HOME": "/workspace/data/huggingface-cache/hub",
+}
+
+dockerfile_contents = df_template.render(**df_args)
+
+temp_dir = tempfile.mkdtemp()
+with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
+    f.write(dockerfile_contents)
+
+cicd_image = Image.from_dockerfile(
+    pathlib.Path(temp_dir) / "Dockerfile",
+    context_mount=None,
+    force_build=True,
+    gpu="A10G",
+).env(df_args)
+
+app = App("Axolotl CI/CD", secrets=[])
+
+hf_cache_volume = modal.Volume.from_name(
+    "axolotl-ci-hf-hub-cache", create_if_missing=True
+)
+VOLUME_CONFIG = {
+    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
+}
+
+N_GPUS = int(os.environ.get("N_GPUS", 1))
+GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
+
+
+def run_cmd(cmd: str, run_folder: str):
+    import subprocess  # nosec
+
+    # Propagate errors from subprocess.
+    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
+        exit(exit_code)  # pylint: disable=consider-using-sys-exit
+
+
 @app.function(
     image=cicd_image,
     gpu=GPU_CONFIG,
-    timeout=90 * 60,  # 90 min
+    timeout=60 * 60,
     cpu=8.0,
     memory=131072,
     volumes=VOLUME_CONFIG,

@@ -70,7 +70,7 @@ def run_cmd(cmd: str, run_folder: str):
     image=cicd_image,
     gpu=GPU_CONFIG,
     timeout=90 * 60,
-    cpu=16.0,
+    cpu=8.0,
     memory=131072 * N_GPUS,
     volumes=VOLUME_CONFIG,
 )
@@ -1,66 +0,0 @@
-"""Modal app to run axolotl GPU tests"""
-
-# pylint: disable=duplicate-code
-
-import os
-import pathlib
-import tempfile
-
-import jinja2
-import modal
-from jinja2 import select_autoescape
-from modal import App, Image
-
-cicd_path = pathlib.Path(__file__).parent.resolve()
-
-template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
-template_env = jinja2.Environment(
-    loader=template_loader, autoescape=select_autoescape()
-)
-df_template = template_env.get_template("Dockerfile.jinja")
-
-df_args = {
-    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
-    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
-    "CUDA": os.environ.get("CUDA", "121"),
-    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
-    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
-    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
-    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
-    "HF_HOME": "/workspace/data/huggingface-cache/hub",
-}
-
-dockerfile_contents = df_template.render(**df_args)
-
-temp_dir = tempfile.mkdtemp()
-with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
-    f.write(dockerfile_contents)
-
-cicd_image = Image.from_dockerfile(
-    pathlib.Path(temp_dir) / "Dockerfile",
-    context_mount=None,
-    force_build=True,
-    gpu="A10G",
-).env(df_args)
-
-app = App("Axolotl CI/CD", secrets=[])
-
-hf_cache_volume = modal.Volume.from_name(
-    "axolotl-ci-hf-hub-cache", create_if_missing=True
-)
-VOLUME_CONFIG = {
-    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
-}
-
-N_GPUS = int(os.environ.get("N_GPUS", 1))
-GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
-
-
-def run_cmd(cmd: str, run_folder: str):
-    import subprocess  # nosec
-
-    # Propagate errors from subprocess.
-    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
-        exit(exit_code)  # pylint: disable=consider-using-sys-exit
@@ -19,7 +19,7 @@ coverage:
       if_no_uploads: error
       if_not_found: success
       if_ci_failed: error
-      only_pulls: true
+      only_pulls: false
       flags: null
       paths: null
     patch:
@@ -32,8 +32,6 @@ tokenizer_legacy:
 resize_token_embeddings_to_32x:
 # Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.
 shrink_embeddings:
-# Optional[bool] Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs
-embeddings_skip_upcast:
 # Whether to load the model with randomly initialized weights. Useful for
 # pre-training a model from scratch or debugging purposes.
 random_init_weights:

@@ -75,12 +73,11 @@ load_in_8bit: true
 load_in_4bit:

 # Use CUDA bf16
-bf16: true # bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection. require >=ampere
+bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
 # Use CUDA fp16
 fp16: true
 # Use CUDA tf32
 tf32: true # require >=ampere
-# Note: if bf16 is set to 'auto', and fp16 is set to true, we will prefer the explict fp16 setting

 # No AMP (automatic mixed precision)
 bfloat16: true # require >=ampere

@@ -187,8 +184,8 @@ datasets:
     # adding a system turn with empty content.
     drop_system_message:

-    # Optional[bool]. (for Qwen3 template only) Whether to split the assistant content based on a reasoning trace inside delimited tags
-    # See example at `docs/dataset-formats/conversation.qmd`
+    # Optional[bool]. Whether to split the assistant turn based on a reasoning trace inside delimited tags
+    # defaults to False
     split_thinking:

     # IMPORTANT: The following fields determine which parts of the conversation to train on.

@@ -505,7 +502,6 @@ save_strategy: # Set to `"no"` to skip checkpoint saves, `"epoch"` at end of eac
 save_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps
 saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
 save_total_limit: # Checkpoints saved at a time
-save_only_model: # Save only the model weights, skipping the optimizer. Using this means you can't resume from checkpoints.
 # Maximum number of iterations to train for. It precedes num_epochs which means that
 # if both are set, num_epochs will not be guaranteed.
 # e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps

@@ -539,7 +535,7 @@ train_on_inputs: false
 # Note that training loss may have an oscillating pattern with this enabled.
 group_by_length: false

-# Whether to use gradient checkpointing. Available options are: true, false, "offload", "offload_disk".
+# Whether to use gradient checkpointing. Available options are: true, false, "offload".
 # https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
 gradient_checkpointing: false
 # additional kwargs to pass to the trainer for gradient checkpointing

@@ -551,7 +547,7 @@ gradient_checkpointing: false
 early_stopping_patience: 3

 # Specify a scheduler and kwargs to use with the optimizer
-lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | 'linear' | 'cosine_with_restarts' | 'polynomial' | 'constant' | 'constant_with_warmup' | 'inverse_sqrt' | 'reduce_lr_on_plateau' | 'cosine_with_min_lr' | 'warmup_stable_decay' | empty for cosine
+lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | empty for cosine
 lr_scheduler_kwargs:
 cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
 cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)

@@ -613,7 +609,6 @@ lr_div_factor: # Learning rate div factor
 # - optimi_adamw
 # - ao_adamw_8bit
 # - ao_adamw_fp8
-# - came_pytorch
 optimizer:
 # Dictionary of arguments to pass to the optimizer
 optim_args:

@@ -633,9 +628,7 @@ weight_decay:
 # adamw hyperparams
 adam_beta1:
 adam_beta2:
-adam_beta3: # only used for CAME Optimizer
 adam_epsilon:
-adam_epsilon2: # only used for CAME Optimizer
 # Gradient clipping max norm
 max_grad_norm:
@@ -196,34 +196,6 @@ datasets:
 It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
 :::

-8. (For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
-
-```yaml
-datasets:
-  - path: ...
-    type: chat_template
-    chat_template: qwen3
-    split_thinking: true
-```
-
-For example, a content can look like:
-
-```json
-{
-    "content": "<think>Some thinking outputs</think>Output after thinking."
-}
-```
-
-After split, it will look like:
-
-```json
-{
-    "reasoning_content": "Some thinking outputs",
-    "content": "Output after thinking..."
-}
-```
-
 ## sharegpt

 ::: {.callout-important}
@@ -8,10 +8,6 @@ format:

 This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).

-::: {.callout-important}
-For Blackwell GPUs, please use the tags with Pytorch 2.7.0 and CUDA 12.8.
-:::
-
 ## Base

 The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.
@@ -104,7 +104,7 @@ the `alpaca` dataset format, which has the following format:
 Please see our [Dataset Formats](dataset-formats) for more dataset formats and how to
 format them.

-2. Prepare your JSONL data in the specified format (in this case, the expected `alpaca`
+2. Prepare your JSONL data in the specified format (in this case, the expected `alpaca
    format):

 ```json

@@ -120,12 +120,6 @@ axolotl train my_training.yml

 ## Common Tasks {#sec-common-tasks}

-::: {.callout-tip}
-
-The same yaml file is used for training, inference, and merging.
-
-:::
-
 ### Testing Your Model {#sec-testing}

 After training, test your model:

@@ -134,16 +128,6 @@ After training, test your model:
 axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out"
 ```

-More details can be found in [Inference](inference.qmd).
-
-### Using a UI {#sec-ui}
-
-Launch a Gradio interface:
-
-```bash
-axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out" --gradio
-```
-
 ### Preprocessing Data {#sec-preprocessing}

 For large datasets, preprocess first:

@@ -152,22 +136,14 @@ For large datasets, preprocess first:
 axolotl preprocess my_training.yml
 ```

-Please make sure to set `dataset_prepared_path: ` in your config to set the path to save the prepared dataset.
+### Using a UI {#sec-ui}

-More details can be found in [Dataset Preprocessing](dataset_preprocessing.qmd).
+Launch a Gradio interface:

-### Merging LoRA weights {#sec-merging-lora}
-
-To merge the LoRA weights back into the base model, run:
-
 ```bash
-axolotl merge-lora my_training.yml --lora-model-dir="./outputs/lora-out"
+axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out" --gradio
 ```

-The merged model will be saved in the `{output_dir}/merged` directory.
-
-More details can be found in [Merging LoRA weights](inference.qmd#sec-merging).
-
 ## Next Steps {#sec-next-steps}

 Now that you have the basics, you might want to:

@@ -180,7 +156,6 @@ Now that you have the basics, you might want to:
 Check our other guides for details on these topics:

 - [Configuration Guide](config.qmd) - Full configuration options
-- [Dataset Loading](dataset-loading.qmd) - Loading datasets from various sources
 - [Dataset Formats](dataset-formats) - Working with different data formats
 - [Multi-GPU Training](multi-gpu.qmd)
 - [Multi-Node Training](multi-node.qmd)
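Editor's note on the hunk above: the quickstart's LoRA-merging walkthrough is removed. For context, a minimal sketch of that workflow assembled only from commands shown in this diff (the config name and adapter path are placeholders):

```bash
# Train the adapter, then fold it back into the base model weights.
axolotl train my_training.yml
axolotl merge-lora my_training.yml --lora-model-dir="./outputs/lora-out"
# Per the removed text, the merged weights are written to {output_dir}/merged.
```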
@@ -25,10 +25,6 @@ Please make sure to have Pytorch installed before installing Axolotl in your loc
 Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
 :::

-::: {.callout-important}
-For Blackwell GPUs, please use Pytorch 2.7.0 and CUDA 12.8.
-:::
-
 ### PyPI Installation (Recommended) {#sec-pypi}

 ```{.bash}

@@ -76,10 +72,6 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
 ```
 :::

-::: {.callout-important}
-For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.7.0` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.7.0`.
-:::
-
 Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.

 ## Cloud Environments {#sec-cloud}
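Editor's note: the removed callouts above pointed Blackwell users at the cu128 / PyTorch 2.7.0 images. For reference, a hedged sketch of pulling one of the named tags (the tag string is taken verbatim from the removed line; adjust to your CUDA and PyTorch needs):

```bash
# Pull the Blackwell-compatible image named in the removed callout.
docker pull axolotlai/axolotl:main-py3.11-cu128-2.7.0
```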
@@ -87,7 +87,20 @@ We support sequence parallelism (SP) via the
 allows one to split up sequences across GPUs, which is useful in the event that a
 single sequence causes OOM errors during model training.

-See our [dedicated guide](sequence_parallelism.qmd) for more information.
+First, install `ring-flash-attn`, recommended via `pip install axolotl[ring-flash-attn]`,
+or from source with `pip install .[ring-flash-attn]`.
+
+Your Axolotl YAML config should contain the following lines:
+
+```{.yaml}
+sequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU
+flash_attention: true  # Required with sequence parallelism
+
+# Optional; strides across the key dimension. Larger values use more memory but will make training faster.
+heads_k_stride: 1
+```
+
+See our [dedicated guide](sequence_parallelism.qmd) for more details.

 ### FSDP + QLoRA {#sec-fsdp-qlora}
@@ -3,6 +3,8 @@ title: Sequence Parallelism
 description: Train with long sequences split across multiple GPUs.
 ---

+# Sequence Parallelism
+
 Sequence parallelism is a technique that splits sequences across multiple GPUs,
 allowing you to train with very long sequences that wouldn't fit on a single GPU. Each
 GPU processes a different portion of the sequence, and the results are aggregated

@@ -25,7 +27,7 @@ To enable sequence parallelism, add the following to your configuration file:
 sequence_parallel_degree: 4  # Split sequences across 4 GPUs
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 heads_k_stride: 1
-# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
+# Optional; one of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to
 # "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
 ring_attn_func:
 ```

@@ -41,7 +43,7 @@ When sequence parallelism is enabled:

 1. Each sequence is divided into equal chunks across the GPUs in a sequence parallel group
 2. The data collator handles the chunking of input_ids, attention_mask, labels, and position_ids
-3. Position IDs are adjusted to maintain proper relative positions
+3. Position IDs are adjusted to maintain proper relative positions, especially for packed sequences
 4. The trainer uses special ring communication patterns for attention operations

 ## Requirements

@@ -67,11 +69,9 @@ sequence_len: 8192
 ...

 sequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU
+flash_attention: true  # Required with sequence parallelism
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 heads_k_stride: 1
-# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
-# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
-ring_attn_func:

 ...
 ```
@@ -34,5 +34,3 @@ We provide a script to delinearize Llama 4 linearized models into regular Huggin
 ```bash
 axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir
 ```
-
-Note: This only works with the non-quantized linearized model. If you have an adapter, merge it with the *non-quantized linearized* model before delinearizing.
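Editor's note: the removed note above says an adapter must be merged into the non-quantized linearized model before delinearizing. A minimal sketch of that order of operations, using only commands that appear elsewhere in this diff (all paths are placeholders):

```bash
# 1. Merge the LoRA adapter into the non-quantized, linearized base model.
axolotl merge-lora my_training.yml --lora-model-dir="./outputs/lora-out"
# 2. Delinearize the merged model back into regular Hugging Face format.
axolotl delinearize-llama4 --model path/to/merged_model_dir --output path/to/output_dir
```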
@@ -1,48 +0,0 @@
-base_model: mistralai/Devstral-Small-2505
-processor_type: AutoProcessor
-
-# these 3 lines are needed for now to handle vision chat templates w images
-skip_prepare_dataset: true
-remove_unused_columns: false
-sample_packing: false
-
-chat_template: mistral_v7_tekken
-datasets:
-  - path: HuggingFaceH4/llava-instruct-mix-vsft
-    type: chat_template
-    split: train[:1%]
-    field_messages: messages
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-output_dir: ./outputs/out
-
-sequence_len: 2048
-pad_to_sequence_len: false
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-fp16:
-tf32: false
-
-gradient_checkpointing: true
-logging_steps: 1
-flash_attention: false
-eager_attention:
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
-weight_decay: 0.0
-special_tokens:
@@ -1,341 +0,0 @@
-# Finetuning LLMs to output audio
-
-In this example, we finetune Orpcanopylabs/orpheus-tts-0.1-pretrained (a LLaMA 3.2 3b model) to output audio.
-
-The `finetune.yml` withe current settings will run on any Nvidia GPU with 45GB VRAM or more. If you adjust the batch size it can easily run on any GPU under 24GB.
-
-## Dataset pre-processing for pre-training
-If you are adding another voice in English, please jump ahead to finetuning pre-processing.
-
-For this to work, we need to preprocess our dataset. Since we are expecting to output audio, we will need to add tokens to the tokenizer.
-
-Using this code, it will download the SNAC model and add the correct tokens and upload the final dataset.
-
-```python
-import torch
-from snac import SNAC
-from datasets import load_dataset
-from huggingface_hub import snapshot_download
-from datasets import load_dataset
-import random
-import torchaudio.transforms as T
-from transformers import AutoTokenizer
-import os
-
-my_original_dataset_name = "<huggingface-id-of-dataset-that-we-want-to-preprocess>"
-name_to_push_dataset_to = "<huggingface-id-of-where-to-save-dataset>"
-
-dsn = my_original_dataset_name
-
-snapshot_download(
-    repo_id=dsn,
-    repo_type="dataset",
-    revision="main",
-    max_workers=64,
-)
-
-
-ds = load_dataset(dsn, split="train")
-ds_sample_rate = ds[0]["audio"]["sampling_rate"]
-
-model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
-model = model.to("mps")
-
-def tokenise_audio(waveform):
-    waveform = torch.from_numpy(waveform).unsqueeze(0)
-    waveform = waveform.to(dtype=torch.float32)
-    resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
-    waveform = resample_transform(waveform)
-
-    waveform = waveform.unsqueeze(0).to("cuda")
-
-    #generate the codes from snac
-    with torch.inference_mode():
-        codes = model.encode(waveform)
-
-    all_codes = []
-    for i in range(codes[0].shape[1]):
-        all_codes.append(codes[0][0][i].item()+128266)
-        all_codes.append(codes[1][0][2*i].item()+128266+4096)
-        all_codes.append(codes[2][0][4*i].item()+128266+(2*4096))
-        all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096))
-        all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096))
-        all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096))
-        all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096))
-
-
-    return all_codes
-
-def add_codes(example):
-    # Always initialize codes_list to None
-    codes_list = None
-
-    try:
-        answer_audio = example.get("audio")
-        # If there's a valid audio array, tokenise it
-        if answer_audio and "array" in answer_audio:
-            audio_array = answer_audio["array"]
-            codes_list = tokenise_audio(audio_array)
-    except Exception as e:
-        print(f"Skipping row due to error: {e}")
-        # Keep codes_list as None if we fail
-    example["codes_list"] = codes_list
-
-    return example
-
-ds = ds.map(add_codes, remove_columns=["audio"])
-
-#@title Load Tokenizer
-tokeniser_length = 128256
-start_of_text = 128000
-end_of_text = 128009
-
-start_of_speech = tokeniser_length + 1
-end_of_speech = tokeniser_length + 2
-
-start_of_human = tokeniser_length + 3
-end_of_human = tokeniser_length + 4
-
-start_of_ai = tokeniser_length + 5
-end_of_ai = tokeniser_length + 6
-pad_token = tokeniser_length + 7
-
-audio_tokens_start = tokeniser_length + 10
-
-tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"
-
-
-tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-num_proc = os.cpu_count() - 2
-
-ds = ds.filter(lambda x: x["codes_list"] is not None)
-ds = ds.filter(lambda x: len(x["codes_list"]) > 0)
-
-#@title Create Input Ids
-def remove_duplicate_frames(example):
-    vals = example["codes_list"]
-    if len(vals) % 7 != 0:
-        raise ValueError("Input list length must be divisible by 7")
-
-    result = vals[:7]
-
-    removed_frames = 0
-
-    for i in range(7, len(vals), 7):
-        current_first = vals[i]
-        previous_first = result[-7]
-
-        if current_first != previous_first:
-            result.extend(vals[i:i+7])
-        else:
-            removed_frames += 1
-
-    example["codes_list"] = result
-
-    return example
-
-ds = ds.map(remove_duplicate_frames, num_proc=num_proc)
-
-
-def create_input_ids(example):
-    text_ids = tokenizer.encode({example['text']}, add_special_tokens=True)
-    text_ids.append(end_of_text)
-    example["text_tokens"] = text_ids
-    input_ids = (
-        [start_of_human]
-        + example["text_tokens"]
-        + [end_of_human]
-        + [start_of_ai]
-        + [start_of_speech]
-        + example["codes_list"]
-        + [end_of_speech]
-        + [end_of_ai]
-    )
-    example["input_ids"] = input_ids
-    example["labels"] = input_ids
-    example["attention_mask"] = [1] * len(input_ids)
-
-    return example
-
-ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])
-
-#@title Remove unnecessary columns
-columns_to_keep = ["input_ids", "labels", "attention_mask"]
-columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]
-
-ds = ds.remove_columns(columns_to_remove)
-
-ds.push_to_hub(name_to_push_dataset_to)
-```
-
-
-## Finetune pre-processing
-Use this code to add a new voice.
-
```python
import os

import torch
import torchaudio.transforms as T
from datasets import load_dataset
from huggingface_hub import snapshot_download
from snac import SNAC
from transformers import AutoTokenizer

my_original_dataset_name = "<huggingface-id-of-dataset-that-we-want-to-preprocess>"
name_to_push_dataset_to = "<huggingface-id-of-where-to-save-dataset>"

dsn = my_original_dataset_name

snapshot_download(
    repo_id=dsn,
    repo_type="dataset",
    revision="main",
    max_workers=64,
)

ds = load_dataset(dsn, split="train")
ds_sample_rate = ds[0]["audio"]["sampling_rate"]

device = "cuda"  # use "mps" or "cpu" if no CUDA GPU is available
model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
model = model.to(device)


def tokenise_audio(waveform):
    waveform = torch.from_numpy(waveform).unsqueeze(0)
    waveform = waveform.to(dtype=torch.float32)
    resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
    waveform = resample_transform(waveform)

    waveform = waveform.unsqueeze(0).to(device)

    # generate the codes from snac
    with torch.inference_mode():
        codes = model.encode(waveform)

    # flatten the three SNAC codebook levels (1 + 2 + 4 codes per frame) into
    # 7 tokens per frame, each position shifted into its own 4096-sized id range
    all_codes = []
    for i in range(codes[0].shape[1]):
        all_codes.append(codes[0][0][i].item()+128266)
        all_codes.append(codes[1][0][2*i].item()+128266+4096)
        all_codes.append(codes[2][0][4*i].item()+128266+(2*4096))
        all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096))
        all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096))
        all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096))
        all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096))

    return all_codes


def add_codes(example):
    # Always initialize codes_list to None
    codes_list = None

    try:
        answer_audio = example.get("audio")
        # If there's a valid audio array, tokenise it
        if answer_audio and "array" in answer_audio:
            audio_array = answer_audio["array"]
            codes_list = tokenise_audio(audio_array)
    except Exception as e:
        print(f"Skipping row due to error: {e}")
        # Keep codes_list as None if we fail
    example["codes_list"] = codes_list

    return example


ds = ds.map(add_codes, remove_columns=["audio"])

#@title Load Tokenizer
tokeniser_length = 128256
start_of_text = 128000
end_of_text = 128009

start_of_speech = tokeniser_length + 1
end_of_speech = tokeniser_length + 2

start_of_human = tokeniser_length + 3
end_of_human = tokeniser_length + 4

start_of_ai = tokeniser_length + 5
end_of_ai = tokeniser_length + 6
pad_token = tokeniser_length + 7

audio_tokens_start = tokeniser_length + 10

tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
num_proc = os.cpu_count() - 2

ds = ds.filter(lambda x: x["codes_list"] is not None)
ds = ds.filter(lambda x: len(x["codes_list"]) > 0)

#@title Create Input Ids
def remove_duplicate_frames(example):
    vals = example["codes_list"]
    if len(vals) % 7 != 0:
        raise ValueError("Input list length must be divisible by 7")

    result = vals[:7]

    removed_frames = 0

    for i in range(7, len(vals), 7):
        current_first = vals[i]
        previous_first = result[-7]

        # a repeated first (coarsest) code usually indicates a duplicated frame, so skip it
        if current_first != previous_first:
            result.extend(vals[i:i+7])
        else:
            removed_frames += 1

    example["codes_list"] = result

    return example

ds = ds.map(remove_duplicate_frames, num_proc=num_proc)

tok_info = '''*** HERE you can modify the text prompt
e.g. for a multispeaker model like canopylabs/orpheus-3b-0.1-ft, you can pass
f"{example['source']}: {example['text']}" instead of just the text
(the function below uses the "speaker_id" column for this).
'''
print(tok_info)


def create_input_ids(example):
    text_ids = tokenizer.encode(f"{example['speaker_id']}: {example['text']}", add_special_tokens=True)
    text_ids.append(end_of_text)
    example["text_tokens"] = text_ids
    input_ids = (
        [start_of_human]
        + example["text_tokens"]
        + [end_of_human]
        + [start_of_ai]
        + [start_of_speech]
        + example["codes_list"]
        + [end_of_speech]
        + [end_of_ai]
    )
    example["input_ids"] = input_ids
    example["labels"] = input_ids
    example["attention_mask"] = [1] * len(input_ids)

    return example

ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])

#@title Remove unnecessary columns
columns_to_keep = ["input_ids", "labels", "attention_mask"]
columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]

ds = ds.remove_columns(columns_to_remove)

ds.push_to_hub(name_to_push_dataset_to)
```
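
A rough way to sanity-check one processed row before (or after) pushing, assuming the special-token variables defined in the script above are still in scope:

```python
# minimal sanity check on a single processed row
row = ds[0]
ids = row["input_ids"]

assert ids[0] == start_of_human and ids[-1] == end_of_ai
speech = ids[ids.index(start_of_speech) + 1 : ids.index(end_of_speech)]
assert len(speech) % 7 == 0, "audio codes should come in 7-token frames"
print(f"{len(speech) // 7} SNAC frames, {len(ids)} tokens total")
```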

## Training

After preprocessing is done, fill out the blanks in `finetune.yml` and simply run `axolotl train finetune.yml`.

## Inference

For inference, please refer to the original [orpheus github](https://github.com/canopyai/Orpheus-TTS/tree/main).
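
As a rough, illustrative sketch of the decode side only (not the reference implementation from that repo), the audio tokens a finetuned model generates between `start_of_speech` and `end_of_speech` can be mapped back to a waveform by inverting the 7-tokens-per-frame flattening used in the preprocessing scripts above:

```python
import torch
from snac import SNAC

snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to("cuda")

def speech_tokens_to_audio(token_ids):
    """Invert the flattening from preprocessing. `token_ids` is the slice of
    generated ids strictly between start_of_speech and end_of_speech."""
    codes = [t - 128266 - (k % 7) * 4096 for k, t in enumerate(token_ids)]
    assert len(codes) % 7 == 0, "expected whole 7-token frames"

    level_0, level_1, level_2 = [], [], []
    for f in range(0, len(codes), 7):
        c = codes[f:f + 7]
        level_0 += [c[0]]                    # coarse codebook, 1 code per frame
        level_1 += [c[1], c[4]]              # 2 codes per frame
        level_2 += [c[2], c[3], c[5], c[6]]  # 4 codes per frame
    layers = [
        torch.tensor(level, dtype=torch.long, device="cuda").unsqueeze(0)
        for level in (level_0, level_1, level_2)
    ]
    with torch.inference_mode():
        return snac_model.decode(layers)     # (1, 1, num_samples) waveform at 24 kHz
```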
@@ -1,52 +0,0 @@
base_model: canopylabs/orpheus-3b-0.1-pretrained

hub_model_id: <your-hub-model-id>

plugins:
  - axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true

datasets:
  - path: <your-hf-dataset-id>
    type: # leave empty to load pre-tokenized
dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out

sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 8
micro_batch_size: 4
num_epochs: 3
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 2e-5

bf16: auto
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_steps: 20
evals_per_epoch: 5
saves_per_epoch: 5
weight_decay: 0.05

special_tokens:
  pad_token: <custom_token_7>
@@ -2,6 +2,7 @@ base_model: Qwen/Qwen2.5-0.5B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

chat_template: qwen_25
rl: dpo
datasets:
@@ -6,17 +6,16 @@ triton>=3.0.0
mamba-ssm==1.2.0.post1
xformers>=0.0.23.post1
autoawq==0.2.7.post3
liger-kernel==0.5.9
liger-kernel==0.5.8
# END section

packaging==23.2

huggingface_hub==0.31.0
peft==0.15.2
transformers==4.51.3
tokenizers>=0.21.1
accelerate==1.6.0
datasets==3.5.1
datasets==3.5.0
deepspeed>=0.15.4
trl==0.17.0
hf_xet==1.1.0
5
setup.py
@@ -67,13 +67,13 @@ def parse_requirements(extras_require_map):
    if (major, minor) >= (2, 7):
        _install_requires.pop(_install_requires.index(xformers_version))
        # _install_requires.append("xformers==0.0.29.post3")  # xformers seems to be hard pinned to 2.6.0
        extras_require_map["vllm"] = ["vllm==0.8.5.post1"]
        extras_require_map["vllm"] = ["vllm==0.8.5"]
    elif (major, minor) >= (2, 6):
        _install_requires.pop(_install_requires.index(xformers_version))
        _install_requires.append(
            "xformers==0.0.29.post2"
        )  # vllm needs post2 w torch 2.6
        extras_require_map["vllm"] = ["vllm==0.8.5.post1"]
        extras_require_map["vllm"] = ["vllm==0.8.5"]
    elif (major, minor) >= (2, 5):
        _install_requires.pop(_install_requires.index(xformers_version))
        if patch == 0:
@@ -142,7 +142,6 @@ extras_require = {
        "apollo-torch",
        "lomo-optim==0.1.1",
        "torch-optimi==0.2.1",
        "came_pytorch==0.1.3",
    ],
    "ray": [
        "ray[train]",
@@ -82,12 +82,6 @@ class VllmServeCliArgs:
            "hardware support this feature."
        },
    )
    serve_module: Optional[str] = field(
        default=None,
        metadata={
            "help": "Module to serve. If not set, the default module will be used."
        },
    )


@dataclass
@@ -16,15 +16,8 @@ AXOLOTL_LOGO = """
@@@@ @@@@@@@@@@@@@@@@
"""

HAS_PRINTED_LOGO = False


def print_axolotl_text_art():
    """Prints axolotl ASCII art."""

    global HAS_PRINTED_LOGO  # pylint: disable=global-statement
    if HAS_PRINTED_LOGO:
        return
    if is_main_process():
        HAS_PRINTED_LOGO = True
        print(AXOLOTL_LOGO)
@@ -15,7 +15,7 @@ from axolotl.cli.checks import check_accelerate_default_config, check_user_token
from axolotl.cli.config import load_cfg
from axolotl.common.datasets import load_datasets, load_preference_datasets
from axolotl.evaluate import evaluate
from axolotl.utils import patch_optimized_env
from axolotl.utils import set_pytorch_cuda_alloc_conf
from axolotl.utils.dict import DictDefault

LOG = logging.getLogger(__name__)
@@ -32,7 +32,7 @@ def do_evaluate(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
        cli_args: CLI arguments.
    """
    # Enable expandable segments for cuda allocation to improve VRAM usage
    patch_optimized_env()
    set_pytorch_cuda_alloc_conf()

    # pylint: disable=duplicate-code
    print_axolotl_text_art()
@@ -29,7 +29,7 @@ from axolotl.cli.utils import (
    filter_none_kwargs,
)
from axolotl.integrations.lm_eval.cli import lm_eval
from axolotl.utils import patch_optimized_env
from axolotl.utils import set_pytorch_cuda_alloc_conf
from axolotl.utils.schemas.config import AxolotlInputConfig


@@ -55,8 +55,6 @@ def preprocess(config: str, cloud: Optional[str] = None, **kwargs) -> None:
        kwargs: Additional keyword arguments which correspond to CLI args or `axolotl`
            config options.
    """
    patch_optimized_env()

    if cloud:
        from axolotl.cli.cloud import do_cli_preprocess

@@ -102,7 +100,7 @@ def train(
            config options.
    """
    # Enable expandable segments for cuda allocation to improve VRAM usage
    patch_optimized_env()
    set_pytorch_cuda_alloc_conf()

    if "use_ray" in kwargs and kwargs["use_ray"]:
        accelerate = False
@@ -18,7 +18,6 @@ from axolotl.cli.checks import check_accelerate_default_config, check_user_token
|
|||||||
from axolotl.cli.config import load_cfg
|
from axolotl.cli.config import load_cfg
|
||||||
from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
|
from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
|
||||||
from axolotl.common.datasets import load_datasets, load_preference_datasets
|
from axolotl.common.datasets import load_datasets, load_preference_datasets
|
||||||
from axolotl.integrations.base import PluginManager
|
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
from axolotl.utils.trainer import disable_datasets_caching
|
from axolotl.utils.trainer import disable_datasets_caching
|
||||||
|
|
||||||
@@ -48,10 +47,7 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
|
|||||||
cfg.dataset_prepared_path = DEFAULT_DATASET_PREPARED_PATH
|
cfg.dataset_prepared_path = DEFAULT_DATASET_PREPARED_PATH
|
||||||
|
|
||||||
with disable_datasets_caching():
|
with disable_datasets_caching():
|
||||||
plugin_manager = PluginManager.get_instance()
|
if cfg.rl:
|
||||||
if plugin_manager.load_datasets(cfg, preprocess=True):
|
|
||||||
pass
|
|
||||||
elif cfg.rl:
|
|
||||||
load_preference_datasets(cfg=cfg, cli_args=cli_args)
|
load_preference_datasets(cfg=cfg, cli_args=cli_args)
|
||||||
else:
|
else:
|
||||||
load_datasets(cfg=cfg, cli_args=cli_args)
|
load_datasets(cfg=cfg, cli_args=cli_args)
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ from axolotl.cli.config import load_cfg
|
|||||||
from axolotl.common.datasets import load_datasets, load_preference_datasets
|
from axolotl.common.datasets import load_datasets, load_preference_datasets
|
||||||
from axolotl.integrations.base import PluginManager
|
from axolotl.integrations.base import PluginManager
|
||||||
from axolotl.train import train
|
from axolotl.train import train
|
||||||
from axolotl.utils import patch_optimized_env
|
from axolotl.utils import set_pytorch_cuda_alloc_conf
|
||||||
from axolotl.utils.config import normalize_config, resolve_dtype
|
from axolotl.utils.config import normalize_config, resolve_dtype
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
|
|
||||||
@@ -36,16 +36,13 @@ def do_train(cfg: DictDefault, cli_args: TrainerCliArgs):
|
|||||||
cli_args: Training-specific CLI arguments.
|
cli_args: Training-specific CLI arguments.
|
||||||
"""
|
"""
|
||||||
# Enable expandable segments for cuda allocation to improve VRAM usage
|
# Enable expandable segments for cuda allocation to improve VRAM usage
|
||||||
patch_optimized_env()
|
set_pytorch_cuda_alloc_conf()
|
||||||
|
|
||||||
print_axolotl_text_art()
|
print_axolotl_text_art()
|
||||||
check_accelerate_default_config()
|
check_accelerate_default_config()
|
||||||
if int(os.getenv("LOCAL_RANK", "0")) == 0:
|
if int(os.getenv("LOCAL_RANK", "0")) == 0:
|
||||||
check_user_token()
|
check_user_token()
|
||||||
|
|
||||||
plugin_manager = PluginManager.get_instance()
|
|
||||||
dataset_meta = plugin_manager.load_datasets(cfg, preprocess=False)
|
|
||||||
if not dataset_meta:
|
|
||||||
if cfg.rl:
|
if cfg.rl:
|
||||||
dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)
|
dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -20,9 +20,8 @@ from transformers import (
|
|||||||
ProcessorMixin,
|
ProcessorMixin,
|
||||||
)
|
)
|
||||||
|
|
||||||
from axolotl.loaders import load_processor, load_tokenizer
|
|
||||||
from axolotl.loaders.model import ModelLoader
|
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
|
from axolotl.utils.models import load_model, load_processor, load_tokenizer
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -319,8 +318,7 @@ def load_model_and_tokenizer(
|
|||||||
tokenizer = load_tokenizer(cfg)
|
tokenizer = load_tokenizer(cfg)
|
||||||
|
|
||||||
LOG.info("loading model...")
|
LOG.info("loading model...")
|
||||||
model_loader = ModelLoader(cfg, tokenizer, inference=inference)
|
model, _ = load_model(cfg, tokenizer, inference=inference)
|
||||||
model, _ = model_loader.load()
|
|
||||||
|
|
||||||
processor = None
|
processor = None
|
||||||
if cfg.is_multimodal:
|
if cfg.is_multimodal:
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from pathlib import Path
|
|||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
from trl.scripts.vllm_serve import ScriptArguments
|
from trl.scripts.vllm_serve import ScriptArguments
|
||||||
|
from trl.scripts.vllm_serve import main as vllm_serve_main
|
||||||
|
|
||||||
from axolotl.cli.config import load_cfg
|
from axolotl.cli.config import load_cfg
|
||||||
|
|
||||||
@@ -27,9 +28,6 @@ def do_vllm_serve(
|
|||||||
cfg = load_cfg(config)
|
cfg = load_cfg(config)
|
||||||
model = cfg.base_model
|
model = cfg.base_model
|
||||||
|
|
||||||
serve_module = cli_args.get("serve_module", "trl.scripts.vllm_serve")
|
|
||||||
vllm_serve_main = getattr(__import__(serve_module, fromlist=["main"]), "main")
|
|
||||||
|
|
||||||
tensor_parallel_size = (
|
tensor_parallel_size = (
|
||||||
cli_args.get("tensor_parallel_size") or cfg.vllm.tensor_parallel_size
|
cli_args.get("tensor_parallel_size") or cfg.vllm.tensor_parallel_size
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -10,11 +10,10 @@ from datasets import Dataset
|
|||||||
|
|
||||||
import axolotl.monkeypatch.data.batch_dataset_fetcher # pylint: disable=unused-import # noqa: F401
|
import axolotl.monkeypatch.data.batch_dataset_fetcher # pylint: disable=unused-import # noqa: F401
|
||||||
from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs
|
from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs
|
||||||
from axolotl.loaders import load_processor, load_tokenizer
|
|
||||||
from axolotl.utils.data import prepare_dataset
|
from axolotl.utils.data import prepare_dataset
|
||||||
from axolotl.utils.data.rl import load_prepare_preference_datasets
|
from axolotl.utils.data.rl import load_prepare_preference_datasets
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
from axolotl.utils.schemas.enums import RLType
|
from axolotl.utils.models import load_processor, load_tokenizer
|
||||||
from axolotl.utils.tokenization import check_dataset_labels
|
from axolotl.utils.tokenization import check_dataset_labels
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
@@ -49,7 +48,6 @@ def load_datasets(
|
|||||||
*,
|
*,
|
||||||
cfg: DictDefault,
|
cfg: DictDefault,
|
||||||
cli_args: PreprocessCliArgs | TrainerCliArgs | None = None,
|
cli_args: PreprocessCliArgs | TrainerCliArgs | None = None,
|
||||||
debug: bool = False,
|
|
||||||
) -> TrainDatasetMeta:
|
) -> TrainDatasetMeta:
|
||||||
"""
|
"""
|
||||||
Loads one or more training or evaluation datasets, calling
|
Loads one or more training or evaluation datasets, calling
|
||||||
@@ -58,7 +56,6 @@ def load_datasets(
|
|||||||
Args:
|
Args:
|
||||||
cfg: Dictionary mapping `axolotl` config keys to values.
|
cfg: Dictionary mapping `axolotl` config keys to values.
|
||||||
cli_args: Command-specific CLI arguments.
|
cli_args: Command-specific CLI arguments.
|
||||||
debug: Whether to print out tokenization of sample
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dataclass with fields for training and evaluation datasets and the computed
|
Dataclass with fields for training and evaluation datasets and the computed
|
||||||
@@ -80,25 +77,20 @@ def load_datasets(
|
|||||||
preprocess_iterable=preprocess_iterable,
|
preprocess_iterable=preprocess_iterable,
|
||||||
)
|
)
|
||||||
|
|
||||||
if ( # pylint: disable=too-many-boolean-expressions
|
if cli_args and (
|
||||||
cli_args
|
|
||||||
and (
|
|
||||||
cli_args.debug
|
cli_args.debug
|
||||||
or cfg.debug
|
or cfg.debug
|
||||||
or cli_args.debug_text_only
|
or cli_args.debug_text_only
|
||||||
or int(cli_args.debug_num_examples) > 0
|
or int(cli_args.debug_num_examples) > 0
|
||||||
)
|
):
|
||||||
) or debug:
|
|
||||||
LOG.info("check_dataset_labels...")
|
LOG.info("check_dataset_labels...")
|
||||||
|
|
||||||
num_examples = cli_args.debug_num_examples if cli_args else 1
|
train_samples = sample_dataset(train_dataset, cli_args.debug_num_examples)
|
||||||
text_only = cli_args.debug_text_only if cli_args else False
|
|
||||||
train_samples = sample_dataset(train_dataset, num_examples)
|
|
||||||
check_dataset_labels(
|
check_dataset_labels(
|
||||||
train_samples,
|
train_samples,
|
||||||
tokenizer,
|
tokenizer,
|
||||||
num_examples=num_examples,
|
num_examples=cli_args.debug_num_examples,
|
||||||
text_only=text_only,
|
text_only=cli_args.debug_text_only,
|
||||||
)
|
)
|
||||||
|
|
||||||
LOG.info("printing prompters...")
|
LOG.info("printing prompters...")
|
||||||
@@ -134,7 +126,7 @@ def load_preference_datasets(
|
|||||||
total_num_steps: Optional[int] = int(
|
total_num_steps: Optional[int] = int(
|
||||||
math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
|
math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
|
||||||
)
|
)
|
||||||
if cfg.rl is RLType.GRPO:
|
if cfg.rl == "grpo":
|
||||||
total_num_steps = None
|
total_num_steps = None
|
||||||
|
|
||||||
if cli_args.debug or cfg.debug:
|
if cli_args.debug or cfg.debug:
|
||||||
|
|||||||
@@ -21,7 +21,6 @@ import importlib.util
|
|||||||
import inspect
|
import inspect
|
||||||
import logging
|
import logging
|
||||||
import math
|
import math
|
||||||
import os
|
|
||||||
import sys
|
import sys
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -59,7 +58,6 @@ from axolotl.core.training_args import (
|
|||||||
AxolotlTrainingArguments,
|
AxolotlTrainingArguments,
|
||||||
)
|
)
|
||||||
from axolotl.integrations.base import PluginManager
|
from axolotl.integrations.base import PluginManager
|
||||||
from axolotl.loaders.utils import ensure_dtype
|
|
||||||
from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
|
from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
|
||||||
from axolotl.monkeypatch.relora import ReLoRACallback
|
from axolotl.monkeypatch.relora import ReLoRACallback
|
||||||
from axolotl.monkeypatch.trainer.lr import patch_trainer_get_lr
|
from axolotl.monkeypatch.trainer.lr import patch_trainer_get_lr
|
||||||
@@ -74,7 +72,6 @@ from axolotl.utils.callbacks import (
|
|||||||
SaveBetterTransformerModelCallback,
|
SaveBetterTransformerModelCallback,
|
||||||
bench_eval_callback_factory,
|
bench_eval_callback_factory,
|
||||||
causal_lm_bench_eval_callback_factory,
|
causal_lm_bench_eval_callback_factory,
|
||||||
colab_inference_post_train_callback,
|
|
||||||
log_prediction_callback_factory,
|
log_prediction_callback_factory,
|
||||||
)
|
)
|
||||||
from axolotl.utils.callbacks.lisa import lisa_callback_factory
|
from axolotl.utils.callbacks.lisa import lisa_callback_factory
|
||||||
@@ -87,7 +84,8 @@ from axolotl.utils.collators import (
|
|||||||
V2BatchSamplerDataCollatorForSeq2Seq,
|
V2BatchSamplerDataCollatorForSeq2Seq,
|
||||||
)
|
)
|
||||||
from axolotl.utils.collators.mm_chat import MultiModalChatDataCollator
|
from axolotl.utils.collators.mm_chat import MultiModalChatDataCollator
|
||||||
from axolotl.utils.schemas.enums import CustomSupportedOptimizers, RLType
|
from axolotl.utils.models import ensure_dtype
|
||||||
|
from axolotl.utils.schemas.enums import CustomSupportedOptimizers
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import torch._dynamo # pylint: disable=ungrouped-imports
|
import torch._dynamo # pylint: disable=ungrouped-imports
|
||||||
@@ -170,9 +168,6 @@ class TrainerBuilderBase(abc.ABC):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.cfg.gc_steps:
|
|
||||||
callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))
|
|
||||||
|
|
||||||
if self.cfg.use_wandb:
|
if self.cfg.use_wandb:
|
||||||
callbacks.append(
|
callbacks.append(
|
||||||
SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
|
SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
|
||||||
@@ -254,6 +249,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
if self.cfg.loss_watchdog_threshold is not None:
|
if self.cfg.loss_watchdog_threshold is not None:
|
||||||
callbacks.append(LossWatchDogCallback(self.cfg))
|
callbacks.append(LossWatchDogCallback(self.cfg))
|
||||||
|
|
||||||
|
if self.cfg.gc_steps:
|
||||||
|
callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))
|
||||||
|
|
||||||
return callbacks
|
return callbacks
|
||||||
|
|
||||||
def get_post_trainer_create_callbacks(self, trainer):
|
def get_post_trainer_create_callbacks(self, trainer):
|
||||||
@@ -295,10 +293,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
|
if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
|
||||||
callbacks.append(lisa_callback_factory(trainer))
|
callbacks.append(lisa_callback_factory(trainer))
|
||||||
|
|
||||||
if any("COLAB_" in key for key in os.environ):
|
|
||||||
ColabCallback = colab_inference_post_train_callback(trainer)
|
|
||||||
callbacks.append(ColabCallback(self.cfg))
|
|
||||||
|
|
||||||
callbacks.extend(super().get_post_trainer_create_callbacks(trainer=trainer))
|
callbacks.extend(super().get_post_trainer_create_callbacks(trainer=trainer))
|
||||||
return callbacks
|
return callbacks
|
||||||
|
|
||||||
@@ -353,7 +347,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
training_arguments_kwargs["warmup_steps"] = warmup_steps
|
training_arguments_kwargs["warmup_steps"] = warmup_steps
|
||||||
training_arguments_kwargs["logging_steps"] = logging_steps
|
training_arguments_kwargs["logging_steps"] = logging_steps
|
||||||
|
|
||||||
if self.cfg.seed is not None:
|
if self.cfg.seed:
|
||||||
training_arguments_kwargs["seed"] = self.cfg.seed
|
training_arguments_kwargs["seed"] = self.cfg.seed
|
||||||
|
|
||||||
if self.cfg.gradient_checkpointing:
|
if self.cfg.gradient_checkpointing:
|
||||||
@@ -387,12 +381,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
training_arguments_kwargs["adam_beta1"] = self.cfg.adam_beta1
|
training_arguments_kwargs["adam_beta1"] = self.cfg.adam_beta1
|
||||||
if self.cfg.adam_beta2:
|
if self.cfg.adam_beta2:
|
||||||
training_arguments_kwargs["adam_beta2"] = self.cfg.adam_beta2
|
training_arguments_kwargs["adam_beta2"] = self.cfg.adam_beta2
|
||||||
if self.cfg.adam_beta3:
|
|
||||||
training_arguments_kwargs["adam_beta3"] = self.cfg.adam_beta3
|
|
||||||
if self.cfg.adam_epsilon:
|
if self.cfg.adam_epsilon:
|
||||||
training_arguments_kwargs["adam_epsilon"] = self.cfg.adam_epsilon
|
training_arguments_kwargs["adam_epsilon"] = self.cfg.adam_epsilon
|
||||||
if self.cfg.adam_epsilon2:
|
|
||||||
training_arguments_kwargs["adam_epsilon2"] = self.cfg.adam_epsilon2
|
|
||||||
if self.cfg.max_grad_norm:
|
if self.cfg.max_grad_norm:
|
||||||
training_arguments_kwargs["max_grad_norm"] = self.cfg.max_grad_norm
|
training_arguments_kwargs["max_grad_norm"] = self.cfg.max_grad_norm
|
||||||
|
|
||||||
@@ -551,6 +541,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
report_to = []
|
report_to = []
|
||||||
if self.cfg.use_wandb:
|
if self.cfg.use_wandb:
|
||||||
report_to.append("wandb")
|
report_to.append("wandb")
|
||||||
|
if self.cfg.wandb_name:
|
||||||
|
training_arguments_kwargs["run_name"] = self.cfg.wandb_name
|
||||||
if self.cfg.use_mlflow:
|
if self.cfg.use_mlflow:
|
||||||
report_to.append("mlflow")
|
report_to.append("mlflow")
|
||||||
if self.cfg.use_tensorboard:
|
if self.cfg.use_tensorboard:
|
||||||
@@ -710,20 +702,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
optimizer_cls = ADOPT
|
optimizer_cls = ADOPT
|
||||||
adam_kwargs["decouple"] = True
|
adam_kwargs["decouple"] = True
|
||||||
optimizer_kwargs.update(adam_kwargs)
|
optimizer_kwargs.update(adam_kwargs)
|
||||||
elif self.cfg.optimizer == "came_pytorch":
|
|
||||||
from came_pytorch import CAME
|
|
||||||
|
|
||||||
optimizer_cls = CAME
|
|
||||||
|
|
||||||
beta1 = training_arguments_kwargs.get("adam_beta1", 0.9)
|
|
||||||
beta2 = training_arguments_kwargs.get("adam_beta2", 0.999)
|
|
||||||
beta3 = training_arguments_kwargs.get("adam_beta3", 0.9999)
|
|
||||||
eps1 = training_arguments_kwargs.get("adam_epsilon", 1e-30)
|
|
||||||
eps2 = training_arguments_kwargs.get("adam_epsilon2", 1e-16)
|
|
||||||
adam_kwargs["betas"] = (beta1, beta2, beta3)
|
|
||||||
adam_kwargs["eps"] = (eps1, eps2)
|
|
||||||
|
|
||||||
optimizer_kwargs.update(adam_kwargs)
|
|
||||||
|
|
||||||
# Parse any additional optimizer args from config
|
# Parse any additional optimizer args from config
|
||||||
if self.cfg.optim_args:
|
if self.cfg.optim_args:
|
||||||
@@ -798,6 +776,11 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
self.cfg.kd_top_k_before_softmax
|
self.cfg.kd_top_k_before_softmax
|
||||||
)
|
)
|
||||||
|
|
||||||
|
training_arguments_kwargs["sequence_parallel_degree"] = (
|
||||||
|
self.cfg.sequence_parallel_degree
|
||||||
|
)
|
||||||
|
training_arguments_kwargs["ring_attn_func"] = self.cfg.ring_attn_func
|
||||||
|
|
||||||
if self.cfg.reward_model:
|
if self.cfg.reward_model:
|
||||||
training_args_cls = AxolotlRewardConfig
|
training_args_cls = AxolotlRewardConfig
|
||||||
elif self.cfg.process_reward_model:
|
elif self.cfg.process_reward_model:
|
||||||
@@ -818,15 +801,14 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
data_collator_kwargs = {
|
data_collator_kwargs = {
|
||||||
"padding": True, # True/"longest" is the default
|
"padding": True, # True/"longest" is the default
|
||||||
}
|
}
|
||||||
multiple = 64
|
|
||||||
if self.cfg.pad_to_sequence_len:
|
if self.cfg.pad_to_sequence_len:
|
||||||
data_collator_kwargs["pad_to_multiple_of"] = multiple * math.ceil(
|
data_collator_kwargs["pad_to_multiple_of"] = 64 * math.ceil(
|
||||||
self.cfg.sequence_len / multiple
|
self.cfg.sequence_len / 64
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# A100 is best at 64, while others at 8. Let's use the larger so we don't have to check
|
# A100 is best at 64, while others at 8. Let's use the larger so we don't have to check
|
||||||
# https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html
|
# https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html
|
||||||
data_collator_kwargs["pad_to_multiple_of"] = multiple
|
data_collator_kwargs["pad_to_multiple_of"] = 64
|
||||||
|
|
||||||
if self.cfg.reward_model:
|
if self.cfg.reward_model:
|
||||||
data_collator_kwargs["max_length"] = self.cfg.sequence_len
|
data_collator_kwargs["max_length"] = self.cfg.sequence_len
|
||||||
@@ -1032,10 +1014,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
|
|||||||
training_args_kwargs["dataloader_prefetch_factor"] = (
|
training_args_kwargs["dataloader_prefetch_factor"] = (
|
||||||
self.cfg.dataloader_prefetch_factor
|
self.cfg.dataloader_prefetch_factor
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.cfg.seed is not None:
|
|
||||||
training_args_kwargs["seed"] = self.cfg.seed
|
|
||||||
|
|
||||||
if self.cfg.gradient_checkpointing:
|
if self.cfg.gradient_checkpointing:
|
||||||
training_args_kwargs["gradient_checkpointing"] = (
|
training_args_kwargs["gradient_checkpointing"] = (
|
||||||
self.cfg.gradient_checkpointing
|
self.cfg.gradient_checkpointing
|
||||||
@@ -1059,8 +1037,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
|
|||||||
# default to saving each epoch if not defined
|
# default to saving each epoch if not defined
|
||||||
training_args_kwargs["save_strategy"] = "epoch"
|
training_args_kwargs["save_strategy"] = "epoch"
|
||||||
|
|
||||||
training_args_kwargs["save_only_model"] = self.cfg.save_only_model
|
|
||||||
|
|
||||||
if self.cfg.dataset_processes:
|
if self.cfg.dataset_processes:
|
||||||
training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
|
training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
|
||||||
|
|
||||||
@@ -1080,7 +1056,7 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
|
|||||||
|
|
||||||
training_args_cls = None
|
training_args_cls = None
|
||||||
blocklist_args_kwargs = []
|
blocklist_args_kwargs = []
|
||||||
if self.cfg.rl is RLType.SIMPO:
|
if self.cfg.rl == "simpo":
|
||||||
training_args_cls = AxolotlCPOConfig
|
training_args_cls = AxolotlCPOConfig
|
||||||
training_args_kwargs["loss_type"] = "simpo"
|
training_args_kwargs["loss_type"] = "simpo"
|
||||||
training_args_kwargs["max_length"] = self.cfg.sequence_len
|
training_args_kwargs["max_length"] = self.cfg.sequence_len
|
||||||
@@ -1088,13 +1064,13 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
|
|||||||
if self.cfg.cpo_alpha is not None:
|
if self.cfg.cpo_alpha is not None:
|
||||||
training_args_kwargs["cpo_alpha"] = self.cfg.cpo_alpha
|
training_args_kwargs["cpo_alpha"] = self.cfg.cpo_alpha
|
||||||
|
|
||||||
elif self.cfg.rl is RLType.ORPO:
|
elif self.cfg.rl == "orpo":
|
||||||
training_args_cls = AxolotlORPOConfig
|
training_args_cls = AxolotlORPOConfig
|
||||||
training_args_kwargs["max_length"] = self.cfg.sequence_len
|
training_args_kwargs["max_length"] = self.cfg.sequence_len
|
||||||
if self.cfg.max_prompt_len:
|
if self.cfg.max_prompt_len:
|
||||||
training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len
|
training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len
|
||||||
|
|
||||||
elif self.cfg.rl is RLType.KTO:
|
elif self.cfg.rl == "kto":
|
||||||
training_args_cls = AxolotlKTOConfig
|
training_args_cls = AxolotlKTOConfig
|
||||||
|
|
||||||
training_args_kwargs["desirable_weight"] = (
|
training_args_kwargs["desirable_weight"] = (
|
||||||
@@ -1108,14 +1084,14 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
|
|||||||
if self.cfg.max_prompt_len:
|
if self.cfg.max_prompt_len:
|
||||||
training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len
|
training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len
|
||||||
|
|
||||||
elif self.cfg.rl is RLType.GRPO:
|
elif self.cfg.rl == "grpo":
|
||||||
training_args_cls = GRPOStrategy.get_training_args_class()
|
training_args_cls = GRPOStrategy.get_training_args_class()
|
||||||
training_args_kwargs.update(GRPOStrategy.set_training_args_kwargs(self.cfg))
|
training_args_kwargs.update(GRPOStrategy.set_training_args_kwargs(self.cfg))
|
||||||
blocklist_args_kwargs = GRPOStrategy.get_blocklist_args_kwargs()
|
blocklist_args_kwargs = GRPOStrategy.get_blocklist_args_kwargs()
|
||||||
|
|
||||||
else:
|
else:
|
||||||
training_args_cls = AxolotlDPOConfig
|
training_args_cls = AxolotlDPOConfig
|
||||||
if self.cfg.rl is RLType.IPO:
|
if self.cfg.rl == "ipo":
|
||||||
training_args_kwargs["loss_type"] = "ipo"
|
training_args_kwargs["loss_type"] = "ipo"
|
||||||
training_args_kwargs["max_length"] = self.cfg.sequence_len
|
training_args_kwargs["max_length"] = self.cfg.sequence_len
|
||||||
training_args_kwargs["max_completion_length"] = None
|
training_args_kwargs["max_completion_length"] = None
|
||||||
@@ -1158,76 +1134,67 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
|
|||||||
|
|
||||||
def build(self, total_num_steps):
|
def build(self, total_num_steps):
|
||||||
training_args = self.build_training_arguments(total_num_steps)
|
training_args = self.build_training_arguments(total_num_steps)
|
||||||
trainer_kwargs = {}
|
dpo_trainer_kwargs = {}
|
||||||
if self.cfg.rl is RLType.IPO:
|
if self.cfg.rl == "ipo":
|
||||||
if self.cfg.dpo_label_smoothing:
|
if self.cfg.dpo_label_smoothing:
|
||||||
trainer_kwargs["label_smoothing"] = self.cfg.dpo_label_smoothing
|
dpo_trainer_kwargs["label_smoothing"] = self.cfg.dpo_label_smoothing
|
||||||
if self.eval_dataset:
|
if self.eval_dataset:
|
||||||
trainer_kwargs["eval_dataset"] = self.eval_dataset
|
dpo_trainer_kwargs["eval_dataset"] = self.eval_dataset
|
||||||
if self.cfg.adapter and self.peft_config:
|
if self.cfg.adapter and self.peft_config:
|
||||||
if self.cfg.rl is not RLType.GRPO:
|
dpo_trainer_kwargs["peft_config"] = self.peft_config
|
||||||
trainer_kwargs["peft_config"] = self.peft_config
|
|
||||||
if self.cfg.precompute_ref_log_probs is not None:
|
if self.cfg.precompute_ref_log_probs is not None:
|
||||||
trainer_kwargs["precompute_ref_log_probs"] = (
|
dpo_trainer_kwargs["precompute_ref_log_probs"] = (
|
||||||
self.cfg.precompute_ref_log_probs
|
self.cfg.precompute_ref_log_probs
|
||||||
)
|
)
|
||||||
if self.cfg.rl is RLType.GRPO:
|
if self.cfg.rl == "grpo":
|
||||||
trainer_cls = GRPOStrategy.get_trainer_class(
|
trainer_cls = GRPOStrategy.get_trainer_class()
|
||||||
sequence_parallel=self.cfg.sequence_parallel_degree > 1
|
|
||||||
)
|
|
||||||
trainer_cls_args = [self.model]
|
trainer_cls_args = [self.model]
|
||||||
trainer_cls_args.extend(GRPOStrategy.set_trainer_args(self.cfg))
|
trainer_cls_args.extend(GRPOStrategy.set_trainer_args(self.cfg))
|
||||||
trainer_kwargs.update(GRPOStrategy.set_trainer_kwargs(self.cfg))
|
dpo_trainer_kwargs.update(GRPOStrategy.set_trainer_kwargs(self.cfg))
|
||||||
elif self.cfg.rl in [RLType.DPO, RLType.IPO]:
|
elif self.cfg.rl in ["dpo", "ipo"]:
|
||||||
trainer_cls = DPOStrategy.get_trainer_class()
|
trainer_cls = DPOStrategy.get_trainer_class()
|
||||||
trainer_cls_args = [self.model, self.model_ref]
|
trainer_cls_args = [self.model, self.model_ref]
|
||||||
elif self.cfg.rl is RLType.ORPO:
|
elif self.cfg.rl == "orpo":
|
||||||
trainer_cls = AxolotlORPOTrainer
|
trainer_cls = AxolotlORPOTrainer
|
||||||
trainer_cls_args = [self.model]
|
trainer_cls_args = [self.model]
|
||||||
elif self.cfg.rl is RLType.KTO:
|
elif self.cfg.rl in ["kto"]:
|
||||||
trainer_cls = AxolotlKTOTrainer
|
trainer_cls = AxolotlKTOTrainer
|
||||||
trainer_cls_args = [self.model]
|
trainer_cls_args = [self.model]
|
||||||
elif self.cfg.rl is RLType.SIMPO:
|
elif self.cfg.rl in ["simpo"]:
|
||||||
trainer_cls = AxolotlCPOTrainer
|
trainer_cls = AxolotlCPOTrainer
|
||||||
trainer_cls_args = [self.model]
|
trainer_cls_args = [self.model]
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported RL: {self.cfg.rl}")
|
raise ValueError(f"Unsupported RL: {self.cfg.rl}")
|
||||||
|
|
||||||
if self.cfg.plugins:
|
|
||||||
plugin_manager = PluginManager.get_instance()
|
|
||||||
temp_trainer_cls = plugin_manager.get_trainer_cls(self.cfg)
|
|
||||||
if temp_trainer_cls is not None:
|
|
||||||
trainer_cls = temp_trainer_cls
|
|
||||||
|
|
||||||
sig = inspect.signature(trainer_cls)
|
sig = inspect.signature(trainer_cls)
|
||||||
if "tokenizer" in sig.parameters.keys():
|
if "tokenizer" in sig.parameters.keys():
|
||||||
trainer_kwargs["tokenizer"] = self.tokenizer
|
dpo_trainer_kwargs["tokenizer"] = self.tokenizer
|
||||||
else:
|
else:
|
||||||
trainer_kwargs["processing_class"] = self.tokenizer
|
dpo_trainer_kwargs["processing_class"] = self.tokenizer
|
||||||
|
|
||||||
if self.cfg.datasets is not None and (
|
if self.cfg.datasets is not None and (
|
||||||
trainer_cls is DPOStrategy.get_trainer_class()
|
trainer_cls is DPOStrategy.get_trainer_class()
|
||||||
):
|
):
|
||||||
trainer_kwargs["dataset_tags"] = [
|
dpo_trainer_kwargs["dataset_tags"] = [
|
||||||
d["path"] for d in self.cfg.datasets if not Path(d["path"]).is_dir()
|
d["path"] for d in self.cfg.datasets if not Path(d["path"]).is_dir()
|
||||||
]
|
]
|
||||||
trainer = trainer_cls(
|
dpo_trainer = trainer_cls(
|
||||||
*trainer_cls_args,
|
*trainer_cls_args,
|
||||||
args=training_args,
|
args=training_args,
|
||||||
train_dataset=self.train_dataset,
|
train_dataset=self.train_dataset,
|
||||||
callbacks=self.get_callbacks(),
|
callbacks=self.get_callbacks(),
|
||||||
**trainer_kwargs,
|
**dpo_trainer_kwargs,
|
||||||
)
|
)
|
||||||
if self.cfg.fsdp:
|
if self.cfg.fsdp:
|
||||||
ensure_dtype(trainer.model, dtype=self.cfg.torch_dtype)
|
ensure_dtype(dpo_trainer.model, dtype=self.cfg.torch_dtype)
|
||||||
if self.cfg.rl in [RLType.DPO, RLType.IPO] and trainer.ref_model:
|
if self.cfg.rl in ["dpo", "ipo"] and dpo_trainer.ref_model:
|
||||||
ensure_dtype(trainer.ref_model, dtype=self.cfg.torch_dtype)
|
ensure_dtype(dpo_trainer.ref_model, dtype=self.cfg.torch_dtype)
|
||||||
|
|
||||||
trainer = self.hook_post_create_trainer(trainer)
|
dpo_trainer = self.hook_post_create_trainer(dpo_trainer)
|
||||||
for callback in self.get_post_trainer_create_callbacks(trainer):
|
for callback in self.get_post_trainer_create_callbacks(dpo_trainer):
|
||||||
trainer.add_callback(callback)
|
dpo_trainer.add_callback(callback)
|
||||||
|
|
||||||
return trainer
|
return dpo_trainer
|
||||||
|
|
||||||
|
|
||||||
class HFPPOTrainerBuilder(TrainerBuilderBase):
|
class HFPPOTrainerBuilder(TrainerBuilderBase):
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
from .base import AxolotlTrainer
|
from .base import AxolotlTrainer
|
||||||
from .dpo.trainer import AxolotlDPOTrainer
|
from .dpo.trainer import AxolotlDPOTrainer
|
||||||
from .grpo.trainer import AxolotlGRPOSequenceParallelTrainer, AxolotlGRPOTrainer
|
from .grpo.trainer import AxolotlGRPOTrainer
|
||||||
from .mamba import AxolotlMambaTrainer
|
from .mamba import AxolotlMambaTrainer
|
||||||
from .relora import ReLoRATrainer
|
from .relora import ReLoRATrainer
|
||||||
from .trl import (
|
from .trl import (
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ from axolotl.core.trainers.mixins import (
|
|||||||
OptimizerMixin,
|
OptimizerMixin,
|
||||||
RngLoaderMixin,
|
RngLoaderMixin,
|
||||||
SchedulerMixin,
|
SchedulerMixin,
|
||||||
|
SequenceParallelMixin,
|
||||||
)
|
)
|
||||||
from axolotl.core.trainers.utils import (
|
from axolotl.core.trainers.utils import (
|
||||||
sanitize_kwargs_for_ds_tagging,
|
sanitize_kwargs_for_ds_tagging,
|
||||||
@@ -39,7 +40,9 @@ from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
|
|||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
|
class AxolotlTrainer(
|
||||||
|
SchedulerMixin, OptimizerMixin, RngLoaderMixin, SequenceParallelMixin, Trainer
|
||||||
|
):
|
||||||
"""Extend the base Trainer for axolotl helpers"""
|
"""Extend the base Trainer for axolotl helpers"""
|
||||||
|
|
||||||
args = None # type: "AxolotlTrainingArguments" # type: ignore[name-defined]
|
args = None # type: "AxolotlTrainingArguments" # type: ignore[name-defined]
|
||||||
@@ -65,6 +68,10 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
|
|||||||
if self.args.orpo_alpha:
|
if self.args.orpo_alpha:
|
||||||
self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
|
self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
|
||||||
|
|
||||||
|
# Initialize sequence parallelism if enabled
|
||||||
|
if self.args.sequence_parallel_degree > 1:
|
||||||
|
self._setup_sequence_parallel()
|
||||||
|
|
||||||
def _wrap_model(self, model, training=True, dataloader=None):
|
def _wrap_model(self, model, training=True, dataloader=None):
|
||||||
if self.args.torch_compile:
|
if self.args.torch_compile:
|
||||||
torch._dynamo.config.accumulated_cache_size_limit = ( # pylint: disable=protected-access
|
torch._dynamo.config.accumulated_cache_size_limit = ( # pylint: disable=protected-access
|
||||||
@@ -107,16 +114,14 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
|
|||||||
packing_efficiency_estimate=self.args.sample_packing_efficiency,
|
packing_efficiency_estimate=self.args.sample_packing_efficiency,
|
||||||
batch_max_len=batch_max_len,
|
batch_max_len=batch_max_len,
|
||||||
batch_size=batch_size,
|
batch_size=batch_size,
|
||||||
group_size=self.args.sample_packing_group_size,
|
|
||||||
bin_size=self.args.sample_packing_bin_size,
|
|
||||||
sequential=self.args.sample_packing_sequentially,
|
sequential=self.args.sample_packing_sequentially,
|
||||||
drop_last=True,
|
drop_last=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_train_sampler(self) -> Sampler | None:
|
def _get_train_sampler(self) -> Sampler | None:
|
||||||
"""
|
"""
|
||||||
Helper method to get the sampler for training. Handles cases for sample packing
|
Helper method to get the sampler for training. Handles cases for sequence
|
||||||
and curriculum sampling (sequential).
|
parallelism, sample packing, and curriculum sampling (sequential).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
If the dataset is non-empty, a sampler is returned, the type of which
|
If the dataset is non-empty, a sampler is returned, the type of which
|
||||||
@@ -125,7 +130,9 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
|
|||||||
use_sample_packing = self.args.sample_packing and not self.args.pretraining
|
use_sample_packing = self.args.sample_packing and not self.args.pretraining
|
||||||
|
|
||||||
# Determine the base sampler first
|
# Determine the base sampler first
|
||||||
if self.args.curriculum_sampling:
|
if self.args.sequence_parallel_degree > 1:
|
||||||
|
base_sampler = self._sp_get_train_sampler(self.train_dataset)
|
||||||
|
elif self.args.curriculum_sampling:
|
||||||
base_sampler = SequentialSampler(self.train_dataset)
|
base_sampler = SequentialSampler(self.train_dataset)
|
||||||
elif use_sample_packing:
|
elif use_sample_packing:
|
||||||
base_sampler = RandomSampler(self.train_dataset)
|
base_sampler = RandomSampler(self.train_dataset)
|
||||||
@@ -144,7 +151,8 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
|
|||||||
|
|
||||||
def _get_eval_sampler(self, eval_dataset: Dataset | None = None) -> Sampler | None:
|
def _get_eval_sampler(self, eval_dataset: Dataset | None = None) -> Sampler | None:
|
||||||
"""
|
"""
|
||||||
Helper method to get the sampler for evaluation. Handles sample packing case.
|
Helper method to get the sampler for evaluation. Handles sequence parallelism
|
||||||
|
and sample packing cases.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
If the dataset is non-empty, a sampler is returned, the type of which
|
If the dataset is non-empty, a sampler is returned, the type of which
|
||||||
@@ -158,7 +166,9 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Determine the base sampler
|
# Determine the base sampler
|
||||||
if use_multipack:
|
if self.args.sequence_parallel_degree > 1:
|
||||||
|
base_sampler = self._sp_get_eval_sampler(eval_dataset)
|
||||||
|
elif use_multipack:
|
||||||
base_sampler = SequentialSampler(eval_dataset)
|
base_sampler = SequentialSampler(eval_dataset)
|
||||||
else:
|
else:
|
||||||
return super()._get_eval_sampler(eval_dataset)
|
return super()._get_eval_sampler(eval_dataset)
|
||||||
@@ -224,6 +234,14 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
|
|||||||
):
|
):
|
||||||
self.accelerator.even_batches = False
|
self.accelerator.even_batches = False
|
||||||
|
|
||||||
|
# Return unprepared dataloader if using sequence parallelism
|
||||||
|
# TODO(djsaunde): We might be able to use `accelerate`'s dataloader preparation
|
||||||
|
# if we use `dispatch_batches` and `slice_fn_for_dispatch` properly (i.e.,
|
||||||
|
# slice each batch along the sequence dimension).
|
||||||
|
if self.args.sequence_parallel_degree > 1:
|
||||||
|
return dataloader
|
||||||
|
|
||||||
|
# Otherwise prepare with accelerator
|
||||||
return self.accelerator.prepare_data_loader(dataloader)
|
return self.accelerator.prepare_data_loader(dataloader)
|
||||||
|
|
||||||
def get_train_dataloader(self) -> DataLoader:
|
def get_train_dataloader(self) -> DataLoader:
|
||||||
@@ -267,7 +285,12 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
|
|||||||
|
|
||||||
return dataloader
|
return dataloader
|
||||||
|
|
||||||
if self.args.sample_packing and self.args.eval_sample_packing is not False:
|
# Handle sample packing or sequence parallelism
|
||||||
|
if (
|
||||||
|
self.args.sample_packing
|
||||||
|
and self.args.eval_sample_packing is not False
|
||||||
|
or self.args.sequence_parallel_degree > 1
|
||||||
|
):
|
||||||
# Get appropriate data collator
|
# Get appropriate data collator
|
||||||
self.data_collator = ( # pylint: disable=attribute-defined-outside-init
|
self.data_collator = ( # pylint: disable=attribute-defined-outside-init
|
||||||
self.eval_data_collator
|
self.eval_data_collator
|
||||||
@@ -277,6 +300,17 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
|
|||||||
if "length" in eval_dataset.column_names:
|
if "length" in eval_dataset.column_names:
|
||||||
eval_dataset = eval_dataset.remove_columns(["length"])
|
eval_dataset = eval_dataset.remove_columns(["length"])
|
||||||
|
|
||||||
|
# Handle dataset preprocessing for SP
|
||||||
|
if self.args.sequence_parallel_degree > 1:
|
||||||
|
if isinstance(eval_dataset, datasets.Dataset):
|
||||||
|
eval_dataset = self._remove_unused_columns(
|
||||||
|
eval_dataset, description="evaluation"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.data_collator = self._get_collator_with_removed_columns( # pylint: disable=attribute-defined-outside-init
|
||||||
|
self.data_collator, description="evaluation"
|
||||||
|
)
|
||||||
|
|
||||||
# Use eval_batch_size for sample packing, per_device_eval_batch_size otherwise
|
# Use eval_batch_size for sample packing, per_device_eval_batch_size otherwise
|
||||||
batch_size = (
|
batch_size = (
|
||||||
self.args.eval_batch_size
|
self.args.eval_batch_size
|
||||||
@@ -337,13 +371,15 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
|
|||||||
num_items_in_batch=num_items_in_batch,
|
num_items_in_batch=num_items_in_batch,
|
||||||
)
|
)
|
||||||
|
|
||||||
return super().compute_loss(
|
loss = super().compute_loss(
|
||||||
model,
|
model,
|
||||||
inputs,
|
inputs,
|
||||||
return_outputs=return_outputs,
|
return_outputs=return_outputs,
|
||||||
num_items_in_batch=num_items_in_batch,
|
num_items_in_batch=num_items_in_batch,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
return loss
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def orpo_concatenate_inputs(inputs, label_pad_token=-100, pad_token=0, device=None):
|
def orpo_concatenate_inputs(inputs, label_pad_token=-100, pad_token=0, device=None):
|
||||||
concatenated_batch = {}
|
concatenated_batch = {}
|
||||||
|
|||||||
@@ -1,11 +1,14 @@
|
|||||||
"""DPO Specific Strategy for training"""
|
"""
|
||||||
|
DPO Specific Strategy for training
|
||||||
|
"""
|
||||||
|
|
||||||
from axolotl.core.trainers.dpo.trainer import AxolotlDPOTrainer
|
from axolotl.core.trainers.dpo.trainer import AxolotlDPOTrainer
|
||||||
from axolotl.utils.schemas.enums import RLType
|
|
||||||
|
|
||||||
|
|
||||||
class DPOStrategy:
|
class DPOStrategy:
|
||||||
"""Strategy for DPO training"""
|
"""
|
||||||
|
Strategy for DPO training
|
||||||
|
"""
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_trainer_class(cls):
|
def get_trainer_class(cls):
|
||||||
@@ -20,7 +23,7 @@ class DPOStrategy:
|
|||||||
@classmethod
|
@classmethod
|
||||||
def set_training_args_kwargs(cls, cfg):
|
def set_training_args_kwargs(cls, cfg):
|
||||||
training_args_kwargs = {}
|
training_args_kwargs = {}
|
||||||
if cfg.rl is RLType.IPO:
|
if cfg.rl == "ipo":
|
||||||
training_args_kwargs["loss_type"] = "ipo"
|
training_args_kwargs["loss_type"] = "ipo"
|
||||||
training_args_kwargs["max_length"] = cfg.sequence_len
|
training_args_kwargs["max_length"] = cfg.sequence_len
|
||||||
training_args_kwargs["max_completion_length"] = None
|
training_args_kwargs["max_completion_length"] = None
|
||||||
|
|||||||
@@ -1,15 +1,31 @@
|
|||||||
"""DPO trainer for axolotl"""
|
"""
|
||||||
|
DPO trainer for axolotl
|
||||||
|
"""
|
||||||
|
|
||||||
import gc
|
import gc
|
||||||
|
import random
|
||||||
from functools import wraps
|
from functools import wraps
|
||||||
from typing import Any, Dict, Union
|
from typing import Any, Dict, Optional, Union
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
import torch
|
import torch
|
||||||
|
import wandb
|
||||||
|
+from accelerate import PartialState
+from datasets import Dataset, IterableDataset
 from peft.optimizers import create_loraplus_optimizer
 from torch import nn
-from transformers import Trainer
+from torch.utils.data import DataLoader
+from transformers import (
+    BaseImageProcessor,
+    FeatureExtractionMixin,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    Trainer,
+)
+from transformers.trainer_utils import EvalLoopOutput
 from transformers.utils import is_sagemaker_mp_enabled
-from trl import DPOTrainer
+from trl import DPOConfig, DPOTrainer, maybe_apply_chat_template, maybe_extract_prompt
+from trl.trainer.utils import log_table_to_comet_experiment

 from axolotl.core.trainers.mixins import RngLoaderMixin, SchedulerMixin
 from axolotl.core.trainers.utils import (

@@ -22,7 +38,9 @@ if is_sagemaker_mp_enabled():


 class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):
-    """Extend the base DPOTrainer for axolotl helpers."""
+    """
+    Extend the base DPOTrainer for axolotl helpers
+    """

     tag_names = ["axolotl", "dpo"]


@@ -67,9 +85,8 @@ class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):
     @wraps(DPOTrainer.push_to_hub)
     def push_to_hub(self, *args, **kwargs) -> str:
         """
-        Overwrite the `push_to_hub` method in order to force-add the tags when pushing
-        the model on the Hub. Please refer to `~transformers.Trainer.push_to_hub`
-        for more details.
+        Overwrite the `push_to_hub` method in order to force-add the tags when pushing the
+        model on the Hub. Please refer to `~transformers.Trainer.push_to_hub` for more details.
         """
         kwargs = sanitize_kwargs_for_ds_tagging(
             dataset_tags=self.dataset_tags, kwargs=kwargs

@@ -78,6 +95,64 @@ class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):

         return super().push_to_hub(*args, **kwargs)

+    # TODO: remove this once https://github.com/huggingface/trl/pull/3377 is in a release
+    def _prepare_dataset(
+        self,
+        dataset: Union[Dataset, IterableDataset],
+        processing_class: Union[
+            PreTrainedTokenizerBase,
+            BaseImageProcessor,
+            FeatureExtractionMixin,
+            ProcessorMixin,
+        ],
+        args: DPOConfig,
+        dataset_name: str,
+    ) -> Union[Dataset, IterableDataset]:
+        # Build the kwargs for the `map` function
+        map_kwargs: Dict[str, Any] = {"writer_batch_size": 10}
+        if isinstance(dataset, Dataset):  # IterableDataset does not support num_proc
+            map_kwargs["num_proc"] = args.dataset_num_proc
+
+        with PartialState().main_process_first():
+            # Extract prompt if needed
+            if isinstance(
+                dataset, Dataset
+            ):  # `IterableDataset.map` does not support `desc`
+                map_kwargs["desc"] = f"Extracting prompt in {dataset_name} dataset"
+            dataset = dataset.map(maybe_extract_prompt, **map_kwargs)
+
+            # Apply the chat template if needed
+            if isinstance(
+                dataset, Dataset
+            ):  # `IterableDataset.map` does not support `desc`
+                map_kwargs["desc"] = f"Applying chat template to {dataset_name} dataset"
+            dataset = dataset.map(
+                maybe_apply_chat_template,
+                fn_kwargs={"tokenizer": processing_class, "tools": args.tools},
+                **map_kwargs,
+            )
+
+            # Tokenize the dataset
+            if isinstance(
+                dataset, Dataset
+            ):  # `IterableDataset.map` does not support `desc`
+                map_kwargs["desc"] = f"Tokenizing {dataset_name} dataset"
+
+            dataset = dataset.map(
+                self.tokenize_row if not self.is_vision_model else self.process_row,
+                remove_columns=["chosen", "rejected"],
+                fn_kwargs={
+                    "processing_class": processing_class,
+                    "max_prompt_length": args.max_prompt_length,
+                    "max_completion_length": args.max_completion_length,
+                    # for enc-dec, we add the special tokens ([bos_token] + prompt + [eos_token]; completion + [eos_token])
+                    "add_special_tokens": False,
+                },
+                **map_kwargs,
+            )
+
+        return dataset
+
     @staticmethod
     def tokenize_row(
         features,

@@ -102,8 +177,12 @@ class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):
         # dpo trainer may incorrectly prepend the bos_token_id to the dpo outputs
         if res["chosen_input_ids"][0] == processing_class.bos_token_id:
             res["chosen_input_ids"] = res["chosen_input_ids"][1:]
+            res["chosen_labels"] = res["chosen_labels"][1:]
+            res["chosen_attention_mask"] = res["chosen_attention_mask"][1:]
         if res["rejected_input_ids"][0] == processing_class.bos_token_id:
             res["rejected_input_ids"] = res["rejected_input_ids"][1:]
+            res["rejected_labels"] = res["rejected_labels"][1:]
+            res["rejected_attention_mask"] = res["rejected_attention_mask"][1:]

         return res
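The hunk above extends the existing BOS workaround: when the tokenizer has already prepended `bos_token_id`, the first position is now dropped not only from the input ids but also from the labels and attention mask, so the three sequences stay aligned. A minimal standalone sketch of that invariant (hypothetical helper, not part of the diff):

    def strip_leading_bos(example: dict, bos_token_id: int, prefix: str) -> dict:
        """Drop a duplicated BOS token from ids, labels, and attention mask together."""
        ids_key = f"{prefix}_input_ids"
        if example[ids_key] and example[ids_key][0] == bos_token_id:
            for field in ("input_ids", "labels", "attention_mask"):
                key = f"{prefix}_{field}"
                if key in example:
                    example[key] = example[key][1:]
        return example

    sample = {
        "chosen_input_ids": [1, 42, 43],
        "chosen_labels": [-100, 42, 43],
        "chosen_attention_mask": [1, 1, 1],
    }
    assert strip_leading_bos(sample, bos_token_id=1, prefix="chosen")["chosen_input_ids"] == [42, 43]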
@@ -117,3 +196,67 @@ class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):
         gc.collect()
         torch.cuda.empty_cache()
         return loss

+    # TODO: remove this once https://github.com/huggingface/trl/pull/3377 is in a release
+    def evaluation_loop(
+        self,
+        dataloader: DataLoader,
+        description: str,
+        prediction_loss_only: Optional[bool] = None,
+        ignore_keys: Optional[list[str]] = None,
+        metric_key_prefix: str = "eval",
+    ) -> EvalLoopOutput:
+        """
+        Overriding built-in evaluation loop to store metrics for each batch.
+        Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`.
+
+        Works both with or without labels.
+        """
+
+        # Sample and save to game log if requested (for one batch to save time)
+        if self.generate_during_eval:
+            # Generate random indices within the range of the total number of samples
+            num_samples = len(dataloader.dataset)
+            random_indices = random.sample(
+                range(num_samples), k=self.args.eval_batch_size
+            )
+
+            # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader
+            random_batch_dataset = dataloader.dataset.select(random_indices)
+            random_batch = self.data_collator(random_batch_dataset)
+            random_batch = self._prepare_inputs(random_batch)
+
+            policy_output_decoded, ref_output_decoded = (
+                self.generate_from_model_and_ref(self.model, random_batch)
+            )
+
+            table = pd.DataFrame(
+                columns=["Prompt", "Policy", "Ref Model"],
+                data=[
+                    [prompt, pol[len(prompt) :], ref[len(prompt) :]]
+                    for prompt, pol, ref in zip(
+                        random_batch_dataset["prompt"],
+                        policy_output_decoded,
+                        ref_output_decoded,
+                    )
+                ],
+            )
+            if "wandb" in self.args.report_to and self.accelerator.is_main_process:
+                wandb.log({"game_log": wandb.Table(data=table)})
+
+            if "comet_ml" in self.args.report_to:
+                log_table_to_comet_experiment(
+                    name="game_log.csv",
+                    table=table,
+                )
+
+        # Base evaluation
+        initial_output = super().evaluation_loop(
+            dataloader,
+            description,
+            prediction_loss_only,
+            ignore_keys,
+            metric_key_prefix,
+        )
+
+        return initial_output
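The added `evaluation_loop` logs a sampled "game log": decoded policy and reference outputs are sliced with `pol[len(prompt):]` so only the generated continuation is stored next to each prompt. A small self-contained sketch of that table construction (made-up strings, no trainer state involved):

    import pandas as pd

    prompts = ["Q: 2+2?\nA:"]
    policy_out = ["Q: 2+2?\nA: 4"]
    ref_out = ["Q: 2+2?\nA: 5"]

    table = pd.DataFrame(
        columns=["Prompt", "Policy", "Ref Model"],
        data=[
            [p, pol[len(p):], ref[len(p):]]
            for p, pol, ref in zip(prompts, policy_out, ref_out)
        ],
    )
    print(table)  # Policy column holds " 4", Ref Model column holds " 5"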
@@ -1,41 +1,37 @@
-"""GRPO Specific Strategy for training"""
+"""
+GRPO Specific Strategy for training
+"""

 import importlib
 import inspect
 import logging
-from typing import Any

 from trl.trainer.grpo_trainer import RewardFunc

-from axolotl.core.trainers.grpo.args import AxolotlGRPOConfig
-from axolotl.core.trainers.grpo.trainer import (
-    AxolotlGRPOSequenceParallelTrainer,
-    AxolotlGRPOTrainer,
-)
-from axolotl.utils.dict import DictDefault
+from axolotl.core.trainers.grpo.trainer import AxolotlGRPOTrainer
 from axolotl.utils.schemas.trl import TRLConfig

-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("axolotl")


 class GRPOStrategy:
-    """Strategy for GRPO training"""
+    """
+    Strategy for GRPO training
+    """

     @classmethod
-    def get_trainer_class(
-        cls, sequence_parallel: bool
-    ) -> type[AxolotlGRPOTrainer] | type[AxolotlGRPOSequenceParallelTrainer]:
-        if sequence_parallel:
-            return AxolotlGRPOSequenceParallelTrainer
+    def get_trainer_class(cls):
         return AxolotlGRPOTrainer

     @classmethod
-    def get_training_args_class(cls) -> type[AxolotlGRPOConfig]:
+    def get_training_args_class(cls):
+        from axolotl.core.trainers.grpo.args import AxolotlGRPOConfig
+
         return AxolotlGRPOConfig

     @classmethod
-    def set_training_args_kwargs(cls, cfg: DictDefault) -> dict[str, Any]:
-        grpo_args_kwargs: dict[str, Any] = {}
+    def set_training_args_kwargs(cls, cfg):
+        grpo_args_kwargs = {}

         if not hasattr(cfg, "trl") or not cfg.trl:
             return grpo_args_kwargs

@@ -44,8 +40,8 @@ class GRPOStrategy:

         if trl.use_vllm:
             grpo_args_kwargs["use_vllm"] = trl.use_vllm
-            grpo_args_kwargs["vllm_server_host"] = trl.vllm_server_host or trl.vllm.host  # type: ignore[attr-defined]
-            grpo_args_kwargs["vllm_server_port"] = trl.vllm_server_port or trl.vllm.port  # type: ignore[attr-defined]
+            grpo_args_kwargs["vllm_server_host"] = trl.vllm_server_host or trl.vllm.host
+            grpo_args_kwargs["vllm_server_port"] = trl.vllm_server_port or trl.vllm.port
             if trl.vllm_server_timeout:
                 grpo_args_kwargs["vllm_server_timeout"] = trl.vllm_server_timeout
             if trl.vllm_guided_decoding_regex:

@@ -106,18 +102,17 @@ class GRPOStrategy:
         return grpo_args_kwargs

     @classmethod
-    def set_trainer_args(cls, cfg: DictDefault) -> list[Any]:
+    def set_trainer_args(cls, cfg):
         trainer_args = []
         if cfg.trl and cfg.trl.reward_funcs:
             reward_funcs = []
             for reward_func_fqn in cfg.trl.reward_funcs:
                 reward_funcs.append(cls.get_reward_func(reward_func_fqn))
             trainer_args.append(reward_funcs)

         return trainer_args

     @classmethod
-    def set_trainer_kwargs(cls, cfg: DictDefault) -> dict[str, Any]:
+    def set_trainer_kwargs(cls, cfg):
         trainer_kwargs = {}
         if cfg.trl and cfg.trl.reward_processing_classes:
             trainer_kwargs["reward_processing_classes"] = (

@@ -131,7 +126,7 @@ class GRPOStrategy:
         return None

     @classmethod
-    def get_blocklist_args_kwargs(cls) -> list[str]:
+    def get_blocklist_args_kwargs(cls):
         return ["dataset_num_proc"]

     @classmethod

@@ -142,13 +137,13 @@ class GRPOStrategy:
         Args:
             reward_func_fqn (str): Fully qualified name of the reward function (e.g. r1_grpo.gsm8k_transform),
                 or a HF hub path to the reward model.
+
+        Raises:
+            ValueError: If the reward function does not accept at least two arguments.

         Returns:
             RewardFunc: A callable that accepts prompts and completions and returns rewards,
                 or a path to a reward model.
-
-        Raises:
-            ValueError: If the reward function does not accept at least two arguments.
         """
         try:
             # use importlib to dynamically load the reward function from the module
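`get_reward_func` resolves entries like `r1_grpo.gsm8k_transform` by importing the module with `importlib` and checking that the callable accepts at least two arguments (prompts and completions). A rough standalone sketch of that resolution step, using the stdlib `fnmatch.fnmatch` purely as a stand-in for a user reward module:

    import importlib
    import inspect

    def load_reward_func(fqn: str):
        """Load `module.attr` and require a callable with at least two parameters."""
        module_name, _, attr = fqn.rpartition(".")
        func = getattr(importlib.import_module(module_name), attr)
        if len(inspect.signature(func).parameters) < 2:
            raise ValueError(f"{fqn} must accept at least two arguments")
        return func

    match = load_reward_func("fnmatch.fnmatch")
    assert match("gsm8k_reward.py", "*.py")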
@@ -11,4 +11,6 @@ from axolotl.core.training_args import AxolotlTrainingMixins

 @dataclass
 class AxolotlGRPOConfig(AxolotlTrainingMixins, GRPOConfig):
-    """Axolotl GRPO Config for GRPO training"""
+    """
+    Axolotl GRPO Config for GRPO training
+    """
@@ -1,172 +0,0 @@
"""Repeat random sampler (similar to the one implemented in
https://github.com/huggingface/trl/blob/main/trl/trainer/grpo_trainer.py) that adds
sequence parallelism functionality; i.e., duplicating data across ranks in the same
sequence parallel group.
"""

from typing import Iterator, Sized

import torch
from torch.utils.data import Sampler


class SequenceParallelRepeatRandomSampler(Sampler):
    """Sampler for GRPO training with sequence parallelism.

    This sampler ensures:
    - Ranks in the same sequence parallel (SP) group receive identical data.
    - Each index is repeated multiple times for sampling different completions.
    - Entire batches are repeated for reuse in multiple updates.
    - Data is properly distributed across SP groups.

    In the table below, the values represent dataset indices. Each SP group has
    `sequence_parallel_degree = 2` GPUs working together on the same data. There are 2
    SP groups (SP0 and SP1), with `world_size = 4` total GPUs.

                                    Sequence Parallel Groups
                                 |       SP0       |       SP1       |
                                 | GPU 0  | GPU 1  | GPU 2  | GPU 3  |
                 global_step  step    <---> mini_repeat_count=3
                                      <----------> batch_size=2 per SP group
    grad_accum=2 ▲  ▲  0          0   [0 0 0 1 1 1]   [2 2 2 3 3 3]  <- SP groups get different data
                 ▼  |  0          1   [0 0 0 1 1 1]   [2 2 2 3 3 3]  <- Same data for each SP group GPU
                    |
                    |  1          2   [0 0 0 1 1 1]   [2 2 2 3 3 3]  <- Repeat same indices for iterations
    num_iterations=2 ▼ 1          3   [0 0 0 1 1 1]   [2 2 2 3 3 3]  <- When using gradient accumulation

                       2          4   [4 4 4 5 5 5]   [6 6 6 7 7 7]  <- New batch of data indices
                       2          5   [4 4 4 5 5 5]   [6 6 6 7 7 7]
                       ...

    Args:
        dataset: Dataset to sample from.
        mini_repeat_count: How many times to repeat each sample immediately.
        world_size: Total number of processes.
        rank: Rank of current process.
        batch_size: Number of samples per batch.
        repeat_count: How many times to repeat the full sampling process.
        sequence_parallel_degree: Number of ranks in a sequence parallel group.
        shuffle: Whether to shuffle the dataset.
        seed: Random seed for shuffling.
        drop_last: Whether to drop the last incomplete batch.
    """

    def __init__(
        self,
        dataset: Sized,
        mini_repeat_count: int,
        world_size: int,
        rank: int,
        batch_size: int = 1,
        repeat_count: int = 1,
        sequence_parallel_degree: int = 1,
        shuffle: bool = True,
        seed: int = 0,
        drop_last: bool = False,
    ):
        self.dataset = dataset
        self.mini_repeat_count = mini_repeat_count
        self.batch_size = batch_size
        self.repeat_count = repeat_count
        self.shuffle = shuffle
        self.seed = seed
        self.drop_last = drop_last
        self.epoch = 0

        self.world_size = world_size
        self.rank = rank

        # Sequence parallelism parameters
        self.sequence_parallel_degree = sequence_parallel_degree
        self.num_sp_groups = world_size // sequence_parallel_degree
        self.sp_group_id = rank // sequence_parallel_degree

        # Adjust dataset size for distributed sampling
        self.num_samples = len(self.dataset)
        self.total_size = self.num_samples

        # Calculate effective number of samples per SP group
        if (
            self.drop_last
            and self.total_size % (self.num_sp_groups * self.batch_size) != 0
        ):
            # Drop last incomplete batch if drop_last is True
            self.num_samples_per_sp_group = (
                self.total_size // self.batch_size // self.num_sp_groups
            ) * self.batch_size
        else:
            # Round up to include last batch if drop_last is False
            self.num_samples_per_sp_group = (
                (self.total_size + self.batch_size * self.num_sp_groups - 1)
                // (self.batch_size * self.num_sp_groups)
                * self.batch_size
            )

        if shuffle:
            self.generator = torch.Generator()
            self.generator.manual_seed(seed)

    def __iter__(self) -> Iterator[int]:
        """Creates iterator over dataset indices.

        Returns:
            Iterator that yields indices into the dataset.
        """
        # Deterministically shuffle based on epoch and seed
        if self.shuffle:
            indices = torch.randperm(
                self.num_samples, generator=self.generator
            ).tolist()
        else:
            indices = list(range(self.num_samples))

        # Add extra samples to make it evenly divisible by batch_size
        if len(indices) % self.batch_size != 0:
            padding = indices[: self.batch_size - len(indices) % self.batch_size]
            indices += padding

        # Subsample based on SP group ID
        # Each SP group gets distinct batches of data
        batch_indices = []
        for i in range(0, len(indices), self.batch_size * self.num_sp_groups):
            start_idx = i + self.sp_group_id * self.batch_size
            end_idx = min(start_idx + self.batch_size, len(indices))
            if start_idx < len(indices):
                for j in range(self.batch_size):
                    if start_idx + j < end_idx:
                        batch_indices.append(indices[start_idx + j])

        # Make sure batch_indices is exactly batch_size * num_batches_per_sp_group
        if self.drop_last:
            num_batches_per_sp_group = self.num_samples_per_sp_group // self.batch_size
            target_len = self.batch_size * num_batches_per_sp_group
            if len(batch_indices) > target_len:
                batch_indices = batch_indices[:target_len]

        # Apply the GRPO repeat pattern
        final_indices = []
        for _ in range(self.repeat_count):
            for idx in batch_indices:
                for _ in range(self.mini_repeat_count):
                    final_indices.append(idx)

        return iter(final_indices)

    def __len__(self) -> int:
        """Returns the total length of the iterable including repetitions.

        Returns:
            Total number of samples.
        """
        # Total length including all repetitions
        return (
            self.num_samples_per_sp_group * self.mini_repeat_count * self.repeat_count
        )

    def set_epoch(self, epoch: int) -> None:
        """Sets the epoch for this sampler.

        Args:
            epoch: Epoch number to use for shuffling.
        """
        self.epoch = epoch
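The docstring table of the removed sampler can be reproduced with plain Python: each SP group takes its own batch of dataset indices, every index is repeated `mini_repeat_count` times, and the whole batch is repeated `repeat_count` times. A minimal sketch of just that index pattern (no torch, no distributed state, shuffling and padding ignored):

    def sp_repeat_pattern(indices, sp_group_id, batch_size, mini_repeat_count, repeat_count):
        start = sp_group_id * batch_size
        batch = list(indices[start:start + batch_size])
        out = []
        for _ in range(repeat_count):
            for idx in batch:
                out.extend([idx] * mini_repeat_count)
        return out

    # 2 SP groups, batch_size=2, 3 generations per prompt, 2 repeats per batch
    assert sp_repeat_pattern(range(8), 0, 2, 3, 2) == [0, 0, 0, 1, 1, 1] * 2
    assert sp_repeat_pattern(range(8), 1, 2, 3, 2) == [2, 2, 2, 3, 3, 3] * 2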
@@ -1,653 +1,69 @@
-"""Axolotl GRPO trainers (with and without sequence parallelism handling)"""
-
-# pylint: disable=too-many-lines,duplicate-code,protected-access,no-member
-
-import warnings
-from typing import Any
-
-import datasets
-import torch
-import torch.distributed as dist
-import torch.utils.data
-from accelerate.utils import (
-    broadcast_object_list,
-    gather,
-    gather_object,
-    is_peft_available,
-)
-from datasets import Dataset, IterableDataset
-from torch import nn
-from torch.utils.data import (
-    BatchSampler,
-    DataLoader,
-    Sampler,
-)
-from transformers import (
-    PreTrainedModel,
-    PreTrainedTokenizerBase,
-    Trainer,
-    TrainerCallback,
-)
-from transformers.trainer_utils import seed_worker
-from trl import GRPOTrainer
-from trl.data_utils import (
-    apply_chat_template,
-    is_conversational,
-    maybe_apply_chat_template,
-)
-from trl.extras.profiling import profiling_context
-from trl.models import unwrap_model_for_generation
-from trl.trainer.grpo_config import GRPOConfig
-from trl.trainer.grpo_trainer import RewardFunc, nanstd
-from trl.trainer.utils import pad
-
-from axolotl.core.trainers.grpo.sampler import SequenceParallelRepeatRandomSampler
-from axolotl.core.trainers.mixins import RngLoaderMixin, SchedulerMixin
-from axolotl.monkeypatch.ring_attn import get_ring_attn_group
-
-if is_peft_available():
-    # pylint: disable=unused-import
-    from peft import PeftConfig
-
-
-class AxolotlGRPOTrainer(RngLoaderMixin, SchedulerMixin, GRPOTrainer):
-    """Extend the base GRPOTrainer for axolotl helpers"""
-
-    _tag_names = ["trl", "grpo", "axolotl"]
-
-
-class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
-    """Extend the base GRPOTrainer for sequence parallelism handling"""
-
-    def __init__(
-        self,
-        model: str | PreTrainedModel,
-        reward_funcs: RewardFunc | list[RewardFunc],
-        args: GRPOConfig | None = None,
-        train_dataset: Dataset | IterableDataset | None = None,
-        eval_dataset: (
-            Dataset | IterableDataset | dict[str, Dataset | IterableDataset] | None
-        ) = None,
-        processing_class: PreTrainedTokenizerBase | None = None,
-        reward_processing_classes: (
-            PreTrainedTokenizerBase | list[PreTrainedTokenizerBase] | None
-        ) = None,
-        callbacks: list[TrainerCallback] | None = None,
-        optimizers: tuple[
-            torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None
-        ] = (None, None),
-        peft_config: "PeftConfig | None" = None,
-    ):
-        # First call the superclass constructor with all arguments
-        super().__init__(
-            model=model,
-            reward_funcs=reward_funcs,
-            args=args,
-            train_dataset=train_dataset,
-            eval_dataset=eval_dataset,
-            processing_class=processing_class,
-            reward_processing_classes=reward_processing_classes,
-            callbacks=callbacks,
-            optimizers=optimizers,
-            peft_config=peft_config,
-        )
-
-        # Get number of SP groups (number of processes divided by SP degree)
-        num_processes = self.accelerator.num_processes
-        num_sp_groups = num_processes // self.args.sequence_parallel_degree
-
-        # Calculate batch size per SP group (not per process)
-        sp_group_batch_size = self.args.per_device_train_batch_size * num_sp_groups
-        possible_values = [
-            n_gen
-            for n_gen in range(2, sp_group_batch_size + 1)
-            if (sp_group_batch_size) % n_gen == 0
-        ]
-
-        if self.num_generations not in possible_values:
-            raise ValueError(
-                f"The batch size per SP group ({num_sp_groups} x "
-                f"{self.args.per_device_train_batch_size}) must be evenly divisible by "
-                f"the number of generations per prompt ({self.num_generations}). Given "
-                "the current configuration, the valid values for the number of "
-                f"generations are: {possible_values}."
-            )
-
-        if self.args.eval_strategy != "no":
-            # If sequence parallelism is enabled, calculate batch size per SP group
-            sp_group_eval_batch_size = args.per_device_eval_batch_size * num_sp_groups  # type: ignore[union-attr]
-            possible_values = [
-                n_gen
-                for n_gen in range(2, sp_group_eval_batch_size + 1)
-                if (sp_group_eval_batch_size) % n_gen == 0
-            ]
-
-            if self.num_generations not in possible_values:
-                raise ValueError(
-                    f"With sequence parallelism (degree {self.args.sequence_parallel_degree}), "
-                    f"the eval batch size per SP group ({num_sp_groups} x {self.args.per_device_eval_batch_size}) "
-                    f"must be evenly divisible by the number of generations per prompt "
-                    f"({self.num_generations}). Given the current eval batch size, "
-                    f"the valid values for the number of generations are: {possible_values}."
-                )
-
-        # Initialize the SP group
-        self.sp_group = get_ring_attn_group()
-        self.rank = dist.get_rank()
-        self.world_size = dist.get_world_size()
-        self.local_rank = dist.get_rank(group=self.sp_group)
-        self.local_world_size = dist.get_world_size(group=self.sp_group)
-
-    def _get_train_sampler(self) -> Sampler:
-        effective_batch_size = (
-            self.args.per_device_train_batch_size
-            * self.world_size
-            * self.args.gradient_accumulation_steps
-        )
-
-        return SequenceParallelRepeatRandomSampler(
-            dataset=self.train_dataset,
-            mini_repeat_count=self.num_generations,
-            world_size=self.world_size,
-            rank=self.rank,
-            batch_size=effective_batch_size
-            // self.num_generations
-            // self.args.sequence_parallel_degree,
-            repeat_count=self.num_iterations * self.args.gradient_accumulation_steps,
-            sequence_parallel_degree=self.args.sequence_parallel_degree,
-            shuffle=True,
-            seed=self.args.seed,
-            drop_last=True,
-        )
-
-    def _create_dataloader_params(self, is_eval=False, custom_batch_size=None):
-        """Create common dataloader parameters for train or eval."""
-        batch_size = custom_batch_size or (
-            self.args.eval_batch_size if is_eval else self._train_batch_size
-        )
-
-        params = {
-            "batch_size": batch_size,
-            "collate_fn": self.data_collator,
-            "num_workers": self.args.dataloader_num_workers,
-            "pin_memory": self.args.dataloader_pin_memory,
-        }
-
-        # Add persistent workers only for training
-        if not is_eval and hasattr(self.args, "dataloader_persistent_workers"):
-            params["persistent_workers"] = self.args.dataloader_persistent_workers
-
-        # Add prefetch factor if specified
-        if self.args.dataloader_prefetch_factor:
-            params["prefetch_factor"] = self.args.dataloader_prefetch_factor
-
-        return params
-
-    def _prepare_dataloader(
-        self, dataset, sampler, is_eval=False, custom_batch_size=None
-    ):
-        """Prepare a dataloader with the given dataset and sampler."""
-        # Get base parameters
-        dataloader_params = self._create_dataloader_params(is_eval, custom_batch_size)
-
-        # Add sampler configuration
-        if not isinstance(dataset, torch.utils.data.IterableDataset):
-            if isinstance(sampler, BatchSampler):
-                # batch_size and batch_sampler are mutually exclusive
-                dataloader_params["batch_sampler"] = sampler
-                del dataloader_params["batch_size"]
-            else:
-                dataloader_params["sampler"] = sampler
-                dataloader_params["drop_last"] = self.args.dataloader_drop_last
-
-            if not is_eval:
-                dataloader_params["worker_init_fn"] = seed_worker
-
-        # Create the dataloader
-        dataloader = DataLoader(dataset, **dataloader_params)
-
-        if self.args.sample_packing and (
-            (not is_eval and not self.args.pretraining)
-            or (is_eval and self.args.eval_sample_packing is not False)
-        ):
-            self.accelerator.even_batches = False
-
-        # Return unprepared dataloader if using sequence parallelism
-        # TODO(djsaunde): We might be able to use `accelerate`'s dataloader preparation
-        # if we use `dispatch_batches` and `slice_fn_for_dispatch` properly (i.e.,
-        # slice each batch along the sequence dimension).
-        if self.args.sequence_parallel_degree > 1:
-            return dataloader
-
-        # Otherwise prepare with accelerator
-        return self.accelerator.prepare_data_loader(dataloader)
-
-    def get_train_dataloader(self) -> DataLoader:
-        """Get dataloader for training"""
-        train_dataset = self.train_dataset
-        # pylint: disable=access-member-before-definition
-        data_collator = self.data_collator  # type: ignore
-
-        # Handle dataset preprocessing
-        if isinstance(train_dataset, datasets.Dataset):
-            # Add debug print before any modifications
-            if self.args.sample_packing and not self.args.pretraining:
-                train_dataset = train_dataset.remove_columns(["length"])
-            if not self.args.sample_packing or self.args.pretraining:
-                train_dataset = self._remove_unused_columns(
-                    train_dataset, description="training"
-                )
-        else:
-            self.data_collator = self._get_collator_with_removed_columns(  # pylint: disable=attribute-defined-outside-init
-                data_collator,
-                description="training",
-            )
-
-        # Get sampler and create dataloader
-        sampler = self._get_train_sampler()
-        dataloader = self._prepare_dataloader(train_dataset, sampler, is_eval=False)
-
-        return dataloader
-
-    def _generate_and_score_completions(
-        self, inputs: list[dict[str, torch.Tensor | Any]]
-    ) -> dict[str, torch.Tensor | Any]:
-        device = self.accelerator.device
-        mode = "eval" if self.control.should_evaluate else "train"
-
-        prompts = [x["prompt"] for x in inputs]
-        prompts_text = [
-            maybe_apply_chat_template(example, self.processing_class)["prompt"]
-            for example in inputs
-        ]
-        prompt_inputs = self.processing_class(
-            text=prompts_text,
-            return_tensors="pt",
-            padding=True,
-            padding_side="left",
-            add_special_tokens=False,
-        )
-        prompt_inputs = Trainer._prepare_inputs(self, prompt_inputs)
-        prompt_ids, prompt_mask = (
-            prompt_inputs["input_ids"],
-            prompt_inputs["attention_mask"],
-        )
-
-        if self.max_prompt_length is not None:
-            prompt_ids = prompt_ids[:, -self.max_prompt_length :]
-            prompt_mask = prompt_mask[:, -self.max_prompt_length :]
-
-        # Generate completions using either vLLM or regular generation
-        if self.args.use_vllm:
-            # First, have main process load weights if needed
-            # pylint: disable=access-member-before-definition
-            if self.state.global_step != self._last_loaded_step:  # type: ignore[has-type]
-                self._move_model_to_vllm()
-                # pylint: disable=attribute-defined-outside-init
-                self._last_loaded_step = self.state.global_step
-
-            # Generate completions using vLLM: gather all prompts and use them in a single call in the main process
-            all_prompts_text = gather_object(prompts_text)
-            if self.accelerator.is_main_process:
-                if self.args.sequence_parallel_degree > 1:
-                    # Calculate sequence parallel group information
-                    world_size = self.accelerator.num_processes
-                    sequence_parallel_degree = self.args.sequence_parallel_degree
-                    num_sp_groups = world_size // sequence_parallel_degree
-
-                    # Since processes in the same SP group have the same prompts, we need to ensure
-                    # we only take one copy of each prompt from each SP group
-                    ordered_set_of_prompts = []
-                    for sp_group_id in range(num_sp_groups):
-                        # Get the first process from each SP group (typically the group leader)
-                        group_leader_rank = sp_group_id * sequence_parallel_degree
-
-                        # Extract prompts from this SP group, accounting for num_generations duplicates
-                        # We only need prompts from one rank in each SP group
-                        group_prompts = all_prompts_text[
-                            group_leader_rank
-                            * len(prompts_text) : (group_leader_rank + 1)
-                            * len(prompts_text) : self.num_generations
-                        ]
-
-                        ordered_set_of_prompts.extend(group_prompts)
-                else:
-                    # Since 'prompts' contains 'num_generations' duplicates, we first take unique prompts, and generate
-                    # num_generations outputs for each one. This is faster than generating outputs for each duplicate
-                    # prompt individually.
-                    ordered_set_of_prompts = all_prompts_text[
-                        :: self.num_generations * self.args.sequence_parallel_degree
-                    ]
-
-                with profiling_context(self, "vLLM.generate"):
-                    completion_ids = self.vllm_client.generate(
-                        prompts=ordered_set_of_prompts,
-                        n=self.num_generations,
-                        repetition_penalty=self.repetition_penalty,
-                        temperature=self.temperature,
-                        top_p=self.top_p,
-                        top_k=-1 if self.top_k is None else self.top_k,
-                        min_p=0.0 if self.min_p is None else self.min_p,
-                        max_tokens=self.max_completion_length,
-                        guided_decoding_regex=self.guided_decoding_regex,
-                    )
-            else:
-                completion_ids = [None] * (
-                    len(all_prompts_text) // self.args.sequence_parallel_degree
-                )
-
-            # Broadcast the completions from the main process to all processes
-            completion_ids = broadcast_object_list(completion_ids, from_process=0)
-
-            # Determine the appropriate slice based on sequence parallelism
-            if self.args.sequence_parallel_degree > 1:
-                # Calculate SP group ID (which group of ranks this rank belongs to)
-                sp_group_id = self.accelerator.process_index // self.local_world_size
-
-                # Calculate the start index for this SP group
-                sp_group_start = sp_group_id * len(prompts) * self.local_world_size
-
-                # All ranks in the same SP group get the same data slice
-                process_slice = slice(
-                    sp_group_start,
-                    sp_group_start + len(prompts),
-                )
-                completion_ids = completion_ids[process_slice]
-            else:
-                # Original behavior for non-sequence parallel case
-                process_slice = slice(
-                    self.accelerator.process_index * len(prompts),
-                    (self.accelerator.process_index + 1) * len(prompts),
-                )
-                completion_ids = completion_ids[process_slice]
-
-            # Pad the completions, and concatenate them with the prompts
-            completion_ids = [
-                torch.tensor(ids, device=device) for ids in completion_ids
-            ]
-            completion_ids = pad(
-                completion_ids, padding_value=self.processing_class.pad_token_id
-            )
-            prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1)
-        else:
-            # Regular generation path
-            with unwrap_model_for_generation(
-                self.model_wrapped,
-                self.accelerator,
-                gather_deepspeed3_params=self.args.ds3_gather_for_generation,
-            ) as unwrapped_model:
-                prompt_completion_ids = unwrapped_model.generate(
-                    prompt_ids,
-                    attention_mask=prompt_mask,
-                    generation_config=self.generation_config,
-                )
-
-        # Compute prompt length and extract completion ids
-        prompt_length = prompt_ids.size(1)
-        prompt_ids = prompt_completion_ids[:, :prompt_length]
-        completion_ids = prompt_completion_ids[:, prompt_length:]
-
-        # Mask everything after the first EOS token
-        is_eos = completion_ids == self.processing_class.eos_token_id
-        eos_idx = torch.full(
-            (is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device
-        )
-        eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)]
-        sequence_indices = torch.arange(is_eos.size(1), device=device).expand(
-            is_eos.size(0), -1
-        )
-        completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int()
-
-        # If mask_truncated_completions is enabled, zero out truncated completions in completion_mask
-        if self.args.mask_truncated_completions:
-            truncated_completions = ~is_eos.any(dim=1)
-            completion_mask = (
-                completion_mask * (~truncated_completions).unsqueeze(1).int()
-            )
-
-        # Concatenate prompt_mask with completion_mask for logit computation
-        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)  # (B, P+C)
-
-        logits_to_keep = completion_ids.size(
-            1
-        )  # we only need to compute the logits for the completion tokens
-        batch_size = (
-            self.args.per_device_train_batch_size
-            if mode == "train"
-            else self.args.per_device_eval_batch_size
-        )
-
-        with torch.no_grad():
-            # When using num_iterations == 1, old_per_token_logps == per_token_logps, so we can skip it's
-            # computation here, and use per_token_logps.detach() instead.
-            if self.num_iterations > 1:
-                old_per_token_logps = self._get_per_token_logps(
-                    self.model,
-                    prompt_completion_ids,
-                    attention_mask,
-                    logits_to_keep,
-                    batch_size,
-                )
-            else:
-                old_per_token_logps = None
-
-            if self.beta == 0.0:
-                ref_per_token_logps = None
-            elif self.ref_model is not None:
-                ref_per_token_logps = self._get_per_token_logps(
-                    self.ref_model,
-                    prompt_completion_ids,
-                    attention_mask,
-                    logits_to_keep,
-                    batch_size,
-                )
-            else:
-                with self.accelerator.unwrap_model(self.model).disable_adapter():
-                    ref_per_token_logps = self._get_per_token_logps(
-                        self.model,
-                        prompt_completion_ids,
-                        attention_mask,
-                        logits_to_keep,
-                        batch_size,
-                    )
-
-        # Decode the generated completions
-        completions_text = self.processing_class.batch_decode(
-            completion_ids, skip_special_tokens=True
-        )
-        if is_conversational(inputs[0]):
-            completions = []
-            for prompt, completion in zip(prompts, completions_text):
-                bootstrap = (
-                    prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
-                )
-                completions.append(
-                    [{"role": "assistant", "content": bootstrap + completion}]
-                )
-        else:
-            completions = completions_text
-
-        rewards_per_func = torch.zeros(
-            len(prompts), len(self.reward_funcs), device=device
-        )
-        for i, (reward_func, reward_processing_class, reward_func_name) in enumerate(
-            zip(
-                self.reward_funcs,
-                self.reward_processing_classes,
-                self.reward_func_names,
-            )
-        ):
-            with profiling_context(self, reward_func_name):
-                if isinstance(
-                    reward_func, nn.Module
-                ):  # Module instead of PretrainedModel for compat with compiled models
-                    if is_conversational(inputs[0]):
-                        messages = [
-                            {"messages": p + c} for p, c in zip(prompts, completions)
-                        ]
-                        texts = [
-                            apply_chat_template(x, reward_processing_class)["text"]
-                            for x in messages
-                        ]
-                    else:
-                        texts = [p + c for p, c in zip(prompts, completions)]
-                    reward_inputs = reward_processing_class(
-                        text=texts,
-                        return_tensors="pt",
-                        padding=True,
-                        padding_side="right",
-                        add_special_tokens=False,
-                    )
-                    reward_inputs = Trainer._prepare_inputs(self, reward_inputs)
-                    with torch.inference_mode():
-                        rewards_per_func[:, i] = reward_func(**reward_inputs).logits[
-                            :, 0
-                        ]  # Shape (B*G,)
-                else:
-                    # Repeat all input columns (but "prompt" and "completion") to match the number of generations
-                    keys = [
-                        key for key in inputs[0] if key not in ["prompt", "completion"]
-                    ]
-                    reward_kwargs = {
-                        key: [example[key] for example in inputs] for key in keys
-                    }
-                    output_reward_func = reward_func(
-                        prompts=prompts, completions=completions, **reward_kwargs
-                    )
-                    # Convert None values to NaN
-                    output_reward_func = [
-                        reward if reward is not None else torch.nan
-                        for reward in output_reward_func
-                    ]
-
-                    rewards_per_func[:, i] = torch.tensor(
-                        output_reward_func, dtype=torch.float32, device=device
-                    )
-
-        # If all reward functions return None for a given row, issue a detailed warning
-        if torch.isnan(rewards_per_func).all(dim=1).any():
-            nan_row_idx = (
-                torch.isnan(rewards_per_func).all(dim=1).nonzero(as_tuple=True)[0][0]
-            )
-            row_reward_kwargs = {
-                key: value[nan_row_idx] for key, value in reward_kwargs.items()
-            }
-            row_reward_kwargs["prompt"] = prompts[nan_row_idx]
-            row_reward_kwargs["completion"] = completions[nan_row_idx]
-            warnings.warn(
-                f"All reward functions returned None for the following kwargs: {row_reward_kwargs}. "
-                "Please ensure that at least one reward function returns a valid reward."
-            )
-
-        # Gather the reward per function: this part is crucial, because the rewards are normalized per group and the
-        # completions may be distributed across processes
-        rewards_per_func = gather(rewards_per_func)
-
-        # Apply weights to each reward function's output and sum
-        rewards = (
-            rewards_per_func * self.reward_weights.to(device).unsqueeze(0)
-        ).nansum(dim=1)
-
-        # Compute grouped-wise rewards
-        mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1)
-        std_grouped_rewards = rewards.view(-1, self.num_generations).std(dim=1)
-
-        # Normalize the rewards to compute the advantages
-        mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(
-            self.num_generations, dim=0
-        )
-        std_grouped_rewards = std_grouped_rewards.repeat_interleave(
-            self.num_generations, dim=0
-        )
-        advantages = rewards - mean_grouped_rewards
-        if self.args.scale_rewards:
-            advantages = advantages / (std_grouped_rewards + 1e-4)
-
-        # Slice to keep only the local part of the data
-        if self.args.sequence_parallel_degree > 1:
-            # Calculate SP group ID (which group of ranks this rank belongs to)
-            sp_group_id = self.accelerator.process_index // self.local_world_size
-
-            # Calculate the start index for this SP group
-            sp_group_start = sp_group_id * len(prompts) * self.local_world_size
-
-            # All ranks in the same SP group get the same data slice
-            process_slice = slice(
-                sp_group_start,
-                sp_group_start + len(prompts),
-            )
-        else:
-            # Original behavior for non-sequence parallel case
-            process_slice = slice(
-                self.accelerator.process_index * len(prompts),
-                (self.accelerator.process_index + 1) * len(prompts),
-            )
-        advantages = advantages[process_slice]
-
-        # Log the metrics
-        if mode == "train":
-            self._total_train_tokens += (
-                self.accelerator.gather_for_metrics(attention_mask.sum()).sum().item()
-            )
-            self._metrics[mode]["num_tokens"] = [self._total_train_tokens]
-
-        # log completion lengths, mean, min, max
-        agg_completion_mask = self.accelerator.gather_for_metrics(
-            completion_mask.sum(1)
-        )
-        self._metrics[mode]["completions/mean_length"].append(
-            agg_completion_mask.float().mean().item()
-        )
-        self._metrics[mode]["completions/min_length"].append(
-            agg_completion_mask.float().min().item()
-        )
-        self._metrics[mode]["completions/max_length"].append(
-            agg_completion_mask.float().max().item()
-        )
-
-        # identify sequences that terminated with EOS and log their lengths
-        agg_terminated_with_eos = self.accelerator.gather_for_metrics(is_eos.any(dim=1))
-        term_completion_mask = agg_completion_mask[agg_terminated_with_eos]
-        clipped_completions_ratio = 1 - len(term_completion_mask) / len(
-            agg_completion_mask
-        )
-        self._metrics[mode]["completions/clipped_ratio"].append(
-            clipped_completions_ratio
-        )
-        if len(term_completion_mask) == 0:
-            # edge case where no completed sequences are found
-            term_completion_mask = torch.zeros(1, device=device)
-        self._metrics[mode]["completions/mean_terminated_length"].append(
-            term_completion_mask.float().mean().item()
-        )
-        self._metrics[mode]["completions/min_terminated_length"].append(
-            term_completion_mask.float().min().item()
-        )
-        self._metrics[mode]["completions/max_terminated_length"].append(
-            term_completion_mask.float().max().item()
-        )
-
-        # Calculate mean reward per function, but only for samples where the function was applied (non-NaN values)
-        for i, reward_func_name in enumerate(self.reward_func_names):
-            mean_rewards = torch.nanmean(rewards_per_func[:, i]).item()
-            self._metrics[mode][f"rewards/{reward_func_name}/mean"].append(mean_rewards)
-            std_rewards = nanstd(rewards_per_func[:, i]).item()
-            self._metrics[mode][f"rewards/{reward_func_name}/std"].append(std_rewards)
-        self._metrics[mode]["reward"].append(mean_grouped_rewards.mean().item())
-        self._metrics[mode]["reward_std"].append(std_grouped_rewards.mean().item())
-
-        # Log prompt and completion texts
-        self._textual_logs["prompt"].extend(gather_object(prompts_text))
-        self._textual_logs["completion"].extend(gather_object(completions_text))
-        for i, name in enumerate(self.reward_func_names):
-            self._textual_logs["rewards"][name].extend(rewards_per_func[:, i].tolist())
-
-        return {
-            "prompt_ids": prompt_ids,
-            "prompt_mask": prompt_mask,
-            "completion_ids": completion_ids,
-            "completion_mask": completion_mask,
-            "advantages": advantages,
-            "old_per_token_logps": old_per_token_logps,
-            "ref_per_token_logps": ref_per_token_logps,
-        }
+"""
+Axolotl GRPO trainer
+"""
+
+from contextlib import nullcontext
+
+from accelerate.utils import is_deepspeed_available, is_peft_model
+from trl import GRPOTrainer
+from trl.extras.profiling import profiling_decorator
+
+from axolotl.core.trainers.mixins import RngLoaderMixin, SchedulerMixin
+
+if is_deepspeed_available():
+    import deepspeed
+
+
+class AxolotlGRPOTrainer(RngLoaderMixin, SchedulerMixin, GRPOTrainer):
+    """
+    Extend the base GRPOTrainer for axolotl helpers
+    """
+
+    _tag_names = ["trl", "grpo", "axolotl"]
+
+    @profiling_decorator
+    def _move_model_to_vllm(self):
+        # For DeepSpeed ZeRO-3, we need to gather all parameters before operations
+        deepspeed_plugin = self.accelerator.state.deepspeed_plugin
+        zero_stage_3 = deepspeed_plugin is not None and deepspeed_plugin.zero_stage == 3
+        gather_if_zero3 = (
+            deepspeed.zero.GatheredParameters if zero_stage_3 else nullcontext
+        )
+
+        if is_peft_model(self.model):
+            # With PEFT and DeepSpeed ZeRO Stage 3, we must gather the full model at once before merging, as merging
+            # adapters in a sharded manner is not supported.
+            with gather_if_zero3(list(self.model.parameters())):
+                self.model.merge_adapter()
+
+                # Update vLLM weights while parameters are gathered
+                for name, param in self.model.named_parameters():
+                    # When using PEFT, we need to recover the original parameter name and discard some parameters
+                    name = (
+                        name.removeprefix("base_model.model.")
+                        .removeprefix("base_model.model.")
+                        .replace(".base_layer", "")
+                    )
+                    if self.model.prefix in name:
+                        continue
+                    # When module to save, remove its prefix and discard the original module
+                    if "original_module" in name:
+                        continue
+                    name = name.replace("modules_to_save.default.", "")
+
+                    if self.accelerator.is_main_process:
+                        self.vllm_client.update_named_param(name, param.data)
+
+            # Unmerge adapters while parameters are still gathered
+            self.model.unmerge_adapter()
+            # Parameters will automatically be repartitioned when exiting the context
+        else:
+            # For non-PEFT models, simply gather and update each parameter individually.
+            for name, param in self.model.named_parameters():
+                with gather_if_zero3([param]):
+                    if self.accelerator.is_main_process:
+                        self.vllm_client.update_named_param(name, param.data)
+
+        # Reset cache on main process
+        if self.accelerator.is_main_process:
+            self.vllm_client.reset_prefix_cache()
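The removed `_generate_and_score_completions` ends with GRPO's group-wise advantage computation: rewards are reshaped to (num_prompts, num_generations), each completion's advantage is its reward minus its group mean, optionally divided by the group standard deviation plus 1e-4. A small torch sketch of just that normalization step (toy reward values, not trainer code):

    import torch

    num_generations = 4
    rewards = torch.tensor([1.0, 0.0, 1.0, 0.0, 2.0, 2.0, 0.0, 0.0])  # 2 prompts x 4 gens

    mean_g = rewards.view(-1, num_generations).mean(dim=1).repeat_interleave(num_generations)
    std_g = rewards.view(-1, num_generations).std(dim=1).repeat_interleave(num_generations)

    advantages = rewards - mean_g
    scale_rewards = True
    if scale_rewards:
        advantages = advantages / (std_g + 1e-4)
    print(advantages)  # positive for above-group-mean completions, negative below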
@@ -6,3 +6,4 @@
 from .optimizer import OptimizerMixin
 from .rng_state_loader import RngLoaderMixin
 from .scheduler import SchedulerMixin
+from .sequence_parallel import SequenceParallelContextManager, SequenceParallelMixin
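The new `sequence_parallel` module below slices each batch along the sequence dimension before the forward pass. For its BATCH_ZIGZAG mode, each rank keeps its own chunk plus the mirrored chunk from the far end, which balances the causal-attention workload across the group. A tiny runnable sketch of that chunk selection for 8 positions and a 2-rank SP group (illustration only, not part of the module):

    import torch

    local_world_size = 2
    seq = torch.arange(8).unsqueeze(0)                # shape (1, 8): positions 0..7
    chunks = seq.chunk(2 * local_world_size, dim=1)   # four chunks of length 2

    for local_rank in range(local_world_size):
        selected = [chunks[local_rank], chunks[2 * local_world_size - local_rank - 1]]
        print(local_rank, torch.cat(selected, dim=1))
    # rank 0 -> positions [0, 1, 6, 7]; rank 1 -> positions [2, 3, 4, 5]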
313 src/axolotl/core/trainers/mixins/sequence_parallel.py Normal file
@@ -0,0 +1,313 @@
"""
Module for Axolotl trainer sequence parallelism mixin and training context manager
"""

import functools
import logging

import torch
import torch.distributed as dist
from datasets import Dataset
from torch import nn
from torch.utils.data import DistributedSampler, Sampler
from torch.utils.hooks import RemovableHandle

from axolotl.monkeypatch.attention.ring_attn import (
    RingAttnFunc,
    get_ring_attn_group,
    update_ring_attn_params,
)

LOG = logging.getLogger(__name__)


def apply_sequence_parallelism(
    batch: dict[str, torch.Tensor],
    local_rank: int,
    local_world_size: int,
    ring_attn_func: RingAttnFunc,
) -> dict[str, torch.Tensor]:
    """
    Apply sequence parallelism slicing to a batch.

    Args:
        batch: Batch dictionary (e.g., input_ids, attention_mask, etc.)
        local_rank: Local rank in the sequence parallel group
        local_world_size: World size of the sequence parallel group
        ring_attn_func: The ring attention function to use

    Returns:
        Sliced batch dictionary.
    """
    # Update ring attention params if needed
    if batch.get("position_ids") is not None:
        update_ring_attn_params(position_ids=batch["position_ids"])

    # Slice batch for sequence parallel processing
    total_seq_len = batch["input_ids"].size(1)
    for key in batch:
        if (
            key in batch
            and isinstance(batch[key], torch.Tensor)
            and batch[key].dim() > 1
            and batch[key].size(1) == total_seq_len
        ):
            if ring_attn_func in [
                RingAttnFunc.VARLEN_LLAMA3,
                RingAttnFunc.BATCH_RING,
            ]:
                # Split in sequential fashion and grab this rank's chunk
                batch[key] = (
                    batch[key].chunk(local_world_size, dim=1)[local_rank].contiguous()
                )
            elif ring_attn_func is RingAttnFunc.BATCH_ZIGZAG:
                chunks = batch[key].chunk(2 * local_world_size, dim=1)

                # Take rank's chunk and opposing chunk for zigzag pattern
                selected_chunks = [
                    chunks[local_rank],
                    chunks[2 * local_world_size - local_rank - 1],
                ]
                batch[key] = torch.cat(selected_chunks, dim=1).contiguous()
            elif ring_attn_func is RingAttnFunc.BATCH_STRIPE:
                # Split into striped data and stack
                tensor = torch.stack(
                    batch[key].split(local_world_size, dim=1),
                    dim=1,
                ).transpose(1, 2)
                batch[key] = tensor[:, local_rank].contiguous()

    return batch


class SequenceParallelMixin:
    """
    Mixin class for sequence parallelism support in trainers.

    This mixin provides functionality for handling sequence parallelism,
    specifically for creating appropriate data samplers.
    """

    args = None  # type: "AxolotlTrainingArguments"  # type: ignore[name-defined]

    def _setup_sequence_parallel(self):
        """Set up sequence parallelism environment."""
        self.ring_attn_group = get_ring_attn_group()

    def _create_sequence_parallel_sampler(
        self,
        dataset: Dataset,
        shuffle: bool = True,
        is_eval: bool = False,
    ) -> DistributedSampler:
        """
        Helper method to create sampler for sequence parallelism (SP).

        We create a distributed sampler with rank equal to the SP group ID, which
        means that all ranks in the SP group receive the same sample / set of samples
        per training step. We also set the number of replicas equal to the number of
        SP groups, which is a bit of a hack / unintended use, but works!

        Args:
            dataset: Dataset to sample from.
            shuffle: Whether to shuffle the dataset.
            is_eval: Whether we are creating a sampler for evaluation or training.

        Returns:
            Distributed sampler.
        """
        num_sp_groups = self.args.world_size // self.args.sequence_parallel_degree
        sp_group_id = dist.get_rank() // self.args.sequence_parallel_degree

        return DistributedSampler(
            dataset,
            num_replicas=num_sp_groups,
            rank=sp_group_id,
            seed=self.args.seed if shuffle else None,
            shuffle=shuffle,
            drop_last=not is_eval,
        )

    def _sp_get_train_sampler(self, dataset) -> Sampler | None:
        """
        Get a training sampler configured for sequence parallelism.

        Args:
            dataset: The training dataset

        Returns:
            Configured sequence parallel sampler.
        """
        return self._create_sequence_parallel_sampler(
            dataset,
            shuffle=not self.args.curriculum_sampling,
        )

    def _sp_get_eval_sampler(self, eval_dataset) -> Sampler | None:
        """
        Get an evaluation sampler configured for sequence parallelism.

        Args:
            eval_dataset: The evaluation dataset.

        Returns:
            Configured sequence parallel sampler.
        """
        return self._create_sequence_parallel_sampler(
            eval_dataset, shuffle=False, is_eval=True
        )


class SequenceParallelContextManager:
    """
    Context manager for sequence parallelism operations.

    This class provides a context that will automatically apply sequence parallelism
    during model forward passes using a pre-forward hook, and gather outputs from
    across the sequence parallelism group using a post-forward hook.
    """

    def __init__(
        self,
        model: nn.Module,
        sequence_parallel_degree: int,
        ring_attn_func: RingAttnFunc,
    ):
        self.model = model
        self.sequence_parallel_degree = sequence_parallel_degree
        self.ring_attn_func = ring_attn_func
        self.process_group = get_ring_attn_group()

        # Initialize sequence parallel group details
        self.local_rank = dist.get_rank(self.process_group)
        self.local_world_size = dist.get_world_size(self.process_group)

        # Will store hook handles for removal
        self.hook_handles: list[RemovableHandle] = []

        # Create a partially applied version of the apply_sequence_parallelism function
        # with pre-configured params
        self.apply_sequence_parallelism = functools.partial(
            apply_sequence_parallelism,
            local_rank=self.local_rank,
            local_world_size=self.local_world_size,
            ring_attn_func=self.ring_attn_func,
        )

    def __enter__(self):
        # Forward pre-hook to apply sequence parallelism
        def sequence_parallel_pre_hook(_, args, kwargs):
            # Apply sequence parallelism to kwargs
            kwargs = self.apply_sequence_parallelism(batch=kwargs)
            return args, kwargs

        # Forward post-hook to gather outputs
        def sequence_parallel_post_hook(_, __, output):
            # Gather the sharded outputs
            return self.gather_outputs(output)

        # Register both hooks
        self.hook_handles.append(
            self.model.register_forward_pre_hook(
                sequence_parallel_pre_hook, with_kwargs=True
            )
        )
        self.hook_handles.append(
            self.model.register_forward_hook(sequence_parallel_post_hook)
        )

        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Remove all hooks
        for handle in self.hook_handles:
            handle.remove()
        self.hook_handles = []

    def gather_outputs(self, output):
        """Gather sharded outputs from all ranks and reconstruct the full tensor."""
        # Handle different output formats (dict, tensor, etc.)
        if isinstance(output, dict):
            gathered_output = {}
            for key, value in output.items():
                if isinstance(value, torch.Tensor) and value.dim() > 1:
                    # Gather logits or other sequence-sharded tensors
                    gathered_value = self.gather_tensor(value)
                    gathered_output[key] = gathered_value
                else:
                    gathered_value = value.clone()
                    dist.all_reduce(
                        gathered_value, op=dist.ReduceOp.SUM, group=self.process_group
                    )
                    gathered_output[key] = gathered_value
            return gathered_output
        if isinstance(output, torch.Tensor):
            return self.gather_tensor(output)

        return output

    def gather_tensor(self, tensor):
        """Gather a sharded tensor from all ranks."""
        # Prepare tensors for all_gather
        world_size = self.local_world_size

        # Create list to store tensors from all ranks
        gathered_tensors = [torch.zeros_like(tensor) for _ in range(world_size)]
|
||||||
|
|
||||||
|
# All-gather operation
|
||||||
|
dist.all_gather(gathered_tensors, tensor, group=self.process_group)
|
||||||
|
|
||||||
|
# Concatenate along sequence dimension (typically dim=1)
|
||||||
|
if self.ring_attn_func in [RingAttnFunc.VARLEN_LLAMA3, RingAttnFunc.BATCH_RING]:
|
||||||
|
# Simple concatenation for standard sharding
|
||||||
|
return torch.cat(gathered_tensors, dim=1)
|
||||||
|
|
||||||
|
if self.ring_attn_func is RingAttnFunc.BATCH_ZIGZAG:
|
||||||
|
# Each rank has a pattern of (rank, world_size*2-rank-1)
|
||||||
|
reconstituted_tensors = [None] * (world_size * 2)
|
||||||
|
|
||||||
|
# First, split each gathered tensor into its two chunks
|
||||||
|
for rank, gathered_tensor in enumerate(gathered_tensors):
|
||||||
|
# Each tensor contains two chunks in the sequence dimension
|
||||||
|
chunk_size = gathered_tensor.size(1) // 2
|
||||||
|
chunk1, chunk2 = gathered_tensor.split(chunk_size, dim=1)
|
||||||
|
|
||||||
|
# Place chunks in their original positions
|
||||||
|
reconstituted_tensors[rank] = chunk1
|
||||||
|
reconstituted_tensors[world_size * 2 - rank - 1] = chunk2
|
||||||
|
|
||||||
|
# Concatenate the reconstituted tensors in the correct order
|
||||||
|
return torch.cat(reconstituted_tensors, dim=1)
|
||||||
|
|
||||||
|
# Otherwise, RingAttnFunc.BATCH_STRIPE
|
||||||
|
# In striping, each rank has every world_size-th slice
|
||||||
|
batch_size = tensor.size(0)
|
||||||
|
hidden_dim = tensor.size(-1)
|
||||||
|
|
||||||
|
# First, determine the full sequence length
|
||||||
|
total_seq_len = 0
|
||||||
|
for t in gathered_tensors:
|
||||||
|
total_seq_len += t.size(1)
|
||||||
|
|
||||||
|
# Create a tensor to hold the unstriped result
|
||||||
|
result = torch.zeros(
|
||||||
|
batch_size,
|
||||||
|
total_seq_len,
|
||||||
|
hidden_dim,
|
||||||
|
dtype=tensor.dtype,
|
||||||
|
device=tensor.device,
|
||||||
|
)
|
||||||
|
|
||||||
|
# For each rank's tensor, distribute its slices to the correct positions
|
||||||
|
for rank, gathered_tensor in enumerate(gathered_tensors):
|
||||||
|
# The rank's tensor contains every world_size-th slice
|
||||||
|
# starting from its rank position
|
||||||
|
seq_len = gathered_tensor.size(1)
|
||||||
|
for i in range(seq_len):
|
||||||
|
# Calculate the position in the full tensor
|
||||||
|
pos = i * world_size + rank
|
||||||
|
if pos < total_seq_len:
|
||||||
|
result[:, pos] = gathered_tensor[:, i]
|
||||||
|
|
||||||
|
return result
|
||||||
@@ -9,6 +9,8 @@ from PIL.Image import Resampling
|
|||||||
from transformers import TrainingArguments
|
from transformers import TrainingArguments
|
||||||
from trl import CPOConfig, KTOConfig, ORPOConfig, PRMConfig, RewardConfig
|
from trl import CPOConfig, KTOConfig, ORPOConfig, PRMConfig, RewardConfig
|
||||||
|
|
||||||
|
from axolotl.monkeypatch.attention.ring_attn.patch import RingAttnFunc
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class AxolotlTrainingMixins:
|
class AxolotlTrainingMixins:
|
||||||
@@ -214,16 +216,14 @@ class AxolotlTrainingMixins:
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
adam_beta3: Optional[float] = field(
|
sequence_parallel_degree: Optional[int] = field(
|
||||||
default=None,
|
default=1,
|
||||||
metadata={
|
metadata={"help": "The number of workers to use in sequence parallelism"},
|
||||||
"help": "The beta3 hyperparameter used in some optimizers such as CAME"
|
|
||||||
},
|
|
||||||
)
|
)
|
||||||
adam_epsilon2: Optional[float] = field(
|
ring_attn_func: Optional[RingAttnFunc] = field(
|
||||||
default=None,
|
default=None,
|
||||||
metadata={
|
metadata={
|
||||||
"help": "The epsilon2 hyperparameter used in some optimizers such as CAME"
|
"help": "The ring-flash-attn function to use in sequence parallelism"
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -10,237 +10,224 @@
|
|||||||
# License for the specific language governing permissions and limitations under
|
# License for the specific language governing permissions and limitations under
|
||||||
# the License.
|
# the License.
|
||||||
|
|
||||||
"""Base class for all plugins.
|
"""
|
||||||
|
Base class for all plugins.
|
||||||
|
|
||||||
A plugin is a reusable, modular, and self-contained piece of code that extends the functionality of Axolotl.
|
A plugin is a reusable, modular, and self-contained piece of code that extends the functionality of Axolotl.
|
||||||
Plugins can be used to integrate third-party models, modify the training process, or add new features.
|
Plugins can be used to integrate third-party models, modify the training process, or add new features.
|
||||||
|
|
||||||
To create a new plugin, you need to inherit from the BasePlugin class and implement the required methods.
|
To create a new plugin, you need to inherit from the BasePlugin class and implement the required methods.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import collections
|
import collections
|
||||||
import importlib
|
import importlib
|
||||||
import logging
|
import logging
|
||||||
from typing import TYPE_CHECKING, Callable, OrderedDict, Union
|
from typing import OrderedDict
|
||||||
|
|
||||||
from peft import PeftModel
|
import torch
|
||||||
from torch.optim import Optimizer
|
|
||||||
from torch.optim.lr_scheduler import LRScheduler
|
from torch.optim.lr_scheduler import LRScheduler
|
||||||
from transformers import PreTrainedModel, Trainer
|
|
||||||
|
|
||||||
from axolotl.utils.dict import DictDefault
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from axolotl.common.datasets import TrainDatasetMeta
|
|
||||||
|
|
||||||
|
|
||||||
class BasePlugin:
|
class BasePlugin:
|
||||||
"""Base class for all plugins. Defines the interface for plugin methods.
|
"""
|
||||||
|
Base class for all plugins. Defines the interface for plugin methods.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
None
|
||||||
|
|
||||||
Methods:
|
Methods:
|
||||||
register(cfg): Registers the plugin with the given configuration.
|
register(cfg): Registers the plugin with the given configuration.
|
||||||
load_datasets(cfg): Loads and preprocesses the dataset for training.
|
|
||||||
pre_model_load(cfg): Performs actions before the model is loaded.
|
pre_model_load(cfg): Performs actions before the model is loaded.
|
||||||
post_model_build(cfg, model): Performs actions after the model is loaded, but
|
post_model_build(cfg, model): Performs actions after the model is loaded, but before LoRA adapters are applied.
|
||||||
before LoRA adapters are applied.
|
|
||||||
pre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.
|
pre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.
|
||||||
post_lora_load(cfg, model): Performs actions after LoRA weights are loaded.
|
post_lora_load(cfg, model): Performs actions after LoRA weights are loaded.
|
||||||
post_model_load(cfg, model): Performs actions after the model is loaded,
|
post_model_load(cfg, model): Performs actions after the model is loaded, inclusive of any adapters.
|
||||||
inclusive of any adapters.
|
|
||||||
post_trainer_create(cfg, trainer): Performs actions after the trainer is
|
|
||||||
created.
|
|
||||||
create_optimizer(cfg, trainer): Creates and returns an optimizer for training.
|
create_optimizer(cfg, trainer): Creates and returns an optimizer for training.
|
||||||
create_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and
|
create_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and returns a learning rate scheduler.
|
||||||
returns a learning rate scheduler.
|
add_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before training.
|
||||||
add_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before
|
add_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after training.
|
||||||
training.
|
|
||||||
add_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after
|
|
||||||
training.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
"""Initializes the BasePlugin."""
|
"""
|
||||||
|
Initializes the BasePlugin.
|
||||||
|
"""
|
||||||
|
|
||||||
def register(self, cfg): # pylint: disable=unused-argument
|
def register(self, cfg): # pylint: disable=unused-argument
|
||||||
"""Registers the plugin with the given configuration.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: The configuration for the plugin.
|
|
||||||
"""
|
"""
|
||||||
|
Registers the plugin with the given configuration.
|
||||||
|
|
||||||
def get_input_args(self) -> str | None:
|
Parameters:
|
||||||
"""Returns a pydantic model for the plugin's input arguments."""
|
cfg (dict): The configuration for the plugin.
|
||||||
|
|
||||||
def load_datasets(
|
|
||||||
self, cfg: DictDefault, preprocess: bool = False
|
|
||||||
) -> Union["TrainDatasetMeta", None]:
|
|
||||||
"""Loads and preprocesses the dataset for training.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: The configuration for the plugin.
|
|
||||||
preprocess: Whether this is the preprocess step of the datasets.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dataset_meta: The metadata for the training dataset.
|
None
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def pre_model_load(self, cfg: DictDefault): # pylint: disable=unused-argument
|
def get_input_args(self):
|
||||||
"""Performs actions before the model is loaded.
|
"""
|
||||||
|
Returns a pydantic model for the plugin's input arguments.
|
||||||
Args:
|
|
||||||
cfg: The configuration for the plugin.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# pylint: disable=unused-argument
|
def pre_model_load(self, cfg): # pylint: disable=unused-argument
|
||||||
def post_model_build(self, cfg: DictDefault, model: PreTrainedModel):
|
|
||||||
"""Performs actions after the model is built/loaded, but before any adapters are applied.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: The configuration for the plugin.
|
|
||||||
"""
|
"""
|
||||||
|
Performs actions before the model is loaded.
|
||||||
|
|
||||||
# pylint: disable=unused-argument
|
Parameters:
|
||||||
def pre_lora_load(self, cfg: DictDefault, model: PreTrainedModel):
|
cfg (dict): The configuration for the plugin.
|
||||||
"""Performs actions before LoRA weights are loaded.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: The configuration for the plugin.
|
|
||||||
model: The loaded model.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# pylint: disable=unused-argument
|
|
||||||
def post_lora_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
|
|
||||||
"""Performs actions after LoRA weights are loaded.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: The configuration for the plugin.
|
|
||||||
model: The loaded model.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# pylint: disable=unused-argument
|
|
||||||
def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
|
|
||||||
"""Performs actions after the model is loaded.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: The configuration for the plugin.
|
|
||||||
model: The loaded model.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# pylint: disable=unused-argument
|
|
||||||
def get_trainer_cls(self, cfg: DictDefault) -> Trainer | None:
|
|
||||||
"""Returns a custom class for the trainer.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: The global axolotl configuration.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The first non-`None` trainer class returned by a plugin.
|
None
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# pylint: disable=unused-argument
|
def post_model_build(self, cfg, model): # pylint: disable=unused-argument
|
||||||
def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
|
"""
|
||||||
"""Performs actions after the trainer is created.
|
Performs actions after the model is built/loaded, but before any adapters are applied.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
cfg: The configuration for the plugin.
|
cfg (dict): The configuration for the plugin.
|
||||||
trainer: The trainer object for training.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# pylint: disable=unused-argument
|
def post_model_load(self, cfg, model): # pylint: disable=unused-argument
|
||||||
def create_optimizer(self, cfg: DictDefault, trainer: Trainer) -> Optimizer | None:
|
"""
|
||||||
"""Creates and returns an optimizer for training.
|
Performs actions after the model is loaded.
|
||||||
|
|
||||||
Args:
|
Parameters:
|
||||||
cfg: The configuration for the plugin.
|
cfg (dict): The configuration for the plugin.
|
||||||
trainer: The trainer object for training.
|
model (object): The loaded model.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The created optimizer.
|
None
|
||||||
|
"""
|
||||||
|
|
||||||
|
def pre_lora_load(self, cfg, model): # pylint: disable=unused-argument
|
||||||
|
"""
|
||||||
|
Performs actions before LoRA weights are loaded.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
cfg (dict): The configuration for the plugin.
|
||||||
|
model (object): The loaded model.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
|
"""
|
||||||
|
|
||||||
|
def post_lora_load(self, cfg, model): # pylint: disable=unused-argument
|
||||||
|
"""
|
||||||
|
Performs actions after LoRA weights are loaded.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
cfg (dict): The configuration for the plugin.
|
||||||
|
model (object): The loaded model.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
|
"""
|
||||||
|
|
||||||
|
def get_trainer_cls(self, cfg): # pylint: disable=unused-argument):
|
||||||
|
"""
|
||||||
|
Returns a custom class for the trainer.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
cfg (dict): The global axolotl configuration.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
class: The class for the trainer.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def create_optimizer(self, cfg, trainer): # pylint: disable=unused-argument
|
||||||
|
"""
|
||||||
|
Creates and returns an optimizer for training.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
cfg (dict): The configuration for the plugin.
|
||||||
|
trainer (object): The trainer object for training.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
object: The created optimizer.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# pylint: disable=unused-argument
|
|
||||||
def create_lr_scheduler(
|
def create_lr_scheduler(
|
||||||
self,
|
self, cfg, trainer, optimizer, num_training_steps
|
||||||
cfg: DictDefault,
|
) -> LRScheduler | None: # pylint: disable=unused-argument
|
||||||
trainer: Trainer,
|
"""
|
||||||
optimizer: Optimizer,
|
Creates and returns a learning rate scheduler.
|
||||||
num_training_steps: int,
|
|
||||||
) -> LRScheduler | None:
|
|
||||||
"""Creates and returns a learning rate scheduler.
|
|
||||||
|
|
||||||
Args:
|
Parameters:
|
||||||
cfg: The configuration for the plugin.
|
cfg (dict): The configuration for the plugin.
|
||||||
trainer: The trainer object for training.
|
trainer (object): The trainer object for training.
|
||||||
optimizer: The optimizer for training.
|
optimizer (object): The optimizer for training.
|
||||||
num_training_steps: Total number of training steps
|
num_training_steps (int): Total number of training steps
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The created learning rate scheduler.
|
object (LRScheduler): The created learning rate scheduler.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# pylint: disable=unused-argument
|
def add_callbacks_pre_trainer(self, cfg, model): # pylint: disable=unused-argument
|
||||||
def add_callbacks_pre_trainer(
|
"""
|
||||||
self, cfg: DictDefault, model: PreTrainedModel
|
setup callbacks before creating the trainer.
|
||||||
) -> list[Callable]:
|
|
||||||
"""Set up callbacks before creating the trainer.
|
|
||||||
|
|
||||||
Args:
|
Parameters:
|
||||||
cfg: The configuration for the plugin.
|
cfg (dict): The configuration for the plugin.
|
||||||
model: The loaded model.
|
model (object): The loaded model.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of callback functions to be added to the `TrainingArgs`.
|
List[callable]: A list of callback functions to be added to the TrainingArgs
|
||||||
"""
|
"""
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# pylint: disable=unused-argument
|
|
||||||
def add_callbacks_post_trainer(
|
def add_callbacks_post_trainer(
|
||||||
self, cfg: DictDefault, trainer: Trainer
|
self, cfg, trainer
|
||||||
) -> list[Callable]:
|
): # pylint: disable=unused-argument
|
||||||
"""Adds callbacks to the trainer after creating the trainer. This is useful for
|
"""
|
||||||
callbacks that require access to the model or trainer.
|
Adds callbacks to the trainer after creating the trainer.
|
||||||
|
This is useful for callbacks that require access to the model or trainer.
|
||||||
|
|
||||||
Args:
|
Parameters:
|
||||||
cfg: The configuration for the plugin.
|
cfg (dict): The configuration for the plugin.
|
||||||
trainer: The trainer object for training.
|
trainer (object): The trainer object for training.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of callback functions to be added
|
List[callable]: A list of callback functions to be added
|
||||||
"""
|
"""
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# pylint: disable=unused-argument
|
def post_train(self, cfg, model): # pylint: disable=unused-argument
|
||||||
def post_train(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
|
"""
|
||||||
"""Performs actions after training is complete.
|
Performs actions after training is complete.
|
||||||
|
|
||||||
Args:
|
Parameters:
|
||||||
cfg: The axolotl configuration.
|
cfg (dict): The axolotl configuration
|
||||||
model: The loaded model.
|
model (object): The loaded model.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def post_train_unload(self, cfg: DictDefault): # pylint: disable=unused-argument
|
def post_train_unload(self, cfg): # pylint: disable=unused-argument
|
||||||
"""Performs actions after training is complete and the model is unloaded.
|
"""
|
||||||
|
Performs actions after training is complete and the model is unloaded.
|
||||||
|
|
||||||
Args:
|
Parameters:
|
||||||
cfg: The configuration for the plugin.
|
cfg (dict): The configuration for the plugin.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def load_plugin(plugin_name: str) -> BasePlugin:
|
def load_plugin(plugin_name: str) -> BasePlugin:
|
||||||
"""Loads a plugin based on the given plugin name.
|
"""
|
||||||
|
Loads a plugin based on the given plugin name.
|
||||||
|
|
||||||
The plugin name should be in the format "module_name.class_name". This function
|
The plugin name should be in the format "module_name.class_name".
|
||||||
splits the plugin name into module and class, imports the module, retrieves the
|
This function splits the plugin name into module and class, imports the module,
|
||||||
class from the module, and creates an instance of the class.
|
retrieves the class from the module, and creates an instance of the class.
|
||||||
|
|
||||||
Args:
|
Parameters:
|
||||||
plugin_name: The name of the plugin to be loaded. The name should be in the
|
plugin_name (str): The name of the plugin to be loaded. The name should be in the format "module_name.class_name".
|
||||||
format "module_name.class_name".
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
An instance of the loaded plugin.
|
BasePlugin: An instance of the loaded plugin.
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
ImportError: If the plugin module cannot be imported.
|
ImportError: If the plugin module cannot be imported.
|
||||||
@@ -269,25 +256,28 @@ def load_plugin(plugin_name: str) -> BasePlugin:
|
|||||||
|
|
||||||
|
|
||||||
class PluginManager:
|
class PluginManager:
|
||||||
"""The `PluginManager` class is responsible for loading and managing plugins. It
|
"""
|
||||||
should be a singleton so it can be accessed from anywhere in the codebase.
|
The PluginManager class is responsible for loading and managing plugins.
|
||||||
|
It should be a singleton so it can be accessed from anywhere in the codebase.
|
||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
plugins: A list of loaded plugins.
|
plugins (List[BasePlugin]): A list of loaded plugins.
|
||||||
|
|
||||||
Methods:
|
Methods:
|
||||||
get_instance(): Static method to get the singleton instance of `PluginManager`.
|
get_instance(): Static method to get the singleton instance of PluginManager.
|
||||||
register(plugin_name: str): Registers a new plugin by its name.
|
register(plugin_name: str): Registers a new plugin by its name.
|
||||||
pre_model_load(cfg): Calls the pre_model_load method of all registered plugins.
|
pre_model_load(cfg): Calls the pre_model_load method of all registered plugins.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
plugins: OrderedDict[str, BasePlugin] = collections.OrderedDict()
|
plugins: OrderedDict[str, BasePlugin] = collections.OrderedDict()
|
||||||
|
|
||||||
_instance: PluginManager | None = None
|
_instance = None
|
||||||
_cfg: DictDefault | None = None
|
_cfg = None
|
||||||
|
|
||||||
def __new__(cls):
|
def __new__(cls):
|
||||||
"""Creates a new instance of PluginManager if it doesn't exist yet."""
|
"""
|
||||||
|
Creates a new instance of PluginManager if it doesn't exist yet.
|
||||||
|
"""
|
||||||
if cls._instance is None:
|
if cls._instance is None:
|
||||||
cls._instance = super(PluginManager, cls).__new__(cls)
|
cls._instance = super(PluginManager, cls).__new__(cls)
|
||||||
cls._instance.plugins: OrderedDict[str, BasePlugin] = (
|
cls._instance.plugins: OrderedDict[str, BasePlugin] = (
|
||||||
@@ -297,8 +287,9 @@ class PluginManager:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_instance() -> "PluginManager":
|
def get_instance() -> "PluginManager":
|
||||||
"""Returns the singleton instance of PluginManager. If the instance doesn't
|
"""
|
||||||
exist, it creates a new one.
|
Returns the singleton instance of PluginManager.
|
||||||
|
If the instance doesn't exist, it creates a new one.
|
||||||
"""
|
"""
|
||||||
if PluginManager._instance is None:
|
if PluginManager._instance is None:
|
||||||
PluginManager()
|
PluginManager()
|
||||||
@@ -313,10 +304,14 @@ class PluginManager:
|
|||||||
self._cfg = cfg
|
self._cfg = cfg
|
||||||
|
|
||||||
def register(self, plugin_name: str):
|
def register(self, plugin_name: str):
|
||||||
"""Registers a new plugin by its name.
|
"""
|
||||||
|
Registers a new plugin by its name.
|
||||||
|
|
||||||
Args:
|
Parameters:
|
||||||
plugin_name: The name of the plugin to be registered.
|
plugin_name (str): The name of the plugin to be registered.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
ImportError: If the plugin module cannot be imported.
|
ImportError: If the plugin module cannot be imported.
|
||||||
@@ -329,11 +324,12 @@ class PluginManager:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
logging.error(f"Failed to load plugin: {plugin_name}")
|
logging.error(f"Failed to load plugin: {plugin_name}")
|
||||||
|
|
||||||
def get_input_args(self) -> list[str]:
|
def get_input_args(self):
|
||||||
"""Returns a list of Pydantic classes for all registered plugins' input arguments.'
|
"""
|
||||||
|
Returns a list of Pydantic classes for all registered plugins' input arguments.'
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of Pydantic classes for all registered plugins' input arguments.'
|
list[str]: A list of Pydantic classes for all registered plugins' input arguments.'
|
||||||
"""
|
"""
|
||||||
input_args = []
|
input_args = []
|
||||||
for plugin in self.plugins.values():
|
for plugin in self.plugins.values():
|
||||||
@@ -342,88 +338,83 @@ class PluginManager:
|
|||||||
input_args.append(input_args_from_plugin)
|
input_args.append(input_args_from_plugin)
|
||||||
return input_args
|
return input_args
|
||||||
|
|
||||||
def load_datasets(
|
def pre_model_load(self, cfg):
|
||||||
self, cfg: DictDefault, preprocess: bool = False
|
"""
|
||||||
) -> Union["TrainDatasetMeta", None]:
|
Calls the pre_model_load method of all registered plugins.
|
||||||
"""Calls the load_datasets method of each registered plugin.
|
|
||||||
|
|
||||||
Args:
|
Parameters:
|
||||||
cfg: The configuration for the plugins.
|
cfg (dict): The configuration for the plugins.
|
||||||
preprocess: Whether this is preprocess step of the datasets.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The dataset metadata loaded from all registered plugins.
|
None
|
||||||
"""
|
|
||||||
return_ds_meta = None
|
|
||||||
for plugin in self.plugins.values():
|
|
||||||
dataset_meta = plugin.load_datasets(cfg, preprocess)
|
|
||||||
if dataset_meta is not None:
|
|
||||||
if return_ds_meta is None:
|
|
||||||
return_ds_meta = dataset_meta
|
|
||||||
else:
|
|
||||||
raise RuntimeError("Multiple plugins loaded datasets")
|
|
||||||
return return_ds_meta
|
|
||||||
|
|
||||||
def pre_model_load(self, cfg: DictDefault):
|
|
||||||
"""Calls the pre_model_load method of all registered plugins.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: The configuration for the plugins.
|
|
||||||
"""
|
"""
|
||||||
for plugin in self.plugins.values():
|
for plugin in self.plugins.values():
|
||||||
plugin.pre_model_load(cfg)
|
plugin.pre_model_load(cfg)
|
||||||
|
|
||||||
def post_model_build(self, cfg: DictDefault, model: PreTrainedModel):
|
def post_model_build(self, cfg, model):
|
||||||
"""Calls the `post_model_build` method of all registered plugins after the
|
"""
|
||||||
model has been built / loaded, but before any adapters have been applied.
|
Calls the post_model_build method of all registered plugins after the model has been built/loaded,
|
||||||
|
but before any adapters have been applied.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
cfg: The configuration for the plugins.
|
cfg (dict): The configuration for the plugins.
|
||||||
model: The loaded model.
|
model (object): The loaded model.
|
||||||
"""
|
"""
|
||||||
for plugin in self.plugins.values():
|
for plugin in self.plugins.values():
|
||||||
plugin.post_model_build(cfg, model)
|
plugin.post_model_build(cfg, model)
|
||||||
|
|
||||||
def pre_lora_load(self, cfg: DictDefault, model: PreTrainedModel):
|
def post_model_load(self, cfg, model):
|
||||||
"""Calls the `pre_lora_load` method of all registered plugins.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: The configuration for the plugins.
|
|
||||||
model: The loaded model.
|
|
||||||
"""
|
"""
|
||||||
for plugin in self.plugins.values():
|
Calls the post_model_load method of all registered plugins after the model has been loaded
|
||||||
plugin.pre_lora_load(cfg, model)
|
inclusive of any adapters
|
||||||
|
|
||||||
def post_lora_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
|
Parameters:
|
||||||
"""Calls the `post_lora_load` method of all registered plugins.
|
cfg (dict): The configuration for the plugins.
|
||||||
|
model (object): The loaded model.
|
||||||
|
|
||||||
Args:
|
Returns:
|
||||||
cfg: The configuration for the plugins.
|
None
|
||||||
model: The loaded model.
|
|
||||||
"""
|
|
||||||
for plugin in self.plugins.values():
|
|
||||||
plugin.post_lora_load(cfg, model)
|
|
||||||
|
|
||||||
def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
|
|
||||||
"""Calls the `post_model_load` method of all registered plugins after the model
|
|
||||||
has been loaded inclusive of any adapters.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: The configuration for the plugins.
|
|
||||||
model: The loaded model.
|
|
||||||
"""
|
"""
|
||||||
for plugin in self.plugins.values():
|
for plugin in self.plugins.values():
|
||||||
plugin.post_model_load(cfg, model)
|
plugin.post_model_load(cfg, model)
|
||||||
|
|
||||||
def get_trainer_cls(self, cfg: DictDefault) -> Trainer | None:
|
def pre_lora_load(self, cfg, model):
|
||||||
"""Calls the `get_trainer_cls` method of all registered plugins and returns the
|
"""
|
||||||
first non-`None` trainer class.
|
Calls the pre_lora_load method of all registered plugins.
|
||||||
|
|
||||||
Args:
|
Parameters:
|
||||||
cfg: The configuration for the plugins.
|
cfg (dict): The configuration for the plugins.
|
||||||
|
model (object): The loaded model.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The first non-`None` trainer class returned by a plugin.
|
None
|
||||||
|
"""
|
||||||
|
for plugin in self.plugins.values():
|
||||||
|
plugin.pre_lora_load(cfg, model)
|
||||||
|
|
||||||
|
def post_lora_load(self, cfg, model):
|
||||||
|
"""
|
||||||
|
Calls the post_lora_load method of all registered plugins.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
cfg (dict): The configuration for the plugins.
|
||||||
|
model (object): The loaded model.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
|
"""
|
||||||
|
for plugin in self.plugins.values():
|
||||||
|
plugin.post_lora_load(cfg, model)
|
||||||
|
|
||||||
|
def get_trainer_cls(self, cfg):
|
||||||
|
"""
|
||||||
|
Calls the get_trainer_cls method of all registered plugins and returns the first non-None trainer class.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
cfg (dict): The configuration for the plugins.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
object: The trainer class, or None if none was found.
|
||||||
"""
|
"""
|
||||||
for plugin in self.plugins.values():
|
for plugin in self.plugins.values():
|
||||||
trainer_cls = plugin.get_trainer_cls(cfg)
|
trainer_cls = plugin.get_trainer_cls(cfg)
|
||||||
@@ -431,25 +422,15 @@ class PluginManager:
|
|||||||
return trainer_cls
|
return trainer_cls
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
|
def create_optimizer(self, trainer):
|
||||||
"""Calls the `post_trainer_create` method of all registered plugins.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: The configuration for the plugins.
|
|
||||||
trainer: The trainer object for training.
|
|
||||||
"""
|
"""
|
||||||
for plugin in self.plugins.values():
|
Calls the create_optimizer method of all registered plugins and returns the first non-None optimizer.
|
||||||
plugin.post_trainer_create(cfg, trainer)
|
|
||||||
|
|
||||||
def create_optimizer(self, trainer: Trainer) -> Optimizer | None:
|
Parameters:
|
||||||
"""Calls the `create_optimizer` method of all registered plugins and returns
|
trainer (object): The trainer object for training.
|
||||||
the first non-`None` optimizer.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
trainer: The trainer object for training.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The created optimizer, or `None` if none was found.
|
object: The created optimizer, or None if none was found.
|
||||||
"""
|
"""
|
||||||
for plugin in self.plugins.values():
|
for plugin in self.plugins.values():
|
||||||
optimizer = plugin.create_optimizer(self.cfg, trainer)
|
optimizer = plugin.create_optimizer(self.cfg, trainer)
|
||||||
@@ -458,17 +439,17 @@ class PluginManager:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def create_lr_scheduler(
|
def create_lr_scheduler(
|
||||||
self, trainer: Trainer, optimizer: Optimizer, num_training_steps: int
|
self, trainer, optimizer, num_training_steps
|
||||||
) -> LRScheduler | None:
|
) -> LRScheduler | None:
|
||||||
"""Calls the `create_lr_scheduler` method of all registered plugins and returns
|
"""
|
||||||
the first non-`None` scheduler.
|
Calls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.
|
||||||
|
|
||||||
Args:
|
Parameters:
|
||||||
trainer: The trainer object for training.
|
trainer (object): The trainer object for training.
|
||||||
optimizer: The optimizer for training.
|
optimizer (object): The optimizer for training.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The created learning rate scheduler, or `None` if not found.
|
object: The created learning rate scheduler, or None if none was found.
|
||||||
"""
|
"""
|
||||||
for plugin in self.plugins.values():
|
for plugin in self.plugins.values():
|
||||||
scheduler: LRScheduler | None = plugin.create_lr_scheduler(
|
scheduler: LRScheduler | None = plugin.create_lr_scheduler(
|
||||||
@@ -481,17 +462,16 @@ class PluginManager:
|
|||||||
return scheduler
|
return scheduler
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def add_callbacks_pre_trainer(
|
def add_callbacks_pre_trainer(self, cfg, model):
|
||||||
self, cfg: DictDefault, model: PreTrainedModel
|
"""
|
||||||
) -> list[Callable]:
|
Calls the add_callbacks_pre_trainer method of all registered plugins.
|
||||||
"""Calls the add_callbacks_pre_trainer method of all registered plugins.
|
|
||||||
|
|
||||||
Args:
|
Parameters:
|
||||||
cfg: The configuration for the plugins.
|
cfg (dict): The configuration for the plugins.
|
||||||
model: The loaded model.
|
model (object): The loaded model.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of callback functions to be added to the `TrainingArgs`.
|
List[callable]: A list of callback functions to be added to the TrainingArgs.
|
||||||
"""
|
"""
|
||||||
callbacks = []
|
callbacks = []
|
||||||
for plugin in self.plugins.values():
|
for plugin in self.plugins.values():
|
||||||
@@ -500,17 +480,16 @@ class PluginManager:
|
|||||||
callbacks.extend(plugin_callbacks)
|
callbacks.extend(plugin_callbacks)
|
||||||
return callbacks
|
return callbacks
|
||||||
|
|
||||||
def add_callbacks_post_trainer(
|
def add_callbacks_post_trainer(self, cfg, trainer):
|
||||||
self, cfg: DictDefault, trainer: Trainer
|
"""
|
||||||
) -> list[Callable]:
|
Calls the add_callbacks_post_trainer method of all registered plugins.
|
||||||
"""Calls the `add_callbacks_post_trainer` method of all registered plugins.
|
|
||||||
|
|
||||||
Args:
|
Parameters:
|
||||||
cfg: The configuration for the plugins.
|
cfg (dict): The configuration for the plugins.
|
||||||
trainer: The trainer object for training.
|
trainer (object): The trainer object for training.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of callback functions to be added to the `TrainingArgs`.
|
List[callable]: A list of callback functions to be added to the TrainingArgs.
|
||||||
"""
|
"""
|
||||||
callbacks = []
|
callbacks = []
|
||||||
for plugin in self.plugins.values():
|
for plugin in self.plugins.values():
|
||||||
@@ -519,31 +498,41 @@ class PluginManager:
|
|||||||
callbacks.extend(plugin_callbacks)
|
callbacks.extend(plugin_callbacks)
|
||||||
return callbacks
|
return callbacks
|
||||||
|
|
||||||
def post_train(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
|
def post_train(self, cfg, model):
|
||||||
"""Calls the post_train method of all registered plugins.
|
"""
|
||||||
|
Calls the post_train method of all registered plugins.
|
||||||
|
|
||||||
Args:
|
Parameters:
|
||||||
cfg: The configuration for the plugins.
|
cfg (dict): The configuration for the plugins.
|
||||||
model: The loaded model.
|
model (object): The loaded model.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
"""
|
"""
|
||||||
for plugin in self.plugins.values():
|
for plugin in self.plugins.values():
|
||||||
plugin.post_train(cfg, model)
|
plugin.post_train(cfg, model)
|
||||||
|
|
||||||
def post_train_unload(self, cfg: DictDefault):
|
def post_train_unload(self, cfg):
|
||||||
"""Calls the post_train_unload method of all registered plugins.
|
"""
|
||||||
|
Calls the post_train_unload method of all registered plugins.
|
||||||
|
|
||||||
Args:
|
Parameters:
|
||||||
cfg: The configuration for the plugins.
|
cfg (dict): The configuration for the plugins.
|
||||||
model: The loaded model.
|
model (object): The loaded model.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
"""
|
"""
|
||||||
for plugin in self.plugins.values():
|
for plugin in self.plugins.values():
|
||||||
plugin.post_train_unload(cfg)
|
plugin.post_train_unload(cfg)
|
||||||
|
|
||||||
|
|
||||||
class BaseOptimizerFactory:
|
class BaseOptimizerFactory:
|
||||||
"""Base class for factories to create custom optimizers"""
|
"""
|
||||||
|
Base class for factories to create custom optimizers
|
||||||
|
"""
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self, opt_model, training_args, **optimizer_kwargs
|
self, opt_model, training_args, **optimizer_kwargs
|
||||||
) -> Optimizer | None:
|
) -> "torch.optim.Optimizer":
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -20,15 +20,25 @@ from cut_cross_entropy.transformers.utils import (
|
|||||||
from transformers.cache_utils import Cache
|
from transformers.cache_utils import Cache
|
||||||
from transformers.modeling_outputs import CausalLMOutputWithPast
|
from transformers.modeling_outputs import CausalLMOutputWithPast
|
||||||
from transformers.models.cohere.modeling_cohere import (
|
from transformers.models.cohere.modeling_cohere import (
|
||||||
|
_CONFIG_FOR_DOC,
|
||||||
|
COHERE_INPUTS_DOCSTRING,
|
||||||
KwargsForCausalLM,
|
KwargsForCausalLM,
|
||||||
)
|
)
|
||||||
from transformers.processing_utils import Unpack
|
from transformers.processing_utils import Unpack
|
||||||
|
from transformers.utils import (
|
||||||
|
add_start_docstrings_to_model_forward,
|
||||||
|
replace_return_docstrings,
|
||||||
|
)
|
||||||
from transformers.utils.deprecation import deprecate_kwarg
|
from transformers.utils.deprecation import deprecate_kwarg
|
||||||
|
|
||||||
_PATCH_OPTS: PatchOptions | None = None
|
_PATCH_OPTS: PatchOptions | None = None
|
||||||
|
|
||||||
|
|
||||||
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
||||||
|
@add_start_docstrings_to_model_forward(COHERE_INPUTS_DOCSTRING)
|
||||||
|
@replace_return_docstrings(
|
||||||
|
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
||||||
|
)
|
||||||
def cce_forward(
|
def cce_forward(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.LongTensor | None = None,
|
input_ids: torch.LongTensor | None = None,
|
||||||
|
|||||||
@@ -17,15 +17,25 @@ from cut_cross_entropy.transformers.utils import (
|
|||||||
from transformers.cache_utils import Cache
|
from transformers.cache_utils import Cache
|
||||||
from transformers.modeling_outputs import CausalLMOutputWithPast
|
from transformers.modeling_outputs import CausalLMOutputWithPast
|
||||||
from transformers.models.gemma.modeling_gemma import (
|
from transformers.models.gemma.modeling_gemma import (
|
||||||
|
_CONFIG_FOR_DOC,
|
||||||
|
GEMMA_INPUTS_DOCSTRING,
|
||||||
KwargsForCausalLM,
|
KwargsForCausalLM,
|
||||||
)
|
)
|
||||||
from transformers.processing_utils import Unpack
|
from transformers.processing_utils import Unpack
|
||||||
|
from transformers.utils import (
|
||||||
|
add_start_docstrings_to_model_forward,
|
||||||
|
replace_return_docstrings,
|
||||||
|
)
|
||||||
from transformers.utils.deprecation import deprecate_kwarg
|
from transformers.utils.deprecation import deprecate_kwarg
|
||||||
|
|
||||||
_PATCH_OPTS: PatchOptions | None = None
|
_PATCH_OPTS: PatchOptions | None = None
|
||||||
|
|
||||||
|
|
||||||
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
||||||
|
@add_start_docstrings_to_model_forward(GEMMA_INPUTS_DOCSTRING)
|
||||||
|
@replace_return_docstrings(
|
||||||
|
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
||||||
|
)
|
||||||
def cce_forward(
|
def cce_forward(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.LongTensor | None = None,
|
input_ids: torch.LongTensor | None = None,
|
||||||
|
|||||||
@@ -20,11 +20,15 @@ from torch import nn
|
|||||||
from transformers.cache_utils import Cache, HybridCache
|
from transformers.cache_utils import Cache, HybridCache
|
||||||
from transformers.modeling_outputs import CausalLMOutputWithPast
|
from transformers.modeling_outputs import CausalLMOutputWithPast
|
||||||
from transformers.models.gemma3.modeling_gemma3 import (
|
from transformers.models.gemma3.modeling_gemma3 import (
|
||||||
|
_CONFIG_FOR_DOC,
|
||||||
|
GEMMA3_INPUTS_DOCSTRING,
|
||||||
Gemma3CausalLMOutputWithPast,
|
Gemma3CausalLMOutputWithPast,
|
||||||
logger,
|
logger,
|
||||||
)
|
)
|
||||||
from transformers.utils import (
|
from transformers.utils import (
|
||||||
|
add_start_docstrings_to_model_forward,
|
||||||
is_torchdynamo_compiling,
|
is_torchdynamo_compiling,
|
||||||
|
replace_return_docstrings,
|
||||||
)
|
)
|
||||||
from transformers.utils.deprecation import deprecate_kwarg
|
from transformers.utils.deprecation import deprecate_kwarg
|
||||||
|
|
||||||
@@ -34,6 +38,10 @@ _PATCH_OPTS: PatchOptions | None = None
|
|||||||
|
|
||||||
|
|
||||||
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
||||||
|
@add_start_docstrings_to_model_forward(GEMMA3_INPUTS_DOCSTRING)
|
||||||
|
@replace_return_docstrings(
|
||||||
|
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
||||||
|
)
|
||||||
def cce_forward(
|
def cce_forward(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.LongTensor | None = None,
|
input_ids: torch.LongTensor | None = None,
|
||||||
@@ -162,6 +170,10 @@ def cce_forward(
|
|||||||
|
|
||||||
|
|
||||||
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
||||||
|
@add_start_docstrings_to_model_forward(GEMMA3_INPUTS_DOCSTRING)
|
||||||
|
@replace_return_docstrings(
|
||||||
|
output_type=Gemma3CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
||||||
|
)
|
||||||
def cce_forward_multimodal(
|
def cce_forward_multimodal(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.LongTensor | None = None,
|
input_ids: torch.LongTensor | None = None,
|
||||||
|
|||||||
@@ -19,9 +19,15 @@ from transformers.modeling_outputs import (
|
|||||||
CausalLMOutputWithPast,
|
CausalLMOutputWithPast,
|
||||||
)
|
)
|
||||||
from transformers.models.llama.modeling_llama import (
|
from transformers.models.llama.modeling_llama import (
|
||||||
|
_CONFIG_FOR_DOC,
|
||||||
|
LLAMA_INPUTS_DOCSTRING,
|
||||||
KwargsForCausalLM,
|
KwargsForCausalLM,
|
||||||
)
|
)
|
||||||
from transformers.processing_utils import Unpack
|
from transformers.processing_utils import Unpack
|
||||||
|
from transformers.utils import (
|
||||||
|
add_start_docstrings_to_model_forward,
|
||||||
|
replace_return_docstrings,
|
||||||
|
)
|
||||||
from transformers.utils.deprecation import deprecate_kwarg
|
from transformers.utils.deprecation import deprecate_kwarg
|
||||||
from transformers.utils.generic import can_return_tuple
|
from transformers.utils.generic import can_return_tuple
|
||||||
|
|
||||||
@@ -30,6 +36,10 @@ _PATCH_OPTS: PatchOptions | None = None
|
|||||||
|
|
||||||
@can_return_tuple
|
@can_return_tuple
|
||||||
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
||||||
|
@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
|
||||||
|
@replace_return_docstrings(
|
||||||
|
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
||||||
|
)
|
||||||
def cce_forward(
|
def cce_forward(
|
||||||
self,
|
self,
|
||||||
input_ids: Optional[torch.LongTensor] = None,
|
input_ids: Optional[torch.LongTensor] = None,
|
||||||
|
|||||||
@@ -16,12 +16,22 @@ from torch import nn
|
|||||||
from transformers.cache_utils import Cache
|
from transformers.cache_utils import Cache
|
||||||
from transformers.modeling_outputs import CausalLMOutputWithPast
|
from transformers.modeling_outputs import CausalLMOutputWithPast
|
||||||
from transformers.models.llama4.modeling_llama4 import (
|
from transformers.models.llama4.modeling_llama4 import (
|
||||||
|
_CONFIG_FOR_DOC,
|
||||||
|
LLAMA4_INPUTS_DOCSTRING,
|
||||||
Llama4CausalLMOutputWithPast,
|
Llama4CausalLMOutputWithPast,
|
||||||
)
|
)
|
||||||
|
from transformers.utils import (
|
||||||
|
add_start_docstrings_to_model_forward,
|
||||||
|
replace_return_docstrings,
|
||||||
|
)
|
||||||
|
|
||||||
_PATCH_OPTS: PatchOptions | None = None
|
_PATCH_OPTS: PatchOptions | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings_to_model_forward(LLAMA4_INPUTS_DOCSTRING)
|
||||||
|
@replace_return_docstrings(
|
||||||
|
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
||||||
|
)
|
||||||
def cce_forward(
|
def cce_forward(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.LongTensor | None = None,
|
input_ids: torch.LongTensor | None = None,
|
||||||
@@ -150,6 +160,9 @@ def cce_forward(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@replace_return_docstrings(
|
||||||
|
output_type=Llama4CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
||||||
|
)
|
||||||
def cce_forward_multimodal(
|
def cce_forward_multimodal(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.LongTensor | None = None, # type: ignore
|
input_ids: torch.LongTensor | None = None, # type: ignore
|
||||||
|
|||||||
@@ -19,11 +19,15 @@ from transformers.models.mistral3.modeling_mistral3 import (
|
|||||||
Mistral3CausalLMOutputWithPast,
|
Mistral3CausalLMOutputWithPast,
|
||||||
)
|
)
|
||||||
from transformers.models.mistral.modeling_mistral import (
|
from transformers.models.mistral.modeling_mistral import (
|
||||||
|
_CONFIG_FOR_DOC,
|
||||||
|
MISTRAL_INPUTS_DOCSTRING,
|
||||||
KwargsForCausalLM,
|
KwargsForCausalLM,
|
||||||
)
|
)
|
||||||
from transformers.processing_utils import Unpack
|
from transformers.processing_utils import Unpack
|
||||||
from transformers.utils import (
|
from transformers.utils import (
|
||||||
|
add_start_docstrings_to_model_forward,
|
||||||
is_torchdynamo_compiling,
|
is_torchdynamo_compiling,
|
||||||
|
replace_return_docstrings,
|
||||||
)
|
)
|
||||||
from transformers.utils.deprecation import deprecate_kwarg
|
from transformers.utils.deprecation import deprecate_kwarg
|
||||||
|
|
||||||
@@ -31,6 +35,10 @@ _PATCH_OPTS: PatchOptions | None = None
|
|||||||
|
|
||||||
|
|
||||||
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
||||||
|
@add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
|
||||||
|
@replace_return_docstrings(
|
||||||
|
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
||||||
|
)
|
||||||
def cce_forward(
|
def cce_forward(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.LongTensor | None = None,
|
input_ids: torch.LongTensor | None = None,
|
||||||
|
|||||||
@@ -13,10 +13,16 @@ from cut_cross_entropy.transformers.utils import (
|
|||||||
apply_lce,
|
apply_lce,
|
||||||
)
|
)
|
||||||
from transformers.models.qwen2_moe.modeling_qwen2_moe import (
|
from transformers.models.qwen2_moe.modeling_qwen2_moe import (
|
||||||
|
_CONFIG_FOR_DOC,
|
||||||
|
QWEN2MOE_INPUTS_DOCSTRING,
|
||||||
MoeCausalLMOutputWithPast,
|
MoeCausalLMOutputWithPast,
|
||||||
MoeModelOutputWithPast,
|
MoeModelOutputWithPast,
|
||||||
load_balancing_loss_func,
|
load_balancing_loss_func,
|
||||||
)
|
)
|
||||||
|
from transformers.utils import (
|
||||||
|
add_start_docstrings_to_model_forward,
|
||||||
|
replace_return_docstrings,
|
||||||
|
)
|
||||||
from transformers.utils.deprecation import deprecate_kwarg
|
from transformers.utils.deprecation import deprecate_kwarg
|
||||||
from transformers.utils.generic import can_return_tuple
|
from transformers.utils.generic import can_return_tuple
|
||||||
|
|
||||||
@@ -25,6 +31,10 @@ _PATCH_OPTS: PatchOptions | None = None
|
|||||||
|
|
||||||
@can_return_tuple
|
@can_return_tuple
|
||||||
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
||||||
|
@add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
|
||||||
|
@replace_return_docstrings(
|
||||||
|
output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
||||||
|
)
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
input_ids: Optional[torch.LongTensor] = None,
|
input_ids: Optional[torch.LongTensor] = None,
|
||||||
|
|||||||
@@ -14,12 +14,22 @@ from cut_cross_entropy.transformers.utils import (
|
|||||||
)
|
)
|
||||||
from torch.nn import CrossEntropyLoss
|
from torch.nn import CrossEntropyLoss
|
||||||
from transformers.models.qwen2_vl.modeling_qwen2_vl import (
|
from transformers.models.qwen2_vl.modeling_qwen2_vl import (
|
||||||
|
_CONFIG_FOR_DOC,
|
||||||
|
QWEN2_VL_INPUTS_DOCSTRING,
|
||||||
Qwen2VLCausalLMOutputWithPast,
|
Qwen2VLCausalLMOutputWithPast,
|
||||||
)
|
)
|
||||||
|
from transformers.utils import (
|
||||||
|
add_start_docstrings_to_model_forward,
|
||||||
|
replace_return_docstrings,
|
||||||
|
)
|
||||||
|
|
||||||
_PATCH_OPTS: PatchOptions | None = None
|
_PATCH_OPTS: PatchOptions | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@add_start_docstrings_to_model_forward(QWEN2_VL_INPUTS_DOCSTRING)
|
||||||
|
@replace_return_docstrings(
|
||||||
|
output_type=Qwen2VLCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
||||||
|
)
|
||||||
def cce_forward_multimodal(
|
def cce_forward_multimodal(
|
||||||
self,
|
self,
|
||||||
input_ids: Optional[torch.LongTensor] = None,
|
input_ids: Optional[torch.LongTensor] = None,
|
||||||
|
|||||||
@@ -12,13 +12,20 @@ from cut_cross_entropy.transformers.utils import (
|
|||||||
TransformersModelT,
|
TransformersModelT,
|
||||||
apply_lce,
|
apply_lce,
|
||||||
)
|
)
|
||||||
|
from transformers.modeling_outputs import CausalLMOutputWithPast
|
||||||
from transformers.models.qwen3_moe.modeling_qwen3_moe import (
|
from transformers.models.qwen3_moe.modeling_qwen3_moe import (
|
||||||
|
_CONFIG_FOR_DOC,
|
||||||
|
QWEN3_MOE_INPUTS_DOCSTRING,
|
||||||
KwargsForCausalLM,
|
KwargsForCausalLM,
|
||||||
MoeCausalLMOutputWithPast,
|
MoeCausalLMOutputWithPast,
|
||||||
MoeModelOutputWithPast,
|
MoeModelOutputWithPast,
|
||||||
load_balancing_loss_func,
|
load_balancing_loss_func,
|
||||||
)
|
)
|
||||||
from transformers.processing_utils import Unpack
|
from transformers.processing_utils import Unpack
|
||||||
|
from transformers.utils import (
|
||||||
|
add_start_docstrings_to_model_forward,
|
||||||
|
replace_return_docstrings,
|
||||||
|
)
|
||||||
from transformers.utils.deprecation import deprecate_kwarg
|
from transformers.utils.deprecation import deprecate_kwarg
|
||||||
from transformers.utils.generic import can_return_tuple
|
from transformers.utils.generic import can_return_tuple
|
||||||
|
|
||||||
@@ -27,6 +34,10 @@ _PATCH_OPTS: PatchOptions | None = None
|
|||||||
|
|
||||||
@can_return_tuple
|
@can_return_tuple
|
||||||
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
||||||
|
@add_start_docstrings_to_model_forward(QWEN3_MOE_INPUTS_DOCSTRING)
|
||||||
|
@replace_return_docstrings(
|
||||||
|
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
||||||
|
)
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
input_ids: Optional[torch.LongTensor] = None,
|
input_ids: Optional[torch.LongTensor] = None,
|
||||||
|
|||||||
@@ -151,30 +151,6 @@ class LigerPlugin(BasePlugin):
|
|||||||
rms_norm=cfg.liger_rms_norm,
|
rms_norm=cfg.liger_rms_norm,
|
||||||
layer_norm=cfg.liger_layer_norm,
|
layer_norm=cfg.liger_layer_norm,
|
||||||
)
|
)
|
||||||
elif cfg.model_config_type == "qwen3":
|
|
||||||
from axolotl.integrations.liger.models.qwen3 import (
|
|
||||||
apply_liger_kernel_to_qwen3,
|
|
||||||
)
|
|
||||||
|
|
||||||
apply_liger_kernel_to_qwen3(
|
|
||||||
cross_entropy=cfg.liger_cross_entropy,
|
|
||||||
fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
|
|
||||||
glu_activation=cfg.liger_glu_activation,
|
|
||||||
rms_norm=cfg.liger_rms_norm,
|
|
||||||
layer_norm=cfg.liger_layer_norm,
|
|
||||||
)
|
|
||||||
elif cfg.model_config_type == "qwen3_moe":
|
|
||||||
from axolotl.integrations.liger.models.qwen3_moe import (
|
|
||||||
apply_liger_kernel_to_qwen3_moe,
|
|
||||||
)
|
|
||||||
|
|
||||||
apply_liger_kernel_to_qwen3_moe(
|
|
||||||
cross_entropy=cfg.liger_cross_entropy,
|
|
||||||
fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
|
|
||||||
glu_activation=cfg.liger_glu_activation,
|
|
||||||
rms_norm=cfg.liger_rms_norm,
|
|
||||||
layer_norm=cfg.liger_layer_norm,
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
logging.warning(
|
logging.warning(
|
||||||
f"Unsupported model config type: {cfg.model_config_type}. Liger not applied."
|
f"Unsupported model config type: {cfg.model_config_type}. Liger not applied."
|
||||||
|
|||||||
@@ -14,6 +14,10 @@ from torch.nn import CrossEntropyLoss
 from transformers.modeling_outputs import CausalLMOutputWithPast


+# @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
+# @replace_return_docstrings(
+# output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
+# )
 def lce_forward(
     self,
     input_ids: torch.LongTensor = None,
@@ -13,11 +13,21 @@ from liger_kernel.transformers.fused_linear_cross_entropy import (
 from torch.nn import CrossEntropyLoss
 from transformers.modeling_outputs import MoeCausalLMOutputWithPast
 from transformers.models.jamba.modeling_jamba import (
+    _CONFIG_FOR_DOC,
+    JAMBA_INPUTS_DOCSTRING,
     HybridMambaAttentionDynamicCache,
     load_balancing_loss_func,
 )
+from transformers.utils import (
+    add_start_docstrings_to_model_forward,
+    replace_return_docstrings,
+)


+@add_start_docstrings_to_model_forward(JAMBA_INPUTS_DOCSTRING)
+@replace_return_docstrings(
+    output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
+)
 def lce_forward(
     self,
     input_ids: torch.LongTensor = None,
@@ -1,160 +0,0 @@
"""
Liger FLCE for Qwen3. Based on transformers v4.51.3.
"""

import sys
from typing import Optional, Tuple, Union

import torch
from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
from transformers.cache_utils import Cache
from transformers.modeling_outputs import CausalLMOutputWithPast


def lce_forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Cache] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    **kwargs,
) -> Union[Tuple, CausalLMOutputWithPast]:
    r"""
    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
            This is useful when using packed tensor format (single dimension for batch and sequence length).

    Returns:
    """

    # pylint: disable=duplicate-code
    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        cache_position=cache_position,
        **kwargs,
    )

    hidden_states = outputs[0]

    logits = None
    loss = None
    # if in training mode, don't materialize logits
    if self.training and (labels is not None):
        loss = LigerForCausalLMLoss(
            hidden_states=hidden_states,
            lm_head_weight=self.lm_head.weight,
            labels=labels,
            hidden_size=self.config.hidden_size,
            **kwargs,
        )

    else:  # if in inference mode materialize logits
        slice_indices = (
            slice(-logits_to_keep, None)
            if isinstance(logits_to_keep, int)
            else logits_to_keep
        )
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

    return CausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )


def apply_liger_kernel_to_qwen3(
    cross_entropy: bool = False,
    fused_linear_cross_entropy: bool = False,
    rms_norm: bool = False,
    glu_activation: bool = False,
    layer_norm: bool = False,
    **kwargs,  # pylint: disable=unused-argument
) -> None:
    # pylint: disable=duplicate-code
    """
    Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3)

    Args:
        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
        fused_linear_cross_entropy (bool):
            Whether to apply Liger's fused linear cross entropy loss. Default is False.
            `cross_entropy` and `fused_linear_cross_entropy` cannot both be False.
            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False.
        glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
    """

    import transformers.models.qwen3.modeling_qwen3  # noqa: F401 # pylint: disable=unused-import
    from liger_kernel.transformers.functional import liger_cross_entropy
    from liger_kernel.transformers.layer_norm import LigerLayerNorm
    from liger_kernel.transformers.rms_norm import LigerRMSNorm
    from liger_kernel.transformers.swiglu import LigerSwiGLUMLP

    assert not (
        cross_entropy and fused_linear_cross_entropy
    ), "cross_entropy and fused_linear_cross_entropy cannot both be True."

    modeling_qwen3 = sys.modules["transformers.models.qwen3.modeling_qwen3"]

    if rms_norm:
        modeling_qwen3.Qwen3RMSNorm = LigerRMSNorm

    if glu_activation:
        modeling_qwen3.Qwen3MLP = LigerSwiGLUMLP

    if layer_norm:
        modeling_qwen3.nn.LayerNorm = LigerLayerNorm

    if cross_entropy:
        from transformers.loss.loss_utils import nn

        nn.functional.cross_entropy = liger_cross_entropy

    if fused_linear_cross_entropy:
        modeling_qwen3.Qwen3ForCausalLM.forward = lce_forward
@@ -1,191 +0,0 @@
"""
Liger FLCE for Qwen3 MoE. Based on transformers v4.51.3.
"""

import sys
from copy import deepcopy
from typing import List, Optional, Union

import torch
from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
from transformers.modeling_outputs import MoeCausalLMOutputWithPast
from transformers.models.qwen3_moe.modeling_qwen3_moe import load_balancing_loss_func


def lce_forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    output_router_logits: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    **kwargs,
) -> MoeCausalLMOutputWithPast:
    r"""
    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
            This is useful when using packed tensor format (single dimension for batch and sequence length).

    Returns:
    """

    # pylint: disable=duplicate-code
    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_router_logits = (
        output_router_logits
        if output_router_logits is not None
        else self.config.output_router_logits
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        output_router_logits=output_router_logits,
        cache_position=cache_position,
        **kwargs,
    )

    hidden_states = outputs[0]

    logits = None
    loss = None
    # if in training mode, don't materialize logits
    if self.training and (labels is not None):
        loss = LigerForCausalLMLoss(
            hidden_states=hidden_states,
            lm_head_weight=self.lm_head.weight,
            labels=labels,
            hidden_size=self.config.hidden_size,
            **kwargs,
        )

    else:  # if in inference mode materialize logits
        slice_indices = (
            slice(-logits_to_keep, None)
            if isinstance(logits_to_keep, int)
            else logits_to_keep
        )
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

    aux_loss = None
    if output_router_logits:
        aux_loss = load_balancing_loss_func(
            outputs.router_logits,
            self.num_experts,
            self.num_experts_per_tok,
            attention_mask,
        )
        if labels is not None:
            loss += self.router_aux_loss_coef * aux_loss.to(
                loss.device
            )  # make sure to reside in the same device

    return MoeCausalLMOutputWithPast(
        loss=loss,
        aux_loss=aux_loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )


def apply_liger_kernel_to_qwen3_moe(
    cross_entropy: bool = False,
    fused_linear_cross_entropy: bool = False,
    rms_norm: bool = False,
    glu_activation: bool = False,
    layer_norm: bool = False,
    **kwargs,  # pylint: disable=unused-argument
) -> None:
    # pylint: disable=duplicate-code
    """
    Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3)

    Args:
        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
        fused_linear_cross_entropy (bool):
            Whether to apply Liger's fused linear cross entropy loss. Default is False.
            `cross_entropy` and `fused_linear_cross_entropy` cannot both be False.
            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False.
        glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
    """

    import transformers.models.qwen3_moe.modeling_qwen3_moe  # noqa: F401 # pylint: disable=unused-import
    from liger_kernel.transformers.functional import liger_cross_entropy
    from liger_kernel.transformers.layer_norm import LigerLayerNorm
    from liger_kernel.transformers.rms_norm import LigerRMSNorm
    from liger_kernel.transformers.swiglu import LigerSwiGLUMLP

    assert not (
        cross_entropy and fused_linear_cross_entropy
    ), "cross_entropy and fused_linear_cross_entropy cannot both be True."

    modeling_qwen3_moe = sys.modules["transformers.models.qwen3_moe.modeling_qwen3_moe"]

    if rms_norm:
        modeling_qwen3_moe.Qwen3MoeRMSNorm = LigerRMSNorm

    if glu_activation:

        def _liger_swiglu_mlp_wrapper(config, intermediate_size=None, **kwargs):
            "Accepts intermediate_size to pass to LigerSwiGLUMLP"
            # clone config to avoid modifying the original
            config = deepcopy(config)
            if intermediate_size:
                setattr(config, "intermediate_size", intermediate_size)
            return LigerSwiGLUMLP(config, **kwargs)

        modeling_qwen3_moe.Qwen3MoeMLP = _liger_swiglu_mlp_wrapper

    if layer_norm:
        modeling_qwen3_moe.nn.LayerNorm = LigerLayerNorm

    if cross_entropy:
        from transformers.loss.loss_utils import nn

        nn.functional.cross_entropy = liger_cross_entropy

    if fused_linear_cross_entropy:
        modeling_qwen3_moe.Qwen3MoeForCausalLM.forward = lce_forward
@@ -1,4 +1,5 @@
-"""Module for definition of GEGLU Triton kernels.
+"""
+Module for definition of GEGLU Triton kernels.

 See "GLU Variants Improve Transformer" (https://arxiv.org/abs/2002.05202).

@@ -11,6 +12,8 @@ import torch
 import triton
 import triton.language as tl

+SQRT_2_PI: tl.constexpr = 0.7978845608028654  # sqrt(2/π)
+

 @triton.jit
 def _geglu_fwd_kernel(
@@ -55,12 +55,15 @@ def dequantize(
     target_device = W.device

     # Extract quantization state
+    nested = False
     if not isinstance(quant_state, list):
         # New style quant_state class
         absmax = quant_state.absmax.to(target_device)
         shape = quant_state.shape
         dtype = quant_state.dtype
         blocksize = quant_state.blocksize
+        if quant_state.nested:
+            nested = True
-        offset = quant_state.offset.to(target_device)
-        state2 = quant_state.state2
-        absmax2 = state2.absmax.to(target_device)
+            offset = quant_state.offset.to(target_device)
+            state2 = quant_state.state2
+            absmax2 = state2.absmax.to(target_device)
@@ -115,6 +118,7 @@ def dequantize(
         ctypes.c_int(n_elements_absmax),
     )

-    out_absmax += offset
+    if nested:
+        out_absmax += offset

     # Choose appropriate dequantization function
@@ -1,10 +0,0 @@
"""Init for axolotl.loaders module"""

# pylint: disable=unused-import
# flake8: noqa

from .adapter import load_adapter, load_lora
from .constants import MULTIMODAL_AUTO_MODEL_MAPPING
from .model import ModelLoader
from .processor import load_processor
from .tokenizer import load_tokenizer
@@ -1,206 +0,0 @@
"""Adapter loading functionality, including LoRA / QLoRA and associated utils"""

import logging
import os
import types
from typing import Any

import bitsandbytes as bnb
import torch
from bitsandbytes.nn import Params4bit
from peft import (
    AdaptionPromptConfig,
    LoftQConfig,
    LoraConfig,
    PeftConfig,
    PeftMixedModel,
    PeftModel,
    get_peft_model,
)
from transformers import PreTrainedModel

from axolotl.loaders.utils import get_linear_embedding_layers
from axolotl.utils.dict import DictDefault

LOG = logging.getLogger(__name__)


def setup_quantized_meta_for_peft(model: torch.nn.Module):
    """Replaces `quant_state.to` with a dummy function to prevent PEFT from moving `quant_state` to meta device"""

    def temp_to_method(self, *args, **kwargs):  # pylint: disable=unused-argument
        return self

    for param in model.parameters():
        if isinstance(param, Params4bit):
            param.quant_state._orig_to = (  # pylint: disable=protected-access
                param.quant_state.to
            )
            param.quant_state.to = types.MethodType(temp_to_method, param.quant_state)


def setup_quantized_peft_meta_for_training(model: torch.nn.Module):
    """Replaces dummy `quant_state.to` method with the original function to allow training to continue"""
    for param in model.parameters():
        if isinstance(param, Params4bit) and hasattr(param.quant_state, "_orig_to"):
            param.quant_state.to = (
                param.quant_state._orig_to  # pylint: disable=protected-access
            )
            param.quant_state._orig_to = None  # pylint: disable=protected-access


def find_all_linear_names(model):
    cls = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt, torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if (
            isinstance(module, cls)
            or "Linear" in module.__class__.__name__
            and module.__class__.__name__ not in ("LlamaLinearScalingRotaryEmbedding",)
        ):
            names = name.split(".")
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    embedding_modules = get_linear_embedding_layers(model.config.model_type)
    output_embedding = embedding_modules[1]
    if output_embedding in lora_module_names:  # needed for 16-bit
        lora_module_names.remove(output_embedding)

    return list(lora_module_names)


def load_lora(
    model: PreTrainedModel,
    cfg: DictDefault,
    inference: bool = False,
    config_only: bool = False,
) -> tuple[PreTrainedModel | PeftModel | PeftMixedModel | None, PeftConfig | None]:
    lora_target_modules = cfg.lora_target_modules or []

    if cfg.lora_target_linear:
        linear_names = find_all_linear_names(model)
        LOG.info(f"found linear modules: {repr(sorted(linear_names))}")
        lora_target_modules_as_list = (
            lora_target_modules
            if isinstance(lora_target_modules, list)
            else [lora_target_modules]
        )
        lora_target_modules = list(set(lora_target_modules_as_list + linear_names))

    lora_config_kwargs = {}
    loftq_bits = cfg.peft and cfg.peft.loftq_config and cfg.peft.loftq_config.loftq_bits
    if loftq_bits:
        lora_config_kwargs["loftq_config"] = LoftQConfig(loftq_bits=loftq_bits)
        lora_config_kwargs["init_lora_weights"] = "loftq"
    if cfg.peft_init_lora_weights:
        lora_config_kwargs["init_lora_weights"] = cfg.peft_init_lora_weights
    if cfg.peft_use_dora:
        lora_config_kwargs["use_dora"] = cfg.peft_use_dora
        LOG.info("Initializing LoRA weights using dora. This might take longer.")
    if cfg.peft_use_rslora:
        lora_config_kwargs["use_rslora"] = cfg.peft_use_rslora
    if cfg.peft_layer_replication:
        lora_config_kwargs["layer_replication"] = cfg.peft_layer_replication

    lora_config = LoraConfig(
        r=cfg.lora_r,
        lora_alpha=cfg.lora_alpha,
        target_modules=lora_target_modules,
        layers_to_transform=cfg.peft_layers_to_transform,
        layers_pattern=cfg.peft_layers_pattern,
        lora_dropout=cfg.lora_dropout,
        fan_in_fan_out=cfg.lora_fan_in_fan_out,
        modules_to_save=cfg.lora_modules_to_save if cfg.lora_modules_to_save else None,
        bias="none",
        task_type="CAUSAL_LM",
        **lora_config_kwargs,
    )

    if config_only:
        return None, lora_config

    rank = int(os.environ.get("LOCAL_RANK", 0))

    if (
        cfg.fsdp
        and cfg.adapter
        and cfg.fsdp_config.fsdp_cpu_ram_efficient_loading
        and rank != 0
    ):
        setup_quantized_meta_for_peft(model)

    if cfg.lora_model_dir:
        LOG.debug("Loading pretrained PEFT - LoRA")
        model_kwargs: Any = {}
        if cfg.lora_on_cpu:
            model_kwargs["max_memory"] = {"cpu": "256GiB"}
            model_kwargs["device_map"] = {"": "cpu"}
        model = PeftModel.from_pretrained(
            model,
            cfg.lora_model_dir,
            is_trainable=(not inference),
            **model_kwargs,
        )
    else:
        model = get_peft_model(model, lora_config)

    if rank == 0:
        try:
            model.print_trainable_parameters()
        except AttributeError as exc:
            LOG.warning(
                "Exception caught during model.print_trainable_parameters(): %s", exc
            )
    elif (
        cfg.fsdp
        and cfg.adapter
        and cfg.fsdp_config.fsdp_cpu_ram_efficient_loading
        and rank != 0
    ):
        setup_quantized_peft_meta_for_training(model)

    return model, lora_config


def load_adapter(
    model: PreTrainedModel,
    cfg: DictDefault,
    adapter: str | None,
    inference: bool = False,
) -> tuple[PreTrainedModel | PeftModel | PeftMixedModel, PeftConfig | None]:
    if adapter is None:
        return model, None
    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
    if adapter in ["lora", "qlora"]:
        peft_model, lora_config = load_lora(model, cfg, inference=inference)
        return peft_model, lora_config
    if adapter == "llama-adapter":
        peft_model, lora_config = load_llama_adapter(model, cfg)
        return peft_model, lora_config

    raise NotImplementedError(f"{adapter} PEFT adapter not available")


def load_llama_adapter(
    model: PreTrainedModel, cfg: DictDefault
) -> tuple[PeftModel | PeftMixedModel, PeftConfig]:
    peft_config = AdaptionPromptConfig(
        adapter_layers=cfg.peft_adapter.layers,  # layers (L)
        adapter_len=cfg.peft_adapter.len,  # prompt length (K)
        task_type="CAUSAL_LM",
    )

    if cfg.lora_model_dir:
        LOG.debug("Loading pretrained PEFT - llama_adapter")
        peft_model = PeftModel.from_pretrained(
            model,
            cfg.lora_model_dir,
            torch_dtype=torch.float16,
        )
    else:
        peft_model = get_peft_model(model, peft_config)

    peft_model.print_trainable_parameters()

    return peft_model, peft_config
@@ -1,21 +0,0 @@
"""Shared constants for axolotl.loaders module"""

from transformers import (
    Gemma3ForConditionalGeneration,
    Llama4ForConditionalGeneration,
    LlavaForConditionalGeneration,
    Mistral3ForConditionalGeneration,
    MllamaForConditionalGeneration,
    Qwen2_5_VLForConditionalGeneration,
    Qwen2VLForConditionalGeneration,
)

MULTIMODAL_AUTO_MODEL_MAPPING = {
    "mllama": MllamaForConditionalGeneration,
    "llama4": Llama4ForConditionalGeneration,
    "llava": LlavaForConditionalGeneration,
    "qwen2_vl": Qwen2VLForConditionalGeneration,
    "qwen2_5_vl": Qwen2_5_VLForConditionalGeneration,
    "mistral3": Mistral3ForConditionalGeneration,
    "gemma3": Gemma3ForConditionalGeneration,
}
@@ -1,754 +0,0 @@
|
|||||||
"""Model loader class implementation for loading, configuring, and patching various
|
|
||||||
models.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import gc
|
|
||||||
import logging
|
|
||||||
import math
|
|
||||||
import os
|
|
||||||
from functools import cached_property
|
|
||||||
from importlib.util import find_spec
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import peft
|
|
||||||
import torch
|
|
||||||
import transformers
|
|
||||||
import transformers.modeling_utils
|
|
||||||
from accelerate import init_empty_weights
|
|
||||||
from peft import PeftConfig, PeftMixedModel, PeftModel, prepare_model_for_kbit_training
|
|
||||||
from transformers import (
|
|
||||||
AutoModelForCausalLM,
|
|
||||||
AutoModelForVision2Seq,
|
|
||||||
AwqConfig,
|
|
||||||
BitsAndBytesConfig,
|
|
||||||
GPTQConfig,
|
|
||||||
PreTrainedModel,
|
|
||||||
PreTrainedTokenizerBase,
|
|
||||||
)
|
|
||||||
from transformers.integrations.deepspeed import (
|
|
||||||
HfTrainerDeepSpeedConfig,
|
|
||||||
is_deepspeed_zero3_enabled,
|
|
||||||
)
|
|
||||||
|
|
||||||
from axolotl.common.architectures import MOE_ARCH_BLOCK
|
|
||||||
from axolotl.integrations.base import PluginManager
|
|
||||||
from axolotl.loaders.adapter import load_adapter, load_lora
|
|
||||||
from axolotl.loaders.constants import MULTIMODAL_AUTO_MODEL_MAPPING
|
|
||||||
from axolotl.loaders.patch_manager import PatchManager
|
|
||||||
from axolotl.loaders.utils import (
|
|
||||||
get_linear_embedding_layers,
|
|
||||||
get_module_class_from_name,
|
|
||||||
load_model_config,
|
|
||||||
)
|
|
||||||
from axolotl.models.mamba import fix_mamba_attn_for_loss
|
|
||||||
from axolotl.utils.bench import log_gpu_memory_usage
|
|
||||||
from axolotl.utils.dict import DictDefault
|
|
||||||
from axolotl.utils.distributed import (
|
|
||||||
get_device_count,
|
|
||||||
get_device_type,
|
|
||||||
)
|
|
||||||
from axolotl.utils.model_shard_quant import load_sharded_model_quant
|
|
||||||
from axolotl.utils.schemas.enums import RLType
|
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
|
||||||
PLUGIN_MANAGER = PluginManager.get_instance()
|
|
||||||
|
|
||||||
|
|
||||||
class ModelLoader:
|
|
||||||
"""Manages model configuration, initialization and application of patches during
|
|
||||||
model loading.
|
|
||||||
|
|
||||||
This class orchestrates the entire process of loading a model from configuration to
|
|
||||||
final preparation. It handles device mapping, quantization, attention mechanisms,
|
|
||||||
adapter integration, and various optimizations.
|
|
||||||
|
|
||||||
The loading process includes:
|
|
||||||
- Loading and validating model configuration
|
|
||||||
- Applying monkey patches for optimizations / fixes
|
|
||||||
- Setting up device mapping (including multi-GPU configurations)
|
|
||||||
- Configuring quantization
|
|
||||||
- Setting attention mechanisms (Flash Attention, SDPA, etc.)
|
|
||||||
- Loading and initializing the model
|
|
||||||
- Applying adapters (LoRA, QLoRA, etc.)
|
|
||||||
|
|
||||||
Attributes:
|
|
||||||
model: The loaded model instance (available after load() is called).
|
|
||||||
model_kwargs: Dictionary of keyword arguments passed to model initialization.
|
|
||||||
base_model: Name or path of the base model to load.
|
|
||||||
model_type: Type of model to load (e.g., `AutoModelForCausalLM`).
|
|
||||||
model_config: Configuration object for the model.
|
|
||||||
auto_model_loader: class used for loading the model (default:
|
|
||||||
`AutoModelForCausalLM`).
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
cfg: DictDefault,
|
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
|
||||||
*,
|
|
||||||
inference: bool = False,
|
|
||||||
reference_model: bool = False,
|
|
||||||
**kwargs, # pylint: disable=unused-argument
|
|
||||||
):
|
|
||||||
"""Initializes the ModelLoader.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: Configuration dictionary with model and training settings.
|
|
||||||
tokenizer: Tokenizer instance associated with the model.
|
|
||||||
processor: Optional processor for multimodal models. Defaults to None.
|
|
||||||
inference: Whether the model is being loaded for inference mode. Defaults
|
|
||||||
to False.
|
|
||||||
reference_model: Whether this is a reference model (used in setups like DPO
|
|
||||||
training). Defaults to False.
|
|
||||||
**kwargs: Additional keyword arguments (ignored).
|
|
||||||
"""
|
|
||||||
self.cfg = cfg
|
|
||||||
self.tokenizer = tokenizer
|
|
||||||
self.inference: bool = inference
|
|
||||||
self.reference_model: bool = reference_model
|
|
||||||
|
|
||||||
# Init model kwargs
|
|
||||||
self.model_kwargs: dict[str, Any] = {}
|
|
||||||
if cfg.overrides_of_model_kwargs:
|
|
||||||
for key, val in cfg.overrides_of_model_kwargs.items():
|
|
||||||
self.model_kwargs[key] = val
|
|
||||||
|
|
||||||
# Init model
|
|
||||||
self.model: PreTrainedModel | PeftModel | PeftMixedModel
|
|
||||||
self.base_model = cfg.base_model
|
|
||||||
self.model_type = cfg.type_of_model
|
|
||||||
|
|
||||||
# Init model config
|
|
||||||
self.model_config = load_model_config(cfg)
|
|
||||||
self.auto_model_loader = AutoModelForCausalLM # pylint: disable=invalid-name
|
|
||||||
|
|
||||||
# Initialize the patch manager
|
|
||||||
self.patch_manager = PatchManager(
|
|
||||||
cfg=cfg,
|
|
||||||
model_config=self.model_config,
|
|
||||||
inference=inference,
|
|
||||||
)
|
|
||||||
|
|
||||||
@cached_property
|
|
||||||
def has_flash_attn(self) -> bool:
|
|
||||||
"""Check if flash attention is installed."""
|
|
||||||
return find_spec("flash_attn") is not None
|
|
||||||
|
|
||||||
@cached_property
|
|
||||||
def qlora_fsdp(self):
|
|
||||||
"""Property that determines if FSDP with QLoRA is enabled."""
|
|
||||||
return self.cfg.fsdp and self.cfg.adapter == "qlora"
|
|
||||||
|
|
||||||
def load(self) -> tuple[PreTrainedModel, PeftConfig | None]:
|
|
||||||
"""Load and prepare the model with all configurations and patches.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
A tuple with the loaded model and its LoRA configuration (if applicable).
|
|
||||||
"""
|
|
||||||
# Initial setup and patches
|
|
||||||
self.patch_manager.apply_pre_model_load_patches()
|
|
||||||
self._apply_pre_model_load_setup()
|
|
||||||
|
|
||||||
# Build the model
|
|
||||||
PLUGIN_MANAGER.pre_model_load(self.cfg)
|
|
||||||
skip_move_to_device = self._build_model()
|
|
||||||
PLUGIN_MANAGER.post_model_build(self.cfg, self.model)
|
|
||||||
|
|
||||||
# Post-build model configuration
|
|
||||||
self._apply_post_model_load_setup()
|
|
||||||
|
|
||||||
# Load adapters (LoRA, etc.)
|
|
||||||
PLUGIN_MANAGER.pre_lora_load(self.cfg, self.model)
|
|
||||||
lora_config = self._load_adapters()
|
|
||||||
PLUGIN_MANAGER.post_lora_load(self.cfg, self.model)
|
|
||||||
|
|
||||||
# Apply remaining patches and finalize
|
|
||||||
self._apply_post_lora_load_setup(skip_move_to_device)
|
|
||||||
self.patch_manager.apply_post_model_load_patches(self.model)
|
|
||||||
PLUGIN_MANAGER.post_model_load(self.cfg, self.model)
|
|
||||||
|
|
||||||
return self.model, lora_config
|
|
||||||
|
|
||||||
def _apply_pre_model_load_setup(self):
|
|
||||||
"""Apply patches and setup configurations before model loading."""
|
|
||||||
self._set_auto_model_loader()
|
|
||||||
self._set_device_map_config()
|
|
||||||
if self.cfg.revision_of_model:
|
|
||||||
self.model_kwargs["revision"] = self.cfg.revision_of_model
|
|
||||||
self._set_quantization_config()
|
|
||||||
self._set_attention_config()
|
|
||||||
|
|
||||||
def _apply_post_model_load_setup(self):
|
|
||||||
"""Configure the model after it has been loaded."""
|
|
||||||
# Handle PeftModel if needed
|
|
||||||
if (
|
|
||||||
isinstance(self.model, (peft.PeftModel, peft.PeftModelForCausalLM))
|
|
||||||
and not self.qlora_fsdp
|
|
||||||
):
|
|
||||||
self.model = self.model.merge_and_unload()
|
|
||||||
|
|
||||||
self._resize_token_embeddings()
|
|
||||||
self._adjust_model_config()
|
|
||||||
self._log_memory_usage()
|
|
||||||
self._configure_embedding_dtypes()
|
|
||||||
|
|
||||||
def _resize_token_embeddings(self):
|
|
||||||
"""Resize token embeddings if needed."""
|
|
||||||
embeddings_len = (
|
|
||||||
math.ceil(len(self.tokenizer) / 32) * 32
|
|
||||||
if self.cfg.resize_token_embeddings_to_32x
|
|
||||||
else len(self.tokenizer)
|
|
||||||
)
|
|
||||||
if hasattr(self.model, "get_input_embeddings") and (
|
|
||||||
self.model.get_input_embeddings().num_embeddings < embeddings_len
|
|
||||||
or (
|
|
||||||
self.model.get_input_embeddings().num_embeddings > embeddings_len
|
|
||||||
and self.cfg.shrink_embeddings
|
|
||||||
)
|
|
||||||
):
|
|
||||||
resize_kwargs = {}
|
|
||||||
if self.cfg.mean_resizing_embeddings is not None and (
|
|
||||||
self.model_config.model_type != "llava"
|
|
||||||
):
|
|
||||||
resize_kwargs["mean_resizing"] = self.cfg.mean_resizing_embeddings
|
|
||||||
self.model.resize_token_embeddings(embeddings_len, **resize_kwargs)
|
|
||||||
else:
|
|
||||||
self.model.tie_weights()
|
|
||||||
|
|
||||||
def _adjust_model_config(self):
|
|
||||||
if (
|
|
||||||
hasattr(self.model, "config")
|
|
||||||
and hasattr(self.model.config, "max_position_embeddings")
|
|
||||||
and self.model.config.max_position_embeddings
|
|
||||||
and self.cfg.sequence_len > self.model.config.max_position_embeddings
|
|
||||||
):
|
|
||||||
LOG.warning(
|
|
||||||
"increasing model.config.max_position_embeddings from "
|
|
||||||
f"{self.model.config.max_position_embeddings} to {self.cfg.sequence_len}"
|
|
||||||
)
|
|
||||||
self.model.config.max_position_embeddings = self.cfg.sequence_len
|
|
||||||
|
|
||||||
if (
|
|
||||||
hasattr(self.model, "config")
|
|
||||||
and hasattr(self.model.config, "bos_token_id")
|
|
||||||
and self.model.config.bos_token_id
|
|
||||||
and self.model.config.bos_token_id != self.tokenizer.bos_token_id
|
|
||||||
):
|
|
||||||
self.model.config.bos_token_id = self.tokenizer.bos_token_id
|
|
||||||
|
|
||||||
if (
|
|
||||||
hasattr(self.model, "config")
|
|
||||||
and hasattr(self.model.config, "eos_token_id")
|
|
||||||
and self.model.config.eos_token_id
|
|
||||||
and self.model.config.eos_token_id != self.tokenizer.eos_token_id
|
|
||||||
):
|
|
||||||
self.model.config.eos_token_id = self.tokenizer.eos_token_id
|
|
||||||
|
|
||||||
def _log_memory_usage(self):
|
|
||||||
"""Log device memory usage after model load."""
|
|
||||||
if hasattr(self.model, "device") and self.model.device.type in (
|
|
||||||
"cuda",
|
|
||||||
"mps",
|
|
||||||
"npu",
|
|
||||||
):
|
|
||||||
log_gpu_memory_usage(LOG, "after model load", self.model.device)
|
|
||||||
|
|
||||||
def _configure_embedding_dtypes(self):
|
|
||||||
"""Configure embedding module dtypes."""
|
|
||||||
# Get embedding modules
|
|
||||||
embedding_modules = get_linear_embedding_layers(self.cfg.model_config_type)
|
|
||||||
|
|
||||||
# Initial dtype conversion
|
|
||||||
if not self.cfg.fsdp:
|
|
||||||
# We don't run this during FSDP because this will leave mixed and bfloat16
|
|
||||||
# dtypes in the model which FSDP doesn't like
|
|
||||||
if self.cfg.load_in_4bit and self.cfg.embeddings_skip_upcast:
|
|
||||||
embedding_modules = []
|
|
||||||
self._convert_embedding_modules_dtype(
|
|
||||||
embedding_modules,
|
|
||||||
dist_dtype=torch.float32,
|
|
||||||
before_kbit_train_or_finetune=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Handle DeepSpeed Zero3
|
|
||||||
if is_deepspeed_zero3_enabled():
|
|
||||||
self._set_z3_leaf_modules()
|
|
||||||
|
|
||||||
# Apply gradient checkpointing if needed
|
|
||||||
needs_fa2_dtype = self.cfg.adapter or self.cfg.fsdp
|
|
||||||
if self.cfg.adapter in ["lora", "qlora"]:
|
|
||||||
needs_fa2_dtype = True
|
|
||||||
if self.cfg.gradient_checkpointing:
|
|
||||||
self.model.gradient_checkpointing_enable(
|
|
||||||
gradient_checkpointing_kwargs=self.cfg.gradient_checkpointing_kwargs
|
|
||||||
)
|
|
||||||
|
|
||||||
self._prepare_model_for_quantization()
|
|
||||||
|
|
||||||
# Convert dtypes if needed
|
|
||||||
should_convert = (
|
|
||||||
# LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so
|
|
||||||
# we need to convert them back to fp16/bf16 for flash-attn compatibility.
|
|
||||||
(
|
|
||||||
(needs_fa2_dtype or self.cfg.flash_attention or self.cfg.flex_attention)
|
|
||||||
and not self.qlora_fsdp
|
|
||||||
)
|
|
||||||
# CCE requires embedding layers to be in fp16/bf16 for backward pass
|
|
||||||
or self.cfg.cut_cross_entropy
|
|
||||||
)
|
|
||||||
|
|
||||||
if should_convert:
|
|
||||||
LOG.info("Converting modules to %s", self.cfg.torch_dtype)
|
|
||||||
self._convert_embedding_modules_dtype(
|
|
||||||
embedding_modules=embedding_modules,
|
|
||||||
dist_dtype=self.cfg.torch_dtype,
|
|
||||||
before_kbit_train_or_finetune=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _load_adapters(self) -> PeftConfig | None:
|
|
||||||
"""Load LoRA or other adapters."""
|
|
||||||
# Load LoRA or adapter
|
|
||||||
lora_config = None
|
|
||||||
if not self.reference_model or self.cfg.lora_model_dir:
|
|
||||||
# If we're not loading the reference model, then we're loading the model
|
|
||||||
# for training. Then, the DPO trainer doesn't want the PEFT model loaded
|
|
||||||
# over it, it just wants the LoRA / PEFT config.
|
|
||||||
if (
|
|
||||||
self.cfg.adapter
|
|
||||||
and self.cfg.rl in [RLType.DPO, RLType.IPO, RLType.KTO]
|
|
||||||
and not self.cfg.merge_lora
|
|
||||||
):
|
|
||||||
_, lora_config = load_lora(
|
|
||||||
self.model, self.cfg, inference=False, config_only=True
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
self.model, lora_config = load_adapter(
|
|
||||||
self.model, self.cfg, self.cfg.adapter
|
|
||||||
)
|
|
||||||
|
|
||||||
return lora_config
|
|
||||||
|
|
||||||
def _apply_post_lora_load_setup(self, skip_move_to_device: bool):
|
|
||||||
"""Apply final optimizations and patches."""
|
|
||||||
# Place model on accelerator
|
|
||||||
if (
|
|
||||||
self.cfg.ddp
|
|
||||||
and not self.cfg.load_in_8bit
|
|
||||||
and not (self.cfg.rl and self.cfg.load_in_4bit)
|
|
||||||
and not skip_move_to_device
|
|
||||||
):
|
|
||||||
# TODO: validate this conditional
|
|
||||||
self.model.to(f"{str(get_device_type())}:{self.cfg.local_rank}")
|
|
||||||
|
|
||||||
if get_device_count() > 1 and int(os.getenv("WORLD_SIZE", "1")) == 1:
|
|
||||||
self.model.is_parallelizable = True
|
|
||||||
self.model.model_parallel = True
|
|
||||||
|
|
||||||
if not any(
|
|
||||||
param.requires_grad
|
|
||||||
for _, param in self.model.named_parameters(recurse=True)
|
|
||||||
):
|
|
||||||
LOG.warning("There are no parameters that require gradient updates")
|
|
||||||
|
|
||||||
if self.cfg.flash_optimum:
|
|
||||||
from optimum.bettertransformer import BetterTransformer
|
|
||||||
|
|
||||||
self.model = BetterTransformer.transform(self.model)
|
|
||||||
|
|
||||||
if self.cfg.adapter is not None:
|
|
||||||
log_gpu_memory_usage(LOG, "after adapters", self.model.device)
|
|
||||||
|
|
||||||
for _ in range(3):
|
|
||||||
gc.collect()
|
|
||||||
torch.cuda.empty_cache()
|
|
||||||
|
|
||||||
def _set_auto_model_loader(self):
|
|
||||||
"""Set `self.auto_model_loader`. Defaults to `transformers.AutoModelForCausalLM`
|
|
||||||
(set at `__init__`). When using a multimodal model, `self.auto_model_loader`
|
|
||||||
should be set according to the type of the model.
|
|
||||||
"""
|
|
||||||
if self.cfg.is_multimodal:
|
|
||||||
self.auto_model_loader = MULTIMODAL_AUTO_MODEL_MAPPING.get(
|
|
||||||
self.model_config.model_type, AutoModelForVision2Seq
|
|
||||||
)
|
|
||||||
|
|
||||||
def _set_device_map_config(self):
|
|
||||||
"""Setup `device_map` according to config"""
|
|
||||||
device_map = self.cfg.device_map
|
|
||||||
max_memory = self.cfg.max_memory
|
|
||||||
|
|
||||||
if self.cfg.gpu_memory_limit:
|
|
||||||
gpu_memory_limit = (
|
|
||||||
str(self.cfg.gpu_memory_limit) + "GiB"
|
|
||||||
if isinstance(self.cfg.gpu_memory_limit, int)
|
|
||||||
else self.cfg.gpu_memory_limit
|
|
||||||
)
|
|
||||||
|
|
||||||
max_memory = {}
|
|
||||||
num_device = get_device_count()
|
|
||||||
for i in range(num_device):
|
|
||||||
max_memory[i] = gpu_memory_limit
|
|
||||||
max_memory["cpu"] = "256GiB" # something sufficiently large to fit anything
|
|
||||||
|
|
||||||
if max_memory is not None:
|
|
||||||
# Based on https://github.com/togethercomputer/OpenChatKit/blob/main/inference/bot.py
|
|
||||||
from accelerate import infer_auto_device_map
|
|
||||||
|
|
||||||
with init_empty_weights():
|
|
||||||
model_canvas = self.auto_model_loader.from_config(
|
|
||||||
self.model_config,
|
|
||||||
trust_remote_code=self.cfg.trust_remote_code or False,
|
|
||||||
)
|
|
||||||
model_canvas.tie_weights()
|
|
||||||
device_map = infer_auto_device_map(
|
|
||||||
model_canvas,
|
|
||||||
max_memory=max_memory,
|
|
||||||
dtype=self.cfg.torch_dtype,
|
|
||||||
)
|
|
||||||
# We can discard max_memory now as we have a device map set up
|
|
||||||
max_memory = None
|
|
||||||
|
|
||||||
self.model_kwargs["torch_dtype"] = self.cfg.torch_dtype
|
|
||||||
|
|
||||||
if not is_deepspeed_zero3_enabled():
|
|
||||||
self.model_kwargs["device_map"] = device_map
|
|
||||||
|
|
||||||
cur_device = get_device_type()
|
|
||||||
if "mps" in str(cur_device):
|
|
||||||
self.model_kwargs["device_map"] = "mps:0"
|
|
||||||
elif "npu" in str(cur_device):
|
|
||||||
self.model_kwargs["device_map"] = "npu:0"
|
|
||||||
|
|
||||||
# TODO: can we put the reference model on it's own gpu? I think we have to move
|
|
||||||
# logits around to calculate loss
|
|
||||||
# if cfg.rl:
|
|
||||||
# if torch.cuda.device_count() > 1:
|
|
||||||
# if reference_model:
|
|
||||||
# model_kwargs["device_map"] = "cuda:" + str(
|
|
||||||
# torch.cuda.current_device() + 1
|
|
||||||
# )
|
|
||||||
# else:
|
|
||||||
# model_kwargs["device_map"] = "cuda:" + str(torch.cuda.current_device())
|
|
||||||
|
|
||||||
def _set_quantization_config(self):
|
|
||||||
"""Set up quantization config (bitsandbytes, awq, gptq, etc.)"""
|
|
||||||
self.model_kwargs["load_in_8bit"] = self.cfg.load_in_8bit
|
|
||||||
self.model_kwargs["load_in_4bit"] = self.cfg.load_in_4bit
|
|
||||||
|
|
||||||
if self.cfg.gptq:
|
|
||||||
if not hasattr(self.model_config, "quantization_config"):
|
|
||||||
LOG.warning(
|
|
||||||
"model config does not contain quantization_config information"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
if self.cfg.gptq_disable_exllama is not None:
|
|
||||||
self.model_config.quantization_config["disable_exllama"] = (
|
|
||||||
self.cfg.gptq_disable_exllama
|
|
||||||
)
|
|
||||||
self.model_kwargs["quantization_config"] = GPTQConfig(
|
|
||||||
**self.model_config.quantization_config
|
|
||||||
)
|
|
||||||
if (
|
|
||||||
self.cfg.adapter in ["qlora", "lora"]
|
|
||||||
and hasattr(self.model_config, "quantization_config")
|
|
||||||
and self.model_config.quantization_config["quant_method"]
|
|
||||||
in ["gptq", "awq", "bitsandbytes"]
|
|
||||||
):
|
|
||||||
if self.model_config.quantization_config["quant_method"] == "gptq":
|
|
||||||
self.model_kwargs["quantization_config"] = GPTQConfig(
|
|
||||||
**self.model_config.quantization_config
|
|
||||||
)
|
|
||||||
elif self.model_config.quantization_config["quant_method"] == "awq":
|
|
||||||
self.model_kwargs["quantization_config"] = AwqConfig(
|
|
||||||
**self.model_config.quantization_config
|
|
||||||
)
|
|
||||||
elif (
|
|
||||||
self.model_config.quantization_config["quant_method"] == "bitsandbytes"
|
|
||||||
):
|
|
||||||
self.model_kwargs["quantization_config"] = BitsAndBytesConfig(
|
|
||||||
**self.model_config.quantization_config
|
|
||||||
)
|
|
||||||
elif self.cfg.adapter == "qlora" and self.model_kwargs["load_in_4bit"]:
|
|
||||||
bnb_config = {
|
|
||||||
"load_in_4bit": True,
|
|
||||||
"llm_int8_threshold": 6.0,
|
|
||||||
"llm_int8_has_fp16_weight": False,
|
|
||||||
"bnb_4bit_compute_dtype": self.cfg.torch_dtype,
|
|
||||||
"bnb_4bit_use_double_quant": True,
|
|
||||||
"bnb_4bit_quant_type": "nf4",
|
|
||||||
"bnb_4bit_quant_storage": torch.bfloat16,
|
|
||||||
}
|
|
||||||
if self.cfg.model_config_type in ["jamba", "qwen2_moe"] and not (
|
|
||||||
self.cfg.deepspeed or self.cfg.fsdp
|
|
||||||
):
|
|
||||||
# for some reason, this causes the loss to be off by an order of magnitude
|
|
||||||
# but deepspeed needs this still in bfloat16
|
|
||||||
bnb_config["bnb_4bit_quant_storage"] = torch.float32
|
|
||||||
|
|
||||||
if self.cfg.bnb_config_kwargs:
|
|
||||||
bnb_config.update(self.cfg.bnb_config_kwargs)
|
|
||||||
|
|
||||||
self.model_kwargs["quantization_config"] = BitsAndBytesConfig(
|
|
||||||
**bnb_config,
|
|
||||||
)
|
|
||||||
elif self.cfg.adapter == "lora" and self.model_kwargs["load_in_8bit"]:
|
|
||||||
bnb_config = {
|
|
||||||
"load_in_8bit": True,
|
|
||||||
}
|
|
||||||
# Exclude mamba blocks from int8 quantization for jamba
|
|
||||||
if self.cfg.model_config_type == "jamba":
|
|
||||||
bnb_config["llm_int8_skip_modules"] = ["mamba"]
|
|
||||||
self.model_kwargs["quantization_config"] = BitsAndBytesConfig(
|
|
||||||
**bnb_config,
|
|
||||||
)
|
|
||||||
|
|
||||||
# no longer needed per https://github.com/huggingface/transformers/pull/26610
|
|
||||||
if "quantization_config" in self.model_kwargs or self.cfg.gptq:
|
|
||||||
self.model_kwargs.pop("load_in_8bit", None)
|
|
||||||
self.model_kwargs.pop("load_in_4bit", None)
|
|
||||||
|
|
||||||
def _set_attention_config(self):
|
|
||||||
"""Sample packing uses custom FA2 patch"""
|
|
||||||
if self.cfg.flex_attention:
|
|
||||||
self.model_kwargs["attn_implementation"] = "flex_attention"
|
|
||||||
self.model_config._attn_implementation = ( # pylint: disable=protected-access
|
|
||||||
"flex_attention"
|
|
||||||
)
|
|
||||||
|
|
||||||
elif self.cfg.flash_attention:
|
|
||||||
if not self.cfg.sample_packing and self.cfg.s2_attention:
|
|
||||||
pass
|
|
||||||
self.model_kwargs["attn_implementation"] = "flash_attention_2"
|
|
||||||
self.model_config._attn_implementation = ( # pylint: disable=protected-access
|
|
||||||
"flash_attention_2"
|
|
||||||
)
|
|
||||||
elif self.cfg.sdp_attention:
|
|
||||||
self.model_kwargs["attn_implementation"] = "sdpa"
|
|
||||||
self.model_config._attn_implementation = ( # pylint: disable=protected-access
|
|
||||||
"sdpa"
|
|
||||||
)
|
|
||||||
elif self.cfg.eager_attention:
|
|
||||||
self.model_kwargs["attn_implementation"] = "eager"
|
|
||||||
self.model_config._attn_implementation = ( # pylint: disable=protected-access
|
|
||||||
"eager"
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.cfg.low_cpu_mem_usage:
|
|
||||||
self.model_kwargs["low_cpu_mem_usage"] = True
|
|
||||||
|
|
||||||
def _configure_zero3_memory_efficient_loading(self):
|
|
||||||
"""Set the deepspeed config to load the model into RAM first before moving
|
|
||||||
to VRAM.
|
|
||||||
|
|
||||||
We need to return `hf_ds_cfg` as it needs to exist before model loading.
|
|
||||||
"""
|
|
||||||
hf_ds_cfg = None
|
|
||||||
|
|
||||||
if os.getenv("ACCELERATE_DEEPSPEED_ZERO_STAGE") == "3":
|
|
||||||
hf_ds_cfg = HfTrainerDeepSpeedConfig(self.cfg.deepspeed)
|
|
||||||
hf_ds_cfg.fill_match(
|
|
||||||
"train_micro_batch_size_per_gpu", self.cfg.micro_batch_size
|
|
||||||
)
|
|
||||||
hf_ds_cfg.fill_match(
|
|
||||||
"gradient_accumulation_steps", self.cfg.gradient_accumulation_steps
|
|
||||||
)
|
|
||||||
hf_ds_cfg.fill_match(
|
|
||||||
"train_batch_size",
|
|
||||||
int(os.getenv("WORLD_SIZE", "1"))
|
|
||||||
* self.cfg.micro_batch_size
|
|
||||||
* self.cfg.gradient_accumulation_steps,
|
|
||||||
)
|
|
||||||
if "device_map" in self.model_kwargs:
|
|
||||||
del self.model_kwargs["device_map"]
|
|
||||||
|
|
||||||
transformers.modeling_utils.is_deepspeed_zero3_enabled = lambda: True
|
|
||||||
transformers.integrations.deepspeed.is_deepspeed_zero3_enabled = (
|
|
||||||
lambda: True
|
|
||||||
)
|
|
||||||
|
|
||||||
return hf_ds_cfg
|
|
||||||
|
|
||||||
def _build_model(self) -> bool:
|
|
||||||
"""Load model, with load strategy depending on config."""
|
|
||||||
skip_move_to_device = False
|
|
||||||
if (
|
|
||||||
self.qlora_fsdp
|
|
||||||
and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading
|
|
||||||
and (
|
|
||||||
self.cfg.model_config_type == "dbrx"
|
|
||||||
or self.cfg.qlora_sharded_model_loading
|
|
||||||
)
|
|
||||||
):
|
|
||||||
quant_storage = self.cfg.torch_dtype
|
|
||||||
quantization_config = getattr(
|
|
||||||
self.model_config, "quantization_config", None
|
|
||||||
)
|
|
||||||
quantization_config = (
|
|
||||||
quantization_config or self.model_kwargs["quantization_config"]
|
|
||||||
)
|
|
||||||
self.model = load_sharded_model_quant(
|
|
||||||
self.base_model,
|
|
||||||
self.model_config,
|
|
||||||
self.cfg,
|
|
||||||
quant_storage=quant_storage,
|
|
||||||
quantization_config=quantization_config,
|
|
||||||
)
|
|
||||||
skip_move_to_device = True
|
|
||||||
elif (
|
|
||||||
self.model_config.model_type in ["llama", "llama4"]
|
|
||||||
and not self.cfg.trust_remote_code
|
|
||||||
and not self.cfg.gptq
|
|
||||||
):
|
|
||||||
# TODO: Do we need to open this up for all models?
|
|
||||||
if self.cfg.fsdp and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading:
|
|
||||||
skip_move_to_device = True
|
|
||||||
if "device_map" in self.model_kwargs:
|
|
||||||
del self.model_kwargs["device_map"]
|
|
||||||
|
|
||||||
self._configure_zero3_memory_efficient_loading()
|
|
||||||
|
|
||||||
# Load model with random initialization if specified
|
|
||||||
if self.cfg.random_init_weights:
|
|
||||||
# AutoModel classes support the from_config method
|
|
||||||
if self.auto_model_loader in [
|
|
||||||
AutoModelForCausalLM,
|
|
||||||
AutoModelForVision2Seq,
|
|
||||||
]:
|
|
||||||
self.model = self.auto_model_loader.from_config(
|
|
||||||
config=self.model_config,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
self.model = self.auto_model_loader(config=self.model_config)
|
|
||||||
else:
|
|
||||||
self.model = self.auto_model_loader.from_pretrained(
|
|
||||||
self.base_model,
|
|
||||||
config=self.model_config,
|
|
||||||
**self.model_kwargs,
|
|
||||||
)
|
|
||||||
elif self.model_type == "MambaLMHeadModel":
|
|
||||||
# FIXME this is janky at best and hacked together to make it work
|
|
||||||
MambaLMHeadModel = fix_mamba_attn_for_loss() # pylint: disable=invalid-name
|
|
||||||
|
|
||||||
self.model_kwargs["dtype"] = self.model_kwargs["torch_dtype"]
|
|
||||||
self.model_kwargs["device"] = torch.cuda.current_device()
|
|
||||||
self.model_kwargs.pop("torch_dtype", None)
|
|
||||||
self.model_kwargs.pop("device_map", None)
|
|
||||||
|
|
||||||
self.model = MambaLMHeadModel.from_pretrained(
|
|
||||||
self.base_model,
|
|
||||||
**self.model_kwargs,
|
|
||||||
)
|
|
||||||
elif (
|
|
||||||
self.model_type
|
|
||||||
and self.model_type != "AutoModelForCausalLM"
|
|
||||||
and not self.cfg.trust_remote_code
|
|
||||||
):
|
|
||||||
if self.cfg.gptq:
|
|
||||||
self.model = self.auto_model_loader.from_pretrained(
|
|
||||||
self.base_model,
|
|
||||||
config=self.model_config,
|
|
||||||
trust_remote_code=self.cfg.trust_remote_code or False,
|
|
||||||
**self.model_kwargs,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
self.model = getattr(transformers, self.model_type).from_pretrained(
|
|
||||||
self.base_model,
|
|
||||||
config=self.model_config,
|
|
||||||
trust_remote_code=self.cfg.trust_remote_code or False,
|
|
||||||
**self.model_kwargs,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
if self.cfg.gptq:
|
|
||||||
self.model = self.auto_model_loader.from_pretrained(
|
|
||||||
self.base_model,
|
|
||||||
config=self.model_config,
|
|
||||||
trust_remote_code=self.cfg.trust_remote_code or False,
|
|
||||||
**self.model_kwargs,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
if (
|
|
||||||
self.cfg.fsdp
|
|
||||||
and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading
|
|
||||||
):
|
|
||||||
# disabling either of these two still leads to VRAM spike before setting back down
|
|
||||||
skip_move_to_device = True
|
|
||||||
if "device_map" in self.model_kwargs:
|
|
||||||
del self.model_kwargs["device_map"]
|
|
||||||
|
|
||||||
self._configure_zero3_memory_efficient_loading()
|
|
||||||
|
|
||||||
self.model = self.auto_model_loader.from_pretrained(
|
|
||||||
self.base_model,
|
|
||||||
config=self.model_config,
|
|
||||||
trust_remote_code=self.cfg.trust_remote_code or False,
|
|
||||||
**self.model_kwargs,
|
|
||||||
)
|
|
||||||
if is_deepspeed_zero3_enabled():
|
|
||||||
skip_move_to_device = True
|
|
||||||
|
|
||||||
return skip_move_to_device
|
|
||||||
|
|
||||||
    def _set_z3_leaf_modules(self):
        from deepspeed.utils import set_z3_leaf_modules

        if self.cfg.model_config_type in MOE_ARCH_BLOCK:
            moe_blocks = MOE_ARCH_BLOCK[self.cfg.model_config_type]
            moe_blocks = [moe_blocks] if isinstance(moe_blocks, str) else moe_blocks
            set_z3_leaf_modules(
                self.model,
                [
                    get_module_class_from_name(self.model, module_name)
                    for module_name in moe_blocks
                ],
            )
|
|
||||||
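For context, a minimal standalone sketch of the same ZeRO-3 leaf-module registration; the mapping below is illustrative (only a Mixtral-style entry is shown) and the helper name is hypothetical:

# Illustrative sketch only: mark MoE expert blocks as ZeRO-3 "leaf" modules so
# DeepSpeed does not partition parameters inside them (the mapping is an example).
from deepspeed.utils import set_z3_leaf_modules

EXAMPLE_MOE_BLOCK = {"mixtral": "MixtralSparseMoeBlock"}  # hypothetical subset


def mark_moe_leaf_modules(model, model_type: str) -> None:
    block_name = EXAMPLE_MOE_BLOCK.get(model_type)
    if block_name is None:
        return
    # Resolve module classes by name, then register them with DeepSpeed.
    classes = {
        module.__class__
        for module in model.modules()
        if module.__class__.__name__ == block_name
    }
    set_z3_leaf_modules(model, list(classes))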
|
|
||||||
    def _prepare_model_for_quantization(self):
        """Prepare loaded model for quantization."""
        skip_prepare_model_for_kbit_training = False
        if self.cfg.model_config_type == "qwen" and self.cfg.adapter == "lora":
            # Qwen doesn't play nicely with LoRA if this is enabled
            skip_prepare_model_for_kbit_training = True

        loftq_bits = (
            self.cfg.peft
            and self.cfg.peft.loftq_config
            and self.cfg.peft.loftq_config.loftq_bits
        )
        if self.cfg.adapter == "lora" and loftq_bits:
            skip_prepare_model_for_kbit_training = True

        if (
            self.qlora_fsdp
            or (self.cfg.fsdp and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading)
            or is_deepspeed_zero3_enabled()
        ):
            # Make sure everything is in the same dtype
            skip_prepare_model_for_kbit_training = True

        if (
            not skip_prepare_model_for_kbit_training
            and self.cfg.adapter in ["lora", "qlora"]
            and (self.cfg.load_in_8bit or self.cfg.load_in_4bit)
        ):
            LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
            self.model = prepare_model_for_kbit_training(
                self.model, use_gradient_checkpointing=self.cfg.gradient_checkpointing
            )
|
|
||||||
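For reference, a minimal sketch of the same k-bit preparation step outside the loader, assuming a 4-bit bitsandbytes base model (the model name is an example):

# Illustrative sketch: load a 4-bit quantized base model and prepare it for
# LoRA/QLoRA training, mirroring the prepare_model_for_kbit_training call above.
import torch
from peft import prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    "NousResearch/Llama-2-7b-hf",  # example base model
    quantization_config=bnb_config,
)
# Upcasts norms/embeddings and enables input grads so adapters can train.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)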
|
|
||||||
    def _convert_embedding_modules_dtype(
        self,
        embedding_modules: list[str],
        dist_dtype: torch.dtype,
        before_kbit_train_or_finetune: bool,
    ):
        for name, module in self.model.named_modules():
            if "norm" in name:
                module.to(dist_dtype)
            if before_kbit_train_or_finetune:
                if name.endswith(".gate"):
                    module.to(dist_dtype)
                if self.model_config.model_type == "btlm":
                    # don't upcast lm_head for btlm
                    continue
            if any(m in name for m in embedding_modules) and hasattr(module, "weight"):
                module.to(dist_dtype)
|
|
||||||
@@ -1,380 +0,0 @@
|
|||||||
"""Patch manager class implementation to complement `axolotl.loaders.ModelLoader`.
|
|
||||||
|
|
||||||
Applies pre- and post-model load patches for various fixes and optimizations.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import importlib.util
|
|
||||||
import logging
|
|
||||||
from functools import cached_property
|
|
||||||
|
|
||||||
import addict
|
|
||||||
import transformers
|
|
||||||
from transformers import PretrainedConfig, PreTrainedModel
|
|
||||||
|
|
||||||
from axolotl.integrations.base import PluginManager
|
|
||||||
from axolotl.monkeypatch.multipack import (
|
|
||||||
SUPPORTED_MULTIPACK_MODEL_TYPES,
|
|
||||||
patch_for_multipack,
|
|
||||||
)
|
|
||||||
from axolotl.utils.dict import DictDefault
|
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
|
||||||
PLUGIN_MANAGER = PluginManager.get_instance()
|
|
||||||
|
|
||||||
|
|
||||||
class PatchManager:
|
|
||||||
"""Manages the application of patches during the model loading process."""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
cfg: DictDefault,
|
|
||||||
model_config: PretrainedConfig | addict.Dict,
|
|
||||||
inference: bool = False,
|
|
||||||
):
|
|
||||||
"""Initialize the `PatchManager`.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: Configuration dictionary with model and training settings.
|
|
||||||
model_config: Configuration object for the model.
|
|
||||||
inference: Whether the model is being loaded for inference mode.
|
|
||||||
"""
|
|
||||||
self.cfg = cfg
|
|
||||||
self.model_config = model_config
|
|
||||||
self.inference = inference
|
|
||||||
|
|
||||||
@cached_property
|
|
||||||
def has_flash_attn(self) -> bool:
|
|
||||||
"""Check if flash attention is installed."""
|
|
||||||
return importlib.util.find_spec("flash_attn") is not None
|
|
||||||
|
|
||||||
def apply_pre_model_load_patches(self):
|
|
||||||
"""Apply pre-model load patches based on config."""
|
|
||||||
self._apply_flash_attention_patches()
|
|
||||||
self._apply_fsdp_patches()
|
|
||||||
self._apply_adapter_patches()
|
|
||||||
self._apply_flex_attention_patches()
|
|
||||||
self._apply_model_specific_patches()
|
|
||||||
self._apply_fp8_patches()
|
|
||||||
self._apply_flash_attention_peft_patches()
|
|
||||||
self._apply_gradient_checkpointing_patches()
|
|
||||||
self._patch_attention()
|
|
||||||
self._apply_multipack_patches()
|
|
||||||
self._patch_llama_derived_model()
|
|
||||||
self._apply_mistral_cross_entropy_patch()
|
|
||||||
self._apply_unsloth_self_attention_patch()
|
|
||||||
|
|
||||||
def apply_post_model_load_patches(self, model: PreTrainedModel):
|
|
||||||
"""Apply patches that require the model instance."""
|
|
||||||
self._apply_llama_flash_attn_patches(model)
|
|
||||||
self._apply_unsloth_patches(model)
|
|
||||||
self._apply_lora_kernel_patch(model)
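A rough usage sketch of this class; the config keys and model name are illustrative, and it assumes flash-attn is installed so the flash-attention patches can import:

# Illustrative driver: apply config-driven patches before and after model load.
from transformers import AutoConfig, AutoModelForCausalLM

from axolotl.utils.dict import DictDefault

cfg = DictDefault({"flash_attention": True, "sample_packing": True})  # example keys
model_config = AutoConfig.from_pretrained("NousResearch/Llama-2-7b-hf")

patch_manager = PatchManager(cfg=cfg, model_config=model_config)
patch_manager.apply_pre_model_load_patches()
model = AutoModelForCausalLM.from_pretrained("NousResearch/Llama-2-7b-hf")
patch_manager.apply_post_model_load_patches(model)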
|
|
||||||
|
|
||||||
def _apply_flash_attention_patches(self):
|
|
||||||
"""Apply patches related to Flash Attention."""
|
|
||||||
if self.cfg.xformers_attention and self.cfg.sample_packing:
|
|
||||||
from axolotl.monkeypatch.attention import patch_xformers_attn_over_fa2
|
|
||||||
|
|
||||||
patch_xformers_attn_over_fa2()
|
|
||||||
self.cfg.flash_attention = True
|
|
||||||
|
|
||||||
def _apply_fsdp_patches(self):
|
|
||||||
"""Apply patches for FSDP configurations."""
|
|
||||||
if self.cfg.fsdp_config and str(self.cfg.fsdp_config.fsdp_version) == "2":
|
|
||||||
from axolotl.monkeypatch.accelerate.fsdp2 import patch_accelerate_fsdp_utils
|
|
||||||
|
|
||||||
patch_accelerate_fsdp_utils()
|
|
||||||
|
|
||||||
def _apply_adapter_patches(self):
|
|
||||||
"""Apply patches for adapter configurations."""
|
|
||||||
if self.cfg.adapter and self.cfg.embeddings_skip_upcast:
|
|
||||||
from axolotl.monkeypatch.peft.utils import patch_peft_prep_code
|
|
||||||
|
|
||||||
patch_peft_prep_code()
|
|
||||||
|
|
||||||
def _apply_flex_attention_patches(self):
|
|
||||||
"""Apply patches for flexible attention."""
|
|
||||||
if self.cfg.flex_attention:
|
|
||||||
from axolotl.monkeypatch.attention.flex_attn import (
|
|
||||||
patch_flex_make_mask,
|
|
||||||
patch_flex_wrapper,
|
|
||||||
)
|
|
||||||
|
|
||||||
flex_attn_compile_kwargs = self.cfg.flex_attn_compile_kwargs or {}
|
|
||||||
patch_flex_wrapper(**flex_attn_compile_kwargs)
|
|
||||||
patch_flex_make_mask()
|
|
||||||
|
|
||||||
def _apply_model_specific_patches(self):
|
|
||||||
"""Apply patches specific to model architectures."""
|
|
||||||
if (
|
|
||||||
self.cfg.model_config_type == "llama4"
|
|
||||||
and self.cfg.llama4_linearized_experts
|
|
||||||
):
|
|
||||||
from axolotl.monkeypatch.models.llama4.modeling import (
|
|
||||||
patch_llama4_linearized_modeling,
|
|
||||||
)
|
|
||||||
|
|
||||||
patch_llama4_linearized_modeling()
|
|
||||||
|
|
||||||
if self.cfg.model_config_type == "gemma3":
|
|
||||||
from axolotl.monkeypatch.gemma3 import (
|
|
||||||
patch_gemma3conditionalgeneration_forward,
|
|
||||||
)
|
|
||||||
|
|
||||||
patch_gemma3conditionalgeneration_forward()
|
|
||||||
|
|
||||||
def _apply_fp8_patches(self):
|
|
||||||
"""Apply patches for FP8 support."""
|
|
||||||
if self.cfg.fp8:
|
|
||||||
from axolotl.monkeypatch.trainer_accelerator_args import (
|
|
||||||
patch_create_accelerate_code_for_fp8,
|
|
||||||
)
|
|
||||||
|
|
||||||
patch_create_accelerate_code_for_fp8()
|
|
||||||
|
|
||||||
def _apply_flash_attention_peft_patches(self):
|
|
||||||
"""Apply patches for Flash Attention with PEFT."""
|
|
||||||
if self.cfg.adapter:
|
|
||||||
from axolotl.monkeypatch.transformers_fa_utils import (
|
|
||||||
patch_fa_peft_integration,
|
|
||||||
)
|
|
||||||
|
|
||||||
patch_fa_peft_integration()
|
|
||||||
|
|
||||||
def _apply_gradient_checkpointing_patches(self):
|
|
||||||
"""Apply patches for gradient checkpointing."""
|
|
||||||
if self.cfg.gradient_checkpointing in ["unsloth", "offload"]:
|
|
||||||
from axolotl.monkeypatch.gradient_checkpointing import (
|
|
||||||
hf_grad_checkpoint_offload_wrapper,
|
|
||||||
)
|
|
||||||
|
|
||||||
transformers.modeling_utils.checkpoint = hf_grad_checkpoint_offload_wrapper
|
|
||||||
if self.cfg.gradient_checkpointing == "offload_disk":
|
|
||||||
from axolotl.monkeypatch.gradient_checkpointing import (
|
|
||||||
hf_grad_checkpoint_disk_offload_wrapper,
|
|
||||||
)
|
|
||||||
|
|
||||||
transformers.modeling_utils.checkpoint = (
|
|
||||||
hf_grad_checkpoint_disk_offload_wrapper
|
|
||||||
)
|
|
||||||
|
|
||||||
def _apply_mistral_cross_entropy_patch(self):
|
|
||||||
"""Apply Mistral cross entropy patch if configured."""
|
|
||||||
if (
|
|
||||||
self.cfg.model_config_type == "mistral"
|
|
||||||
and self.cfg.flash_attn_cross_entropy_loss
|
|
||||||
):
|
|
||||||
from axolotl.monkeypatch.mistral_attn_hijack_flash import (
|
|
||||||
patch_mistral_cross_entropy,
|
|
||||||
)
|
|
||||||
|
|
||||||
patch_mistral_cross_entropy()
|
|
||||||
|
|
||||||
def _apply_unsloth_self_attention_patch(self):
|
|
||||||
"""Apply Unsloth self-attention patches if configured."""
|
|
||||||
if self.cfg.unsloth_lora_qkv or self.cfg.unsloth_lora_o:
|
|
||||||
from axolotl.monkeypatch.lora_kernels import patch_self_attn_lora
|
|
||||||
|
|
||||||
patch_self_attn_lora(self.cfg)
|
|
||||||
|
|
||||||
def _apply_multipack_patches(self):
|
|
||||||
"""Apply multipack patches if necessary."""
|
|
||||||
if (
|
|
||||||
self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES
|
|
||||||
and (self.cfg.flash_attention or self.cfg.flex_attention)
|
|
||||||
and self.cfg.sample_packing
|
|
||||||
):
|
|
||||||
# Get automap config if it exists
|
|
||||||
auto_map_config = None
|
|
||||||
if isinstance(self.model_config, dict) and "auto_map" in self.model_config:
|
|
||||||
auto_map_config = self.model_config["auto_map"]
|
|
||||||
elif hasattr(self.model_config, "auto_map"):
|
|
||||||
auto_map_config = self.model_config.auto_map
|
|
||||||
|
|
||||||
# Determine if the model has remote code
|
|
||||||
if auto_map_config is not None:
|
|
||||||
has_remote_code = "AutoModelForCausalLM" in auto_map_config
|
|
||||||
else:
|
|
||||||
has_remote_code = False
|
|
||||||
|
|
||||||
if has_remote_code and self.cfg.trust_remote_code is False:
|
|
||||||
# If explicitly set in YAML, prefer that
|
|
||||||
has_remote_code = self.cfg.trust_remote_code
|
|
||||||
|
|
||||||
patch_for_multipack(
|
|
||||||
self.cfg.model_config_type,
|
|
||||||
model_name=self.cfg.base_model,
|
|
||||||
has_remote_code=has_remote_code,
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.cfg.is_llama_derived_model:
|
|
||||||
self._patch_loss_llama()
|
|
||||||
|
|
||||||
def _patch_attention(self):
|
|
||||||
"""Apply attention-specific patches based on model type."""
|
|
||||||
if not (self.cfg.flash_attention and hasattr(self.model_config, "model_type")):
|
|
||||||
return
|
|
||||||
|
|
||||||
if self.model_config.model_type == "mllama" and self.cfg.flash_attention:
|
|
||||||
from axolotl.monkeypatch.attention.mllama import patch_mllama
|
|
||||||
|
|
||||||
patch_mllama()
|
|
||||||
|
|
||||||
if self.model_config.model_type == "btlm":
|
|
||||||
from axolotl.monkeypatch.btlm_attn_hijack_flash import (
|
|
||||||
replace_btlm_attn_with_flash_attn,
|
|
||||||
)
|
|
||||||
|
|
||||||
replace_btlm_attn_with_flash_attn(self.cfg.base_model)
|
|
||||||
|
|
||||||
if self.model_config.model_type == "stablelm_epoch" and self.cfg.sample_packing:
|
|
||||||
from axolotl.monkeypatch.stablelm_attn_hijack_flash import (
|
|
||||||
replace_stablelm_attn_with_flash_attn,
|
|
||||||
)
|
|
||||||
|
|
||||||
replace_stablelm_attn_with_flash_attn(self.cfg.base_model)
|
|
||||||
|
|
||||||
def _patch_loss_llama(self):
|
|
||||||
"""Patch loss functions and other optimizations for LLaMA models."""
|
|
||||||
if self.cfg.flash_attn_cross_entropy and self.has_flash_attn:
|
|
||||||
from axolotl.monkeypatch.llama_attn_hijack_flash import (
|
|
||||||
patch_fa_llama_cross_entropy,
|
|
||||||
)
|
|
||||||
|
|
||||||
patch_fa_llama_cross_entropy()
|
|
||||||
elif self.cfg.unsloth_cross_entropy_loss:
|
|
||||||
from axolotl.monkeypatch.unsloth_ import integrate_cross_entropy_loss_patch
|
|
||||||
|
|
||||||
integrate_cross_entropy_loss_patch(model_type="llama")
|
|
||||||
|
|
||||||
if self.cfg.flash_attn_rms_norm and self.has_flash_attn:
|
|
||||||
from axolotl.monkeypatch.llama_attn_hijack_flash import patch_llama_rms_norm
|
|
||||||
|
|
||||||
patch_llama_rms_norm()
|
|
||||||
elif self.cfg.unsloth_rms_norm:
|
|
||||||
from axolotl.monkeypatch.unsloth_ import patch_unsloth_layernorm
|
|
||||||
|
|
||||||
patch_unsloth_layernorm()
|
|
||||||
|
|
||||||
if self.cfg.unsloth_lora_qkv or self.cfg.unsloth_lora_o:
|
|
||||||
from axolotl.monkeypatch.unsloth_ import patch_self_attn_lora
|
|
||||||
|
|
||||||
patch_self_attn_lora()
|
|
||||||
|
|
||||||
def _patch_llama_flash_attention(self, packed=False):
|
|
||||||
"""Apply Flash Attention patches for LLaMA models."""
|
|
||||||
from axolotl.monkeypatch.llama_attn_hijack_flash import (
|
|
||||||
replace_llama_attn_with_flash_attn,
|
|
||||||
)
|
|
||||||
|
|
||||||
if packed:
|
|
||||||
if self.cfg.device not in ["mps", "cpu"] and not self.inference:
|
|
||||||
LOG.info("patching with flash attention for sample packing")
|
|
||||||
replace_llama_attn_with_flash_attn(
|
|
||||||
packed=True,
|
|
||||||
cross_entropy=self.cfg.flash_attn_cross_entropy,
|
|
||||||
rms_norm=self.cfg.flash_attn_rms_norm,
|
|
||||||
)
|
|
||||||
elif self.cfg.s2_attention:
|
|
||||||
LOG.info("patching w/ flash-enabled, shifted-sparse attention")
|
|
||||||
replace_llama_attn_with_flash_attn(
|
|
||||||
packed=False,
|
|
||||||
cross_entropy=self.cfg.flash_attn_cross_entropy,
|
|
||||||
rms_norm=self.cfg.flash_attn_rms_norm,
|
|
||||||
use_shifted_sparse_attn=True,
|
|
||||||
)
|
|
||||||
elif self.cfg.flash_attn_cross_entropy or self.cfg.flash_attn_rms_norm:
|
|
||||||
replace_llama_attn_with_flash_attn(
|
|
||||||
packed=False,
|
|
||||||
cross_entropy=self.cfg.flash_attn_cross_entropy,
|
|
||||||
rms_norm=self.cfg.flash_attn_rms_norm,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _patch_llama_xformers_attention(self):
|
|
||||||
"""Apply xformers attention patches for LLaMA models."""
|
|
||||||
from axolotl.monkeypatch.llama_attn_hijack_xformers import (
|
|
||||||
hijack_llama_attention,
|
|
||||||
)
|
|
||||||
|
|
||||||
LOG.info("Patching with xformers attention...")
|
|
||||||
hijack_llama_attention()
|
|
||||||
|
|
||||||
def _patch_llama_sample_packing(self):
|
|
||||||
"""Apply sample packing patches for LLaMA models."""
|
|
||||||
from axolotl.monkeypatch.llama_patch_multipack import (
|
|
||||||
hijack_llama_prepare_4d_mask,
|
|
||||||
)
|
|
||||||
|
|
||||||
LOG.info("Patching llama _prepare_4d_causal_attention_mask*...")
|
|
||||||
hijack_llama_prepare_4d_mask()
|
|
||||||
|
|
||||||
def _patch_llama_derived_model(self):
|
|
||||||
"""Modify all llama derived models in one block."""
|
|
||||||
if self.cfg.is_llama_derived_model and not (
|
|
||||||
self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES
|
|
||||||
and (self.cfg.flash_attention or self.cfg.flex_attention)
|
|
||||||
and self.cfg.sample_packing
|
|
||||||
):
|
|
||||||
self._patch_loss_llama()
|
|
||||||
|
|
||||||
if self.cfg.flash_attention:
|
|
||||||
self._patch_llama_flash_attention(packed=self.cfg.sample_packing)
|
|
||||||
elif self.cfg.xformers_attention:
|
|
||||||
self._patch_llama_xformers_attention()
|
|
||||||
elif self.cfg.sample_packing:
|
|
||||||
self._patch_llama_sample_packing()
|
|
||||||
elif self.cfg.s2_attention:
|
|
||||||
raise NotImplementedError(
|
|
||||||
"Shifted-sparse attention not currently implemented without flash attention."
|
|
||||||
)
|
|
||||||
|
|
||||||
def _apply_llama_flash_attn_patches(self, model):
|
|
||||||
"""Apply LLaMA-specific flash attention patches."""
|
|
||||||
if (
|
|
||||||
self.model_config.model_type in ["llama", "llama4"]
|
|
||||||
and not self.cfg.trust_remote_code
|
|
||||||
and not self.cfg.gptq
|
|
||||||
and self.cfg.flash_attention
|
|
||||||
and not self.inference
|
|
||||||
):
|
|
||||||
# TODO(MengqingCao): split these patches separately
|
|
||||||
from axolotl.monkeypatch.llama_attn_hijack_flash import (
|
|
||||||
is_xformers_swiglu_available,
|
|
||||||
replace_llama_mlp_with_swiglu,
|
|
||||||
replace_llama_qkv_with_fused,
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.cfg.flash_attn_fuse_mlp and is_xformers_swiglu_available():
|
|
||||||
LOG.info("Patching with SwiGLU...")
|
|
||||||
replace_llama_mlp_with_swiglu(model)
|
|
||||||
|
|
||||||
if self.cfg.flash_attn_fuse_qkv:
|
|
||||||
LOG.info("Patching with fused QKV...")
|
|
||||||
replace_llama_qkv_with_fused(model)
|
|
||||||
|
|
||||||
def _apply_unsloth_patches(self, model):
|
|
||||||
"""Apply unsloth optimization patches."""
|
|
||||||
if self.cfg.unsloth_lora_mlp:
|
|
||||||
from axolotl.monkeypatch.unsloth_ import integrate_lora_mlp_patch
|
|
||||||
|
|
||||||
integrate_lora_mlp_patch(peft_model=model)
|
|
||||||
|
|
||||||
if self.cfg.unsloth_lora_qkv or self.cfg.unsloth_lora_o:
|
|
||||||
from axolotl.monkeypatch.unsloth_ import integrate_lora_patch
|
|
||||||
|
|
||||||
integrate_lora_patch(peft_model=model, cfg=self.cfg)
|
|
||||||
|
|
||||||
if self.cfg.unsloth_rope:
|
|
||||||
from axolotl.monkeypatch.unsloth_ import integrate_rope_embeddings
|
|
||||||
|
|
||||||
integrate_rope_embeddings()
|
|
||||||
|
|
||||||
def _apply_lora_kernel_patch(self, model):
|
|
||||||
"""Apply LoRA kernel patches."""
|
|
||||||
if (
|
|
||||||
self.cfg.lora_mlp_kernel
|
|
||||||
or self.cfg.lora_qkv_kernel
|
|
||||||
or self.cfg.lora_o_kernel
|
|
||||||
):
|
|
||||||
from axolotl.monkeypatch.lora_kernels import apply_lora_kernel_patches
|
|
||||||
|
|
||||||
apply_lora_kernel_patches(model=model, cfg=self.cfg)
|
|
||||||
@@ -1,56 +0,0 @@
|
|||||||
"""Processor loading functionality for multi-modal models"""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import transformers
|
|
||||||
from transformers import (
|
|
||||||
AutoProcessor,
|
|
||||||
PreTrainedTokenizerBase,
|
|
||||||
)
|
|
||||||
|
|
||||||
from axolotl.utils.dict import DictDefault
|
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase):
|
|
||||||
processor_kwargs: dict[str, Any] = {} # Do we actually need this?
|
|
||||||
|
|
||||||
processor_cls = AutoProcessor
|
|
||||||
if cfg.processor_type:
|
|
||||||
processor_cls = getattr(transformers, cfg.processor_type)
|
|
||||||
|
|
||||||
processor = processor_cls.from_pretrained(
|
|
||||||
cfg.processor_config,
|
|
||||||
trust_remote_code=cfg.trust_remote_code or False,
|
|
||||||
tokenizer=tokenizer,
|
|
||||||
**processor_kwargs,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Attempt to load image size from processor if available
|
|
||||||
if (
|
|
||||||
cfg.image_size is None
|
|
||||||
and hasattr(processor, "size")
|
|
||||||
and any(dim in processor.size for dim in ["width", "height"])
|
|
||||||
):
|
|
||||||
im_width = None
|
|
||||||
im_height = None
|
|
||||||
if "width" in processor.size:
|
|
||||||
im_width = processor.size["width"]
|
|
||||||
if "height" in processor.size:
|
|
||||||
im_height = processor.size["height"]
|
|
||||||
|
|
||||||
# If both width and height are set, use a tuple
|
|
||||||
if im_width is not None and im_height is not None:
|
|
||||||
cfg.image_size = (im_width, im_height)
|
|
||||||
# If only width is set, use as integer
|
|
||||||
elif im_width is not None:
|
|
||||||
cfg.image_size = im_width
|
|
||||||
# If only height is set, use as integer
|
|
||||||
elif im_height is not None:
|
|
||||||
cfg.image_size = im_height
|
|
||||||
|
|
||||||
LOG.debug(f"Loaded image size: {cfg.image_size} from processor")
|
|
||||||
|
|
||||||
return processor
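A short usage sketch of `load_processor`; the repo id is an example, and `cfg.image_size` is only populated when the processor exposes a `size` dict:

# Illustrative usage: load a multimodal processor alongside its tokenizer.
from transformers import AutoTokenizer

from axolotl.utils.dict import DictDefault

cfg = DictDefault(
    {
        "processor_config": "llava-hf/llava-1.5-7b-hf",  # example repo id
        "trust_remote_code": False,
    }
)
tokenizer = AutoTokenizer.from_pretrained(cfg.processor_config)
processor = load_processor(cfg, tokenizer)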
|
|
||||||
@@ -1,281 +0,0 @@
|
|||||||
"""Tokenizer loading functionality and associated utils"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
|
|
||||||
import transformers
|
|
||||||
from transformers import (
|
|
||||||
AddedToken,
|
|
||||||
AutoTokenizer,
|
|
||||||
)
|
|
||||||
|
|
||||||
from axolotl.integrations.base import PluginManager
|
|
||||||
from axolotl.loaders.utils import get_linear_embedding_layers, load_model_config
|
|
||||||
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
|
|
||||||
from axolotl.utils.chat_templates import get_chat_template_from_config
|
|
||||||
from axolotl.utils.distributed import (
|
|
||||||
barrier,
|
|
||||||
is_local_main_process,
|
|
||||||
is_main_process,
|
|
||||||
)
|
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
|
||||||
PLUGIN_MANAGER = PluginManager.get_instance()
|
|
||||||
|
|
||||||
|
|
||||||
def modify_tokenizer_files(
|
|
||||||
tokenizer_path: str, token_mappings: dict[int, str], output_dir: str
|
|
||||||
) -> str:
|
|
||||||
"""
|
|
||||||
Modify tokenizer files to replace added_tokens strings, save to output directory,
|
|
||||||
and return the path to the modified tokenizer.
|
|
||||||
|
|
||||||
This only works with reserved tokens that were added to the tokenizer, not tokens
|
|
||||||
already part of the vocab.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
tokenizer_path: Path or name of the original tokenizer
|
|
||||||
token_mappings: Dict mapping {token_id (int): new_token_string}
|
|
||||||
output_dir: Directory to save the modified tokenizer
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Path to the modified tokenizer directory
|
|
||||||
|
|
||||||
Ref: https://github.com/huggingface/transformers/issues/27974#issuecomment-1854188941
|
|
||||||
"""
|
|
||||||
# Create the tokenizer directory in output_dir if it doesn't exist
|
|
||||||
tokenizer_dir = os.path.join(output_dir, "tokenizer")
|
|
||||||
os.makedirs(tokenizer_dir, exist_ok=True)
|
|
||||||
|
|
||||||
if is_local_main_process(): # pylint: disable=too-many-nested-blocks
|
|
||||||
# Load the tokenizer
|
|
||||||
temp_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
|
|
||||||
|
|
||||||
# Save the tokenizer to the output directory
|
|
||||||
temp_tokenizer.save_pretrained(tokenizer_dir)
|
|
||||||
|
|
||||||
# Get the token IDs and map them to their new values
|
|
||||||
token_id_mappings = {
|
|
||||||
int(token_id): new_value for token_id, new_value in token_mappings.items()
|
|
||||||
}
|
|
||||||
|
|
||||||
# 1. Update tokenizer_config.json - added_tokens_decoder
|
|
||||||
config_path = os.path.join(tokenizer_dir, "tokenizer_config.json")
|
|
||||||
if os.path.exists(config_path):
|
|
||||||
with open(config_path, "r", encoding="utf-8") as f:
|
|
||||||
config_data = json.load(f)
|
|
||||||
|
|
||||||
# Update added_tokens_decoder
|
|
||||||
if "added_tokens_decoder" in config_data:
|
|
||||||
for token_id, new_value in token_id_mappings.items():
|
|
||||||
token_id_str = str(token_id)
|
|
||||||
if token_id_str in config_data["added_tokens_decoder"]:
|
|
||||||
config_data["added_tokens_decoder"][token_id_str][
|
|
||||||
"content"
|
|
||||||
] = new_value
|
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
f"Token ID {token_id_str} not found in added_tokens_decoder"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Write the updated config back
|
|
||||||
with open(config_path, "w", encoding="utf-8") as f:
|
|
||||||
json.dump(config_data, f, indent=2)
|
|
||||||
|
|
||||||
# 2. Update tokenizer.json - added_tokens
|
|
||||||
tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json")
|
|
||||||
if os.path.exists(tokenizer_path):
|
|
||||||
with open(tokenizer_path, "r", encoding="utf-8") as f:
|
|
||||||
tokenizer_data = json.load(f)
|
|
||||||
|
|
||||||
# Update added_tokens
|
|
||||||
if "added_tokens" in tokenizer_data:
|
|
||||||
for token_id, new_value in token_id_mappings.items():
|
|
||||||
for i, token_entry in enumerate(tokenizer_data["added_tokens"]):
|
|
||||||
if token_entry["id"] == token_id:
|
|
||||||
tokenizer_data["added_tokens"][i]["content"] = new_value
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
# Reaching this section means the token_id was not found in tokenizer.json added_tokens
|
|
||||||
raise ValueError(
|
|
||||||
f"Token ID {token_id} not found in added_tokens"
|
|
||||||
)
|
|
||||||
if "model" in tokenizer_data and "vocab" in tokenizer_data["model"]:
|
|
||||||
for token_id, new_value in token_id_mappings.items():
|
|
||||||
for entry_val, entry_id in tokenizer_data["model"]["vocab"].items():
|
|
||||||
if entry_id == token_id:
|
|
||||||
del tokenizer_data["model"]["vocab"][entry_val]
|
|
||||||
tokenizer_data["model"]["vocab"][new_value] = token_id
|
|
||||||
break
|
|
||||||
|
|
||||||
# Write the updated tokenizer data back
|
|
||||||
with open(tokenizer_path, "w", encoding="utf-8") as f:
|
|
||||||
json.dump(tokenizer_data, f, indent=2)
|
|
||||||
|
|
||||||
barrier()
|
|
||||||
return tokenizer_dir
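A short usage sketch of the helper above; the token id and replacement string are hypothetical values:

# Hypothetical usage: rename a reserved placeholder token before training.
modified_dir = modify_tokenizer_files(
    "meta-llama/Llama-3.1-8B-Instruct",        # original tokenizer (example)
    token_mappings={128011: "<|tool_call|>"},  # {token_id: new_token_string}
    output_dir="./outputs",
)
tokenizer = AutoTokenizer.from_pretrained(modified_dir)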
|
|
||||||
|
|
||||||
|
|
||||||
def load_tokenizer(cfg):
|
|
||||||
"""Load and configure the tokenizer based on the provided config."""
|
|
||||||
model_config = load_model_config(cfg)
|
|
||||||
tokenizer_kwargs = {}
|
|
||||||
use_fast = True # this is the default
|
|
||||||
|
|
||||||
if cfg.tokenizer_use_fast is not None:
|
|
||||||
use_fast = cfg.tokenizer_use_fast
|
|
||||||
if cfg.tokenizer_legacy is not None:
|
|
||||||
# True is the default w/ https://github.com/huggingface/transformers/pull/25224
|
|
||||||
tokenizer_kwargs["legacy"] = cfg.tokenizer_legacy
|
|
||||||
|
|
||||||
tokenizer_cls = AutoTokenizer
|
|
||||||
if cfg.tokenizer_type:
|
|
||||||
tokenizer_cls = getattr(transformers, cfg.tokenizer_type)
|
|
||||||
|
|
||||||
# Set base tokenizer path
|
|
||||||
tokenizer_path = cfg.tokenizer_config
|
|
||||||
|
|
||||||
# Apply token string overrides if specified
|
|
||||||
if cfg.added_tokens_overrides:
|
|
||||||
# Modify tokenizer files and get path to modified tokenizer
|
|
||||||
tokenizer_path = modify_tokenizer_files(
|
|
||||||
tokenizer_path, cfg.added_tokens_overrides, output_dir=cfg.output_dir
|
|
||||||
)
|
|
||||||
|
|
||||||
tokenizer = tokenizer_cls.from_pretrained(
|
|
||||||
tokenizer_path,
|
|
||||||
trust_remote_code=cfg.trust_remote_code or False,
|
|
||||||
use_fast=use_fast,
|
|
||||||
**tokenizer_kwargs,
|
|
||||||
)
|
|
||||||
|
|
||||||
if (
|
|
||||||
tokenizer.__class__.__name__
|
|
||||||
in [
|
|
||||||
"LlamaTokenizer",
|
|
||||||
"LlamaTokenizerFast",
|
|
||||||
"CodeLlamaTokenizer",
|
|
||||||
"CodeLlamaTokenizerFast",
|
|
||||||
]
|
|
||||||
and hasattr(tokenizer, "pad_token")
|
|
||||||
and not tokenizer.pad_token
|
|
||||||
):
|
|
||||||
# set a pad_token, but use eos_token so we don't add a new token
|
|
||||||
tokenizer.pad_token = LLAMA_DEFAULT_EOS_TOKEN
|
|
||||||
|
|
||||||
if tokenizer.__class__.__name__ == "GPTNeoXTokenizerFast":
|
|
||||||
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
|
|
||||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
||||||
|
|
||||||
# Mistral's official FA implementation requires left padding
|
|
||||||
if cfg.is_mistral_derived_model and cfg.flash_attention and not cfg.sample_packing:
|
|
||||||
tokenizer.padding_side = "left"
|
|
||||||
|
|
||||||
# Qwen base only has a single token, so we need to set the special tokens
|
|
||||||
if cfg.is_qwen_derived_model:
|
|
||||||
token_ids = ["bos_token_id", "eos_token_id", "pad_token_id", "unk_token_id"]
|
|
||||||
for attr_name in token_ids:
|
|
||||||
if getattr(tokenizer, attr_name) is None:
|
|
||||||
setattr(tokenizer, attr_name, tokenizer.eod_id)
|
|
||||||
|
|
||||||
token_names = ["bos_token", "eos_token", "pad_token", "unk_token"]
|
|
||||||
for attr_name in token_names:
|
|
||||||
if getattr(tokenizer, attr_name) is None:
|
|
||||||
setattr(tokenizer, attr_name, "<|endoftext|>")
|
|
||||||
|
|
||||||
additional_special_tokens = None
|
|
||||||
if cfg.special_tokens:
|
|
||||||
special_tokens = cfg.special_tokens.to_dict()
|
|
||||||
additional_special_tokens = special_tokens.pop(
|
|
||||||
"additional_special_tokens", None
|
|
||||||
)
|
|
||||||
lora_modules_to_save = get_linear_embedding_layers(model_config.model_type)
|
|
||||||
for k, val in special_tokens.items():
|
|
||||||
# Check that the new special token is not already in the tokenizer and, when
# adapter training, make sure lora_modules_to_save is set accordingly
|
|
||||||
# pylint: disable=too-many-boolean-expressions
|
|
||||||
if (
|
|
||||||
(getattr(tokenizer, k) is None or getattr(tokenizer, k) != val)
|
|
||||||
and (len(tokenizer.encode(val, add_special_tokens=False)) > 2)
|
|
||||||
and cfg.adapter
|
|
||||||
and (
|
|
||||||
not cfg.lora_modules_to_save
|
|
||||||
or not all(
|
|
||||||
x in cfg.lora_modules_to_save for x in lora_modules_to_save
|
|
||||||
)
|
|
||||||
)
|
|
||||||
and k != "pad_token"
|
|
||||||
):
|
|
||||||
lora_modules_to_save = ", ".join(
|
|
||||||
[f"`{x}`" for x in lora_modules_to_save]
|
|
||||||
)
|
|
||||||
raise ValueError(
|
|
||||||
f"Please set lora_modules_to_save to [{lora_modules_to_save}] when using an adapter and changing the special tokens."
|
|
||||||
)
|
|
||||||
|
|
||||||
tokenizer.add_special_tokens(
|
|
||||||
{k: AddedToken(val, rstrip=False, lstrip=False, normalized=False)}
|
|
||||||
)
|
|
||||||
|
|
||||||
# If we add bos_token and eos_token, we need to update the post processor to
|
|
||||||
# handle them correctly.
|
|
||||||
# https://github.com/huggingface/transformers/pull/24132
|
|
||||||
bos_or_eos_in_special_tokens = (
|
|
||||||
"bos_token" in cfg.special_tokens and "eos_token" in cfg.special_tokens
|
|
||||||
)
|
|
||||||
if (
|
|
||||||
tokenizer.__class__.__name__
|
|
||||||
in (
|
|
||||||
"LlamaTokenizerFast",
|
|
||||||
"CodeLlamaTokenizerFast",
|
|
||||||
)
|
|
||||||
and bos_or_eos_in_special_tokens
|
|
||||||
):
|
|
||||||
tokenizer.update_post_processor()
|
|
||||||
|
|
||||||
if cfg.tokens:
|
|
||||||
tokenizer.add_tokens(
|
|
||||||
[
|
|
||||||
AddedToken(token, rstrip=False, lstrip=False, normalized=False)
|
|
||||||
for token in cfg.tokens
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Additional special tokens are a List, and need to be treated differently than regular special
|
|
||||||
# tokens. We add them after we have called `add_tokens` in case these additional special tokens
|
|
||||||
# are new tokens.
|
|
||||||
#
|
|
||||||
# Usage:
|
|
||||||
#
|
|
||||||
# ```py
|
|
||||||
# special_tokens:
|
|
||||||
# additional_special_tokens: ["<|im_start|>", "<|im_end|>"]
|
|
||||||
# ```
|
|
||||||
if additional_special_tokens is not None:
|
|
||||||
tokenizer.add_special_tokens(
|
|
||||||
{"additional_special_tokens": additional_special_tokens}
|
|
||||||
)
|
|
||||||
|
|
||||||
if is_main_process(use_environ=True):
|
|
||||||
LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
|
|
||||||
LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
|
|
||||||
LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
|
|
||||||
LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
|
|
||||||
|
|
||||||
if cfg.chat_template:
|
|
||||||
chat_template_string = get_chat_template_from_config(
|
|
||||||
cfg=cfg,
|
|
||||||
tokenizer=tokenizer,
|
|
||||||
)
|
|
||||||
if cfg.default_system_message and cfg.chat_template == "chatml":
|
|
||||||
chat_template_string = chat_template_string.replace(
|
|
||||||
"You are a helpful assistant.", cfg.default_system_message
|
|
||||||
)
|
|
||||||
|
|
||||||
tokenizer.chat_template = chat_template_string
|
|
||||||
else:
|
|
||||||
LOG.info(
|
|
||||||
"No Chat template selected. Consider adding a chat template for easier inference."
|
|
||||||
)
|
|
||||||
return tokenizer
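A minimal usage sketch of `load_tokenizer` with an axolotl-style config; all values are illustrative:

# Illustrative config: tokenizer repo, special tokens, and chat template are examples.
from axolotl.utils.dict import DictDefault

cfg = DictDefault(
    {
        "tokenizer_config": "NousResearch/Llama-2-7b-hf",
        "base_model": "NousResearch/Llama-2-7b-hf",
        "special_tokens": {"pad_token": "</s>"},
        "chat_template": "chatml",
    }
)
tokenizer = load_tokenizer(cfg)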
|
|
||||||
@@ -1,211 +0,0 @@
|
|||||||
"""Utilities for axolotl.loaders module"""
|
|
||||||
|
|
||||||
import contextlib
|
|
||||||
import logging
|
|
||||||
from typing import Type
|
|
||||||
|
|
||||||
import addict
|
|
||||||
import torch
|
|
||||||
from transformers import AutoConfig, PretrainedConfig, PreTrainedModel
|
|
||||||
|
|
||||||
from axolotl.utils.dict import DictDefault
|
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def get_module_class_from_name(
|
|
||||||
module: torch.nn.Module, name: str
|
|
||||||
) -> Type[torch.nn.Module] | None:
|
|
||||||
"""Gets a class from a module by its name. Copied from `accelerate.utils.dataclasses`
|
|
||||||
(https://github.com/huggingface/accelerate/blob/main/src/accelerate/utils/dataclasses.py#L2805).
|
|
||||||
|
|
||||||
Args:
|
|
||||||
module: The module to get the class from.
|
|
||||||
name: The name of the class.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
The class type of the matching module, or `None` if no match is found.
|
|
||||||
"""
|
|
||||||
modules_children = list(module.children())
|
|
||||||
if module.__class__.__name__ == name:
|
|
||||||
return module.__class__
|
|
||||||
|
|
||||||
if len(modules_children) == 0:
|
|
||||||
return None
|
|
||||||
|
|
||||||
for child_module in modules_children:
|
|
||||||
module_class = get_module_class_from_name(child_module, name)
|
|
||||||
if module_class is not None:
|
|
||||||
return module_class
|
|
||||||
|
|
||||||
return None
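A small usage sketch of the helper above; the model and class name are examples:

# Illustrative: resolve a transformer block class by name, e.g. for FSDP wrap policies.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")
block_cls = get_module_class_from_name(model, "GPT2Block")
print(block_cls)  # <class 'transformers.models.gpt2.modeling_gpt2.GPT2Block'>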
|
|
||||||
|
|
||||||
|
|
||||||
def check_model_config(cfg: DictDefault, model_config: PretrainedConfig):
|
|
||||||
"""Validates and adjusts model config based on `axolotl` config.
|
|
||||||
|
|
||||||
This function performs several important checks and adjustments:
|
|
||||||
- Disables model caching for better memory efficiency
|
|
||||||
- Handles multimodal model-specific configurations
|
|
||||||
- Validates quantization settings
|
|
||||||
- Ensures proper LoRA configuration when using adapters with new tokens
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: Dictionary mapping `axolotl` config keys to values.
|
|
||||||
model_config: The model's configuration object from `transformers`.
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
ValueError: If a multimodal model lacks text configuration, if GPTQ settings
|
|
||||||
are inconsistent, or if LoRA `modules_to_save` is improperly configured
|
|
||||||
with new tokens.
|
|
||||||
"""
|
|
||||||
if hasattr(model_config, "use_cache"):
|
|
||||||
model_config.use_cache = False
|
|
||||||
|
|
||||||
if cfg.is_multimodal:
|
|
||||||
# For multimodal configs, use_cache is set in the text_config
|
|
||||||
if hasattr(model_config, "get_text_config"):
|
|
||||||
text_config = model_config.get_text_config()
|
|
||||||
if hasattr(text_config, "use_cache"):
|
|
||||||
text_config.use_cache = False
|
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
"No text config found for multimodal model. Please raise an Issue with model details."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Check if image_size is not set and load image size from model config if available
|
|
||||||
if (
|
|
||||||
cfg.image_size is None
|
|
||||||
and hasattr(model_config, "vision_config")
|
|
||||||
and hasattr(model_config.vision_config, "image_size")
|
|
||||||
):
|
|
||||||
cfg.image_size = model_config.vision_config.image_size
|
|
||||||
LOG.debug(f"Loaded image size: {cfg.image_size} from model config")
|
|
||||||
|
|
||||||
quant_config_exists = (
|
|
||||||
hasattr(model_config, "quantization_config")
|
|
||||||
and model_config.quantization_config
|
|
||||||
)
|
|
||||||
|
|
||||||
# Detect compressed-tensors config
|
|
||||||
is_compressed_tensors_config = (
|
|
||||||
quant_config_exists
|
|
||||||
and model_config.quantization_config.get("quant_method") == "compressed-tensors"
|
|
||||||
)
|
|
||||||
|
|
||||||
if is_compressed_tensors_config:
|
|
||||||
if model_config.quantization_config.get("config_groups"):
|
|
||||||
LOG.warning(
|
|
||||||
"Found `config_groups` in a compressed-tensors config. "
|
|
||||||
"QAT integration with llmcompressor is not tested."
|
|
||||||
)
|
|
||||||
# Skip further quant checks for compressed-tensors
|
|
||||||
return
|
|
||||||
|
|
||||||
quant_config_method_is_gptq = (
|
|
||||||
quant_config_exists
|
|
||||||
and "quant_method" in model_config.quantization_config
|
|
||||||
and model_config.quantization_config["quant_method"] == "gptq"
|
|
||||||
)
|
|
||||||
|
|
||||||
if cfg.gptq and not quant_config_method_is_gptq:
|
|
||||||
raise ValueError(
|
|
||||||
"model_config.quantization_config is not set or quant_method is not set to gptq. "
|
|
||||||
"Please make sure to point to a GPTQ model."
|
|
||||||
)
|
|
||||||
|
|
||||||
lora_modules_to_save = get_linear_embedding_layers(model_config.model_type)
|
|
||||||
if (
|
|
||||||
cfg.adapter
|
|
||||||
and cfg.tokens
|
|
||||||
and (
|
|
||||||
not cfg.lora_modules_to_save
|
|
||||||
or not all(x in cfg.lora_modules_to_save for x in lora_modules_to_save)
|
|
||||||
)
|
|
||||||
):
|
|
||||||
lora_modules_to_save_joined = ", ".join(
|
|
||||||
map(lambda x: f"`{x}`", lora_modules_to_save)
|
|
||||||
)
|
|
||||||
raise ValueError(
|
|
||||||
"`lora_modules_to_save` not properly set when adding new tokens. "
|
|
||||||
f"Please include [{lora_modules_to_save_joined}] in `lora_modules_to_save`."
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def load_model_config(cfg: DictDefault) -> PretrainedConfig | addict.Dict:
|
|
||||||
"""Loads and configures a model configuration from HuggingFace or local sources.
|
|
||||||
|
|
||||||
This function determines the appropriate model config source, loads it, applies any
|
|
||||||
necessary overrides, and validates it for compatibility with the `axolotl` config.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: Dictionary mapping `axolotl` config keys to values.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
A configured model configuration object (`AutoConfig` instance), or a simple
|
|
||||||
dictionary configuration for special cases like Mamba models.
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
ValueError: If configuration loading fails for reasons other than special cases
|
|
||||||
that are handled (e.g., Mamba models).
|
|
||||||
"""
|
|
||||||
model_config_name = cfg.base_model_config or cfg.base_model
|
|
||||||
if not model_config_name and cfg.tokenizer_config:
|
|
||||||
model_config_name = cfg.tokenizer_config
|
|
||||||
trust_remote_code = cfg.trust_remote_code is True
|
|
||||||
config_kwargs = {}
|
|
||||||
if cfg.revision_of_model:
|
|
||||||
config_kwargs["revision"] = cfg.revision_of_model
|
|
||||||
if cfg.num_labels:
|
|
||||||
# num_labels is used to initialize classifier models
|
|
||||||
config_kwargs["num_labels"] = cfg.num_labels
|
|
||||||
try:
|
|
||||||
model_config = AutoConfig.from_pretrained(
|
|
||||||
model_config_name,
|
|
||||||
trust_remote_code=trust_remote_code,
|
|
||||||
**config_kwargs,
|
|
||||||
)
|
|
||||||
except ValueError as error:
|
|
||||||
if "mamba" in model_config_name:
|
|
||||||
return addict.Dict(
|
|
||||||
{
|
|
||||||
"model_type": "mamba",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
raise error
|
|
||||||
|
|
||||||
if cfg.overrides_of_model_config:
|
|
||||||
for key, val in cfg.overrides_of_model_config.items():
|
|
||||||
setattr(model_config, key, val)
|
|
||||||
|
|
||||||
check_model_config(cfg, model_config)
|
|
||||||
|
|
||||||
return model_config
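A minimal sketch of loading and validating a model config from an axolotl-style config; the values are examples:

# Illustrative usage of load_model_config with a config override.
from axolotl.utils.dict import DictDefault

cfg = DictDefault(
    {
        "base_model": "NousResearch/Llama-2-7b-hf",  # example model
        "trust_remote_code": False,
        "overrides_of_model_config": {"rope_theta": 1000000.0},
    }
)
model_config = load_model_config(cfg)
print(model_config.model_type)  # e.g. "llama"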
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_dtype(model: PreTrainedModel, dtype: torch.dtype = torch.bfloat16):
|
|
||||||
"""Ensures all modules in the model are converted to the specified data type."""
|
|
||||||
for name, module in model.named_modules():
|
|
||||||
weight_mismatch = False
|
|
||||||
with contextlib.suppress(AttributeError):
|
|
||||||
weight_mismatch = module.weight.dtype != dtype
|
|
||||||
|
|
||||||
bias_mismatch = False
|
|
||||||
with contextlib.suppress(AttributeError):
|
|
||||||
bias_mismatch = module.bias.dtype != dtype
|
|
||||||
|
|
||||||
if weight_mismatch:
|
|
||||||
print(f"Converting module {name}.weight: {module.weight.dtype} -> {dtype}")
|
|
||||||
if bias_mismatch:
|
|
||||||
print(f"Converting module {name}.bias: {module.bias.dtype} -> {dtype}")
|
|
||||||
if weight_mismatch or bias_mismatch:
|
|
||||||
module.to(dtype)
|
|
||||||
|
|
||||||
|
|
||||||
def get_linear_embedding_layers(model_type: str) -> list[str]:
|
|
||||||
"""Returns layer names of linear embeddings needed for LoRA based on model type."""
|
|
||||||
if model_type == "gpt_neox":
|
|
||||||
return ["embed_in", "embed_out"]
|
|
||||||
if model_type == "falcon":
|
|
||||||
return ["word_embeddings", "lm_head"]
|
|
||||||
return ["embed_tokens", "lm_head"]
|
|
||||||
@@ -1,19 +0,0 @@
|
|||||||
"""
|
|
||||||
attention module for attention monkeypatches
|
|
||||||
"""
|
|
||||||
|
|
||||||
from transformers.integrations.flash_attention import flash_attention_forward
|
|
||||||
|
|
||||||
|
|
||||||
def patch_xformers_attn_over_fa2():
|
|
||||||
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
|
|
||||||
|
|
||||||
from .xformers import xformers_attention_forward
|
|
||||||
|
|
||||||
ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = xformers_attention_forward
|
|
||||||
|
|
||||||
|
|
||||||
def unpatch_xformers_attn_over_fa2():
|
|
||||||
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
|
|
||||||
|
|
||||||
ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = flash_attention_forward()
|
|
||||||
|
|||||||
12
src/axolotl/monkeypatch/attention/ring_attn/__init__.py
Normal file
@@ -0,0 +1,12 @@
"""Init for ring attention monkeypatch module"""

# pylint: disable=unused-import
# flake8: noqa

from .patch import (
    RingAttnFunc,
    get_ring_attn_group,
    register_ring_attn,
    set_ring_attn_group,
    update_ring_attn_params,
)
@@ -16,7 +16,11 @@ import torch
 import torch.distributed as dist
 import transformers
 import transformers.modeling_flash_attention_utils
-from ring_flash_attn import ring_flash_attn_func
+from ring_flash_attn import (
+    ring_flash_attn_func,
+    stripe_flash_attn_func,
+    zigzag_ring_flash_attn_func,
+)
 from ring_flash_attn.adapters.hf_adapter import check_params
 from transformers.modeling_flash_attention_utils import (
     _flash_supports_window_size,
@@ -24,12 +28,12 @@ from transformers.modeling_flash_attention_utils import (
 )
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

-from axolotl.utils.schemas.enums import RingAttnFunc
+from axolotl.monkeypatch.attention.ring_attn.patch import RingAttnFunc

 RING_ATTN_FUNC_MAPPING = {
-    RingAttnFunc.BATCH_RING: torch.compile(ring_flash_attn_func),
-    # RingAttnFunc.BATCH_ZIGZAG: torch.compile(zigzag_ring_flash_attn_func),
-    # RingAttnFunc.BATCH_STRIPE: torch.compile(stripe_flash_attn_func),
+    RingAttnFunc.BATCH_RING: ring_flash_attn_func,
+    RingAttnFunc.BATCH_ZIGZAG: zigzag_ring_flash_attn_func,
+    RingAttnFunc.BATCH_STRIPE: stripe_flash_attn_func,
 }

147
src/axolotl/monkeypatch/attention/ring_attn/patch.py
Normal file
147
src/axolotl/monkeypatch/attention/ring_attn/patch.py
Normal file
@@ -0,0 +1,147 @@
"""
Ring attention group registration and flash attention patching.

Make use of the `ring-flash-attn` (https://github.com/zhuzilin/ring-flash-attention)
package, specifically the `hf_adapter.substitute_hf_flash_attn` function to patch in
their sequence parallel version of Flash Attention 2.
"""

from enum import Enum

import torch
import torch.distributed as dist
from accelerate.logging import get_logger

from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids

LOG = get_logger(__name__)


RING_ATTN_GROUP = None


def get_ring_attn_group() -> dist.ProcessGroup:
    """
    Getter for ring attention group on this rank.

    Returns:
        The process group for ring attention for this rank.
    """
    return RING_ATTN_GROUP


def set_ring_attn_group(ring_attn_group: dist.ProcessGroup | None):
    """
    Setter for ring attention group on this rank.

    Args:
        ring_attn_group: Process group for ring attention.
    """
    global RING_ATTN_GROUP  # pylint: disable=global-statement
    RING_ATTN_GROUP = ring_attn_group


class RingAttnFunc(str, Enum):
    """Enum class for supported `ring-flash-attn` implementations"""

    # VARLEN_RING = "varlen_ring"
    # VARLEN_ZIGZAG = "varlen_zigzag"
    VARLEN_LLAMA3 = "varlen_llama3"
    BATCH_RING = "batch_ring"
    BATCH_ZIGZAG = "batch_zigzag"
    BATCH_STRIPE = "batch_stripe"


def register_ring_attn(
    sequence_parallel_degree: int,
    heads_k_stride: int | None,
    ring_attn_func: RingAttnFunc | None,
):
    """
    Create ring attention group and substitute flash attn with ring flash attn.

    Args:
        sequence_parallel_degree: Sequence parallelism factor.
        heads_k_stride: Sequence parallelism K head stride size. Passed
            through to `ring_flash_attn.substitute_hf_flash_attn`.
        ring_attn_func: `ring_flash_attn` ring attention implementation. If sample
            packing is enabled, it must be a `varlen` function; otherwise, it must be
            a `batch` function.
    """
    if get_ring_attn_group() is not None:
        LOG.info("Ring attention already registered, exiting early...")
        return

    LOG.info(
        "Enabling ring attention sequence parallelism: "
        f"each sequence will be processed across {sequence_parallel_degree} GPUs"
    )

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    assert sequence_parallel_degree <= world_size, (
        f"sequence_parallel_degree ({sequence_parallel_degree}) "
        f"must be less than or equal to world_size ({world_size})"
    )
    assert world_size % sequence_parallel_degree == 0, (
        f"sequence_parallel_degree ({sequence_parallel_degree}) "
        f"must evenly divide world_size ({world_size})"
    )

    # Assign ranks to sequence parallel groups
    group_assignments = {}
    for i in range(world_size // sequence_parallel_degree):
        ring_attn_ranks = list(
            range(
                i * sequence_parallel_degree,
                (i + 1) * sequence_parallel_degree,
            )
        )
        group = dist.new_group(ranks=ring_attn_ranks, backend="nccl")

        # Track which GPUs are in which groups
        for r in ring_attn_ranks:
            group_assignments[r] = i

        if rank in ring_attn_ranks:
            set_ring_attn_group(group)

    # Log the GPU group assignments
    if rank == 0:
        LOG.info(f"Sequence parallel group assignments: {group_assignments}")

    if ring_attn_func is RingAttnFunc.VARLEN_LLAMA3:
        from ring_flash_attn import substitute_hf_flash_attn

        substitute_hf_flash_attn(
            process_group=get_ring_attn_group(), heads_k_stride=heads_k_stride or 1
        )
    elif ring_attn_func in [
        RingAttnFunc.BATCH_RING,
        RingAttnFunc.BATCH_ZIGZAG,
        RingAttnFunc.BATCH_STRIPE,
    ]:
        from axolotl.monkeypatch.attention.ring_attn.adapters.batch import (
            substitute_hf_flash_attn,
        )

        substitute_hf_flash_attn(
            process_group=get_ring_attn_group(),
            ring_attn_func=ring_attn_func,
        )


def update_ring_attn_params(position_ids: torch.Tensor | None):
    """
    Calculate the cumulative sequence lengths for the current forward pass and pass the
    value to the substituted `ring_flash_attn`.

    Args:
        position_ids: Optional tensor of position IDs (for sample packed data).
    """
    from ring_flash_attn import update_ring_flash_attn_params

    cu_seqlens, _ = get_cu_seqlens_from_pos_ids(position_ids)
    cu_seqlens = cu_seqlens.squeeze().to(device=torch.cuda.current_device())
    update_ring_flash_attn_params(cu_seqlens, get_ring_attn_group())
|
||||||
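A hypothetical call site for `register_ring_attn`, e.g. in a `torchrun` job on 8 GPUs with a sequence parallel degree of 4; the values are illustrative:

# Illustrative: split each sequence across groups of 4 ranks; requires an
# initialized NCCL process group and the ring-flash-attn package.
import torch.distributed as dist

dist.init_process_group(backend="nccl")
register_ring_attn(
    sequence_parallel_degree=4,                 # each sequence sharded over 4 GPUs
    heads_k_stride=1,                           # passed through to ring_flash_attn
    ring_attn_func=RingAttnFunc.VARLEN_LLAMA3,  # varlen variant for sample packing
)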
@@ -1,160 +0,0 @@
|
|||||||
"""
|
|
||||||
xformers attention implementation for packing
|
|
||||||
"""
|
|
||||||
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import torch
|
|
||||||
import xformers
|
|
||||||
import xformers.ops.fmha
|
|
||||||
from transformers.modeling_flash_attention_utils import (
|
|
||||||
_upad_input,
|
|
||||||
)
|
|
||||||
|
|
||||||
from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
|
|
||||||
|
|
||||||
xformers_attention = xformers.ops.fmha.memory_efficient_attention
|
|
||||||
|
|
||||||
|
|
||||||
def xformers_attention_forward(
|
|
||||||
module: torch.nn.Module,
|
|
||||||
query: torch.Tensor,
|
|
||||||
key: torch.Tensor,
|
|
||||||
value: torch.Tensor,
|
|
||||||
attention_mask: Optional[torch.Tensor] = None,
|
|
||||||
position_ids: Optional[torch.LongTensor] = None,
|
|
||||||
dropout: float = 0.0, # pylint: disable=unused-argument
|
|
||||||
scaling: Optional[float] = None, # pylint: disable=unused-argument
|
|
||||||
sliding_window: Optional[int] = None, # pylint: disable=unused-argument
|
|
||||||
softcap: Optional[float] = None, # pylint: disable=unused-argument
|
|
||||||
cu_seq_lens_q: Optional[torch.LongTensor] = None,
|
|
||||||
cu_seq_lens_k: Optional[torch.LongTensor] = None,
|
|
||||||
max_length_q: Optional[int] = None,
|
|
||||||
max_length_k: Optional[int] = None, # pylint: disable=unused-argument
|
|
||||||
**kwargs, # pylint: disable=unused-argument
|
|
||||||
):
|
|
||||||
# Get dimensions
|
|
||||||
# query: [batch, heads, seq_len, hidden_dim]
|
|
||||||
batch_size = query.size(0)
|
|
||||||
query_length = query.shape[2]
|
|
||||||
key_length = key.shape[2]
|
|
||||||
|
|
||||||
# Default causal mask
|
|
||||||
attn_bias = xformers.ops.LowerTriangularMask()
|
|
||||||
|
|
||||||
# Check if we have sliding window attention
|
|
||||||
has_sliding_window = sliding_window is not None and sliding_window < query_length
|
|
||||||
|
|
||||||
# Transpose dimensions for xformers (Q: [b, h, s, d] -> [b, s, h, d])
|
|
||||||
query = query.transpose(1, 2)
|
|
||||||
key = key.transpose(1, 2)
|
|
||||||
value = value.transpose(1, 2)
|
|
||||||
|
|
||||||
# Get GQA parameters
|
|
||||||
num_attention_heads = module.config.num_attention_heads
|
|
||||||
num_key_value_heads = module.config.num_key_value_heads
|
|
||||||
head_dim = query.size(-1)
|
|
||||||
is_gqa = num_attention_heads != num_key_value_heads
|
|
||||||
n_groups = num_attention_heads // num_key_value_heads if is_gqa else 1
|
|
||||||
|
|
||||||
# If position_ids is provided, check whether all examples contain only one sequence: if the
# tensor is monotonically increasing, we probably have a single sequence; otherwise the batch
# is packed. Also check that we are in the pre-fill/training stage.
# Use the varlen path (as `flash_attn_varlen_func` does) to prevent cross-example attention
# and also to allow a padding-free approach
|
|
||||||
if position_ids is not None and (
|
|
||||||
max_length_q is not None
|
|
||||||
or (query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all())
|
|
||||||
):
|
|
||||||
if cu_seq_lens_q is None or cu_seq_lens_k is None:
|
|
||||||
cu_seq_lens_q = get_cu_seqlens_from_pos_ids(position_ids)[0]
|
|
||||||
cu_seq_lens_q = cu_seq_lens_q.squeeze()
|
|
||||||
seq_lengths = cu_seq_lens_q[1:] - cu_seq_lens_q[:-1]
|
|
||||||
attn_bias = (
|
|
||||||
xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask.from_seqlens(
|
|
||||||
q_seqlen=seq_lengths.tolist(),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
query = query.reshape(-1, query.size(-2), query.size(-1))
|
|
||||||
key = key.reshape(-1, key.size(-2), key.size(-1))
|
|
||||||
value = value.reshape(-1, value.size(-2), value.size(-1))
|
|
||||||
|
|
||||||
# Handle GQA
|
|
||||||
if is_gqa:
|
|
||||||
key = key.repeat_interleave(n_groups, dim=2)
|
|
||||||
value = value.repeat_interleave(n_groups, dim=2)
|
|
||||||
|
|
||||||
elif attention_mask is not None:
|
|
||||||
query, key, value, _, cu_seq_lens, _ = _upad_input(
|
|
||||||
query, key, value, attention_mask, query_length
|
|
||||||
)
|
|
||||||
cu_seq_lens_q, cu_seq_lens_k = cu_seq_lens
|
|
||||||
seq_lengths = []
|
|
||||||
for i in range(len(cu_seq_lens_q) - 1):
|
|
||||||
seq_lengths.append(cu_seq_lens_q[i + 1] - cu_seq_lens_q[i])
|
|
||||||
attn_bias = xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask.from_seqlens(
|
|
||||||
q_seqlen=seq_lengths,
|
|
||||||
kv_seqlen=seq_lengths,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Handle GQA
|
|
||||||
if is_gqa:
|
|
||||||
key = key.repeat_interleave(n_groups, dim=2)
|
|
||||||
value = value.repeat_interleave(n_groups, dim=2)
|
|
||||||
else:
|
|
||||||
# Handle Group Query Attention (GQA) using view/expand approach from reference
|
|
||||||
key = key.view(batch_size, key_length, num_key_value_heads, 1, head_dim)
|
|
||||||
value = value.view(batch_size, key_length, num_key_value_heads, 1, head_dim)
|
|
||||||
key = key.expand(
|
|
||||||
batch_size, key_length, num_key_value_heads, n_groups, head_dim
|
|
||||||
)
|
|
||||||
value = value.expand(
|
|
||||||
batch_size, key_length, num_key_value_heads, n_groups, head_dim
|
|
||||||
)
|
|
||||||
|
|
||||||
if module.training:
|
|
||||||
key = key.reshape(batch_size, key_length, num_attention_heads, head_dim)
|
|
||||||
value = value.reshape(batch_size, key_length, num_attention_heads, head_dim)
|
|
||||||
|
|
||||||
if has_sliding_window:
|
|
||||||
query = query.view(
|
|
||||||
1, batch_size * query_length, num_attention_heads, head_dim
|
|
||||||
)
|
|
||||||
key = key.view(
|
|
||||||
1, batch_size * key_length, num_attention_heads, head_dim
|
|
||||||
)
|
|
||||||
value = value.view(
|
|
||||||
1, batch_size * key_length, num_attention_heads, head_dim
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
query = query.view(
|
|
||||||
batch_size, query_length, num_key_value_heads, n_groups, head_dim
|
|
||||||
)
|
|
||||||
|
|
||||||
# If we need a sliding window attention
|
|
||||||
if has_sliding_window:
|
|
||||||
query = query.view(
|
|
||||||
1,
|
|
||||||
batch_size * query_length,
|
|
||||||
num_key_value_heads,
|
|
||||||
n_groups,
|
|
||||||
head_dim,
|
|
||||||
)
|
|
||||||
key = key.view(
|
|
||||||
1, batch_size * key_length, num_key_value_heads, n_groups, head_dim
|
|
||||||
)
|
|
||||||
value = value.view(
|
|
||||||
1, batch_size * key_length, num_key_value_heads, n_groups, head_dim
|
|
||||||
)
|
|
||||||
|
|
||||||
# Run the xformers attention
|
|
||||||
attn_output = xformers_attention(
|
|
||||||
query,
|
|
||||||
key,
|
|
||||||
value,
|
|
||||||
attn_bias=attn_bias,
|
|
||||||
)
|
|
||||||
|
|
||||||
attn_output = attn_output.view(
|
|
||||||
batch_size, -1, attn_output.size(-2), attn_output.size(-1)
|
|
||||||
)
|
|
||||||
return attn_output, None
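For intuition, a small sketch of the block-diagonal causal bias used above for packed sequences; the sequence lengths and shapes are illustrative, and a CUDA device is assumed:

# Illustrative: two packed sequences of lengths 3 and 5 attend only within themselves.
import torch
import xformers.ops.fmha as fmha

attn_bias = fmha.attn_bias.BlockDiagonalCausalMask.from_seqlens(q_seqlen=[3, 5])
q = torch.randn(1, 8, 4, 64, device="cuda", dtype=torch.float16)  # [batch, seq, heads, dim]
k, v = torch.randn_like(q), torch.randn_like(q)
out = fmha.memory_efficient_attention(q, k, v, attn_bias=attn_bias)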
|
|
||||||
@@ -7,16 +7,24 @@ from typing import Optional, Tuple, Union
 import torch
 from transformers.cache_utils import Cache
 from transformers.models.gemma3.modeling_gemma3 import (
+    _CONFIG_FOR_DOC,
+    GEMMA3_INPUTS_DOCSTRING,
     Gemma3CausalLMOutputWithPast,
     logger,
 )
 from transformers.utils import (
+    add_start_docstrings_to_model_forward,
     is_torchdynamo_compiling,
+    replace_return_docstrings,
 )
 from transformers.utils.deprecation import deprecate_kwarg


 @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
+@add_start_docstrings_to_model_forward(GEMMA3_INPUTS_DOCSTRING)
+@replace_return_docstrings(
+    output_type=Gemma3CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
+)
 def new_forward(
     self,
     input_ids: torch.LongTensor = None,
|
||||||
|
|||||||
@@ -1,63 +0,0 @@
-"""custom checkpointing utils"""
-
-import importlib
-from functools import partial
-
-from packaging import version
-
-from axolotl.monkeypatch.gradient_checkpointing.offload_cpu import (
-    CPU_Offloaded_Gradient_Checkpointer,
-)
-from axolotl.monkeypatch.gradient_checkpointing.offload_disk import (
-    Disco,
-)
-
-transformers_version = version.parse(importlib.metadata.version("transformers"))
-if transformers_version > version.parse("4.51.3"):
-    from transformers.modeling_layers import GradientCheckpointingLayer
-
-    def uses_gc_layers(decoder_layer):
-        return isinstance(decoder_layer.func.__self__, GradientCheckpointingLayer)
-
-else:
-
-    def uses_gc_layers(_):
-        return False
-
-
-def hf_grad_checkpoint_offload_wrapper(
-    decoder_layer, *args, use_reentrant=None
-):  # pylint: disable=unused-argument
-    if uses_gc_layers(decoder_layer):
-        return CPU_Offloaded_Gradient_Checkpointer.apply(
-            decoder_layer,
-            *args,
-        )
-
-    return CPU_Offloaded_Gradient_Checkpointer.apply(
-        (
-            decoder_layer.func.__self__
-            if isinstance(decoder_layer, partial)
-            else decoder_layer.__self__
-        ),
-        *args,
-    )
-
-
-def hf_grad_checkpoint_disk_offload_wrapper(
-    decoder_layer, *args, use_reentrant=None
-):  # pylint: disable=unused-argument
-    if uses_gc_layers(decoder_layer):
-        return Disco.apply(
-            decoder_layer,
-            *args,
-        )
-
-    return Disco.apply(
-        (
-            decoder_layer.func.__self__
-            if isinstance(decoder_layer, partial)
-            else decoder_layer.__self__
-        ),
-        *args,
-    )
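Illustrative aside (not part of the diff): the wrappers above receive either a bound layer call or a `functools.partial` of one, which is why they unwrap `.func.__self__` before handing the owning module to the checkpointer. A minimal, CPU-only sketch of that unwrapping, using a hypothetical stand-in layer:

# Hedged sketch of the partial/bound-method unwrapping only; no checkpointing runs here.
from functools import partial
from torch import nn

layer = nn.Linear(4, 4)  # hypothetical stand-in for a decoder layer

bound = layer.__call__               # a bound method exposes __self__
wrapped = partial(layer.__call__)    # a partial exposes .func, whose __self__ is the module

assert bound.__self__ is layer
assert wrapped.func.__self__ is layer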
@@ -1,531 +0,0 @@
-"""
-DISCO - DIsk-based Storage and Checkpointing with Optimized prefetching
-"""
-
-# Copyright 2025 Axolotl AI. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import atexit
-import concurrent.futures
-import logging
-import os
-import queue
-import shutil
-import tempfile
-import threading
-import time
-import uuid
-from collections import deque
-from concurrent.futures import Future
-from typing import Dict
-
-import torch
-
-torch_cuda_amp_custom_fwd = torch.amp.custom_fwd(device_type="cuda")
-torch_cuda_amp_custom_bwd = torch.amp.custom_bwd(device_type="cuda")
-
-# Setup logger
-logger = logging.getLogger(__name__)
-
-
-class DiskOffloadManager:
-    """
-    Manages offloaded tensors and handles prefetching in a separate thread.
-    Includes synchronization to prevent race conditions.
-    """
-
-    def __init__(
-        self,
-        prefetch_size: int = 3,
-        prefetch_to_gpu: bool = True,
-        save_workers: int = 4,
-    ):
-        """
-        Args:
-            prefetch_size: Maximum number of tensors to prefetch in the background.
-            prefetch_to_gpu: Whether to prefetch tensors directly to GPU memory.
-            save_workers: Maximum number of concurrent save operations.
-        """
-        self.temp_dir = tempfile.mkdtemp(prefix="disco_")
-
-        # Track tensor paths and their status
-        self.tensor_paths: deque = deque()  # Ordered history of tensor paths (LIFO)
-        self.file_locks: Dict[str, threading.Lock] = (
-            {}
-        )  # Maps file_path -> threading.Lock()
-        # Maps file_path -> status ("saving", "ready", "prefetching", "loaded", "deleted")
-        self.file_status: Dict[str, str] = {}
-
-        self.max_prefetch = prefetch_size
-        self.prefetch_to_gpu = prefetch_to_gpu
-
-        # Thread synchronization
-        self.manager_lock = threading.RLock()  # Used for thread-safe operations
-
-        # Prefetch queue and cache
-        self.prefetch_queue: queue.Queue = queue.Queue()
-        self.prefetch_cache: Dict[str, torch.Tensor] = {}  # Maps file_path -> tensor
-
-        # Save queue and thread pool
-        self.save_queue: queue.Queue = queue.Queue()
-        self.save_pool = concurrent.futures.ThreadPoolExecutor(max_workers=save_workers)
-        self.save_futures: Dict[str, Future] = {}
-        self.save_semaphore = threading.Semaphore(
-            save_workers * 2
-        )  # Limit concurrent save operations
-
-        # Start prefetch worker thread
-        self.stop_event = threading.Event()
-        # start multiple threads for prefetching
-        self.prefetch_worker_count = 2
-        self.prefetch_workers = []
-        for _ in range(self.prefetch_worker_count):
-            worker = threading.Thread(target=self._prefetch_worker, daemon=True)
-            worker.start()
-            self.prefetch_workers.append(worker)
-
-        # Start save worker thread
-        self.save_worker = threading.Thread(target=self._save_worker, daemon=True)
-        self.save_worker.start()
-        self.idx = 0
-
-        atexit.register(self.cleanup)
-
-    def _save_worker(self):
-        """Background thread that processes the save queue"""
-        while not self.stop_event.is_set():
-            try:
-                save_item = self.save_queue.get(timeout=0.5)
-                if save_item is None:
-                    continue
-
-                tensor, file_path = save_item
-
-                # Submit the save task to the thread pool
-                future = self.save_pool.submit(
-                    self._save_tensor_to_disk, tensor, file_path
-                )
-                with self.manager_lock:
-                    self.save_futures[file_path] = future
-
-                self.save_queue.task_done()
-
-            except queue.Empty:
-                time.sleep(0.01)  # Small sleep to prevent CPU spinning
-                continue
-
-    def _save_tensor_to_disk(self, tensor: torch.Tensor, file_path: str):
-        """Actually save the tensor to disk"""
-        try:
-            # Save tensor to disk
-            cpu_tensor = tensor.detach().cpu()
-            torch.save(cpu_tensor, file_path)
-            del cpu_tensor
-
-            with self.manager_lock:
-                # Mark file as ready
-                self.file_status[file_path] = "ready"
-
-            # Release semaphore
-            self.save_semaphore.release()
-
-            return True
-        except FileNotFoundError as e:
-            logger.error(f"Error saving tensor to {file_path}: {e}")
-            with self.manager_lock:
-                self.file_status[file_path] = "error"
-
-            # Release semaphore
-            self.save_semaphore.release()
-
-            return False
-
-    def _prefetch_worker(self):
-        """Background thread that loads tensors from disk ahead of time"""
-        while not self.stop_event.is_set():
-            try:
-                file_path = self.prefetch_queue.get(timeout=0.5)
-                if file_path is None:
-                    continue
-
-                # Check if file is available and not already in cache
-                with self.manager_lock:
-                    if (
-                        file_path not in self.file_status
-                        or self.file_status[file_path] == "deleted"
-                    ):
-                        self.prefetch_queue.task_done()
-                    if file_path in self.prefetch_cache:
-                        self.prefetch_queue.task_done()
-                        continue
-
-                    # If file is still being saved, wait for it
-                    if (
-                        self.file_status[file_path] == "saving"
-                        and file_path in self.save_futures
-                    ):
-                        # Re-queue this prefetch request with a little delay
-                        self.prefetch_queue.task_done()
-                        time.sleep(0.1)
-                        self.prefetch_queue.put(file_path)
-                        continue
-
-                    # Mark file as being prefetched
-                    self.file_status[file_path] = "prefetching"
-
-                # Load tensor from disk and store in cache
-                try:
-                    if os.path.exists(file_path):
-                        if self.prefetch_to_gpu:
-                            tensor = torch.load(
-                                file_path,
-                                map_location=torch.device("cuda"),
-                                weights_only=True,
-                            )
-                        else:
-                            tensor = torch.load(file_path, weights_only=True)
-
-                        with self.manager_lock:
-                            self.prefetch_cache[file_path] = tensor
-                            self.file_status[file_path] = "ready"
-                    else:
-                        with self.manager_lock:
-                            if self.file_status.get(file_path) != "deleted":
-                                logger.warning(
-                                    f"Prefetch error: File not found {file_path}"
-                                )
-                                self.file_status[file_path] = "missing"
-
-                except FileNotFoundError as e:
-                    with self.manager_lock:
-                        if self.file_status.get(file_path) != "deleted":
-                            logger.warning(f"Prefetch error for {file_path}: {e}")
-                            self.file_status[file_path] = "error"
-
-                self.prefetch_queue.task_done()
-
-            except queue.Empty:
-                time.sleep(0.01)  # Small sleep to prevent CPU spinning
-                continue
-
-    def save_tensor(self, tensor: torch.Tensor):
-        """Save tensor to disk asynchronously and return file path with thread-safe operations"""
-        # Generate unique file path
-        self.idx += 1
-        file_path: str = os.path.join(
-            self.temp_dir, f"{self.idx:06d}-{uuid.uuid4()}.pt"
-        )
-
-        with self.manager_lock:
-            # Mark file as being saved
-            self.file_locks[file_path] = threading.Lock()
-            self.file_status[file_path] = "saving"
-            # Add to history
-            self.tensor_paths.append(file_path)
-
-        # Acquire semaphore to limit concurrent save operations
-        self.save_semaphore.acquire()  # pylint: disable=consider-using-with
-        # Queue tensor for saving in background
-        self.save_queue.put((tensor.detach(), file_path))
-
-        return file_path
-
-    def wait_for_save(self, file_path, timeout=None) -> None:
-        """Wait for a tensor to be saved to disk"""
-        start_time = time.time()
-        while timeout is None or time.time() - start_time < timeout:
-            with self.manager_lock:
-                if self.file_status.get(file_path) == "ready":
-                    return
-                if self.file_status.get(file_path) in ["error", "missing", "deleted"]:
-                    return
-
-                if file_path in self.save_futures:
-                    future = self.save_futures[file_path]
-                    if future.done():
-                        return
-
-            # Small sleep to prevent CPU spinning
-            time.sleep(0.01)
-
-        # Timeout
-        logger.warning(f"Timeout waiting for tensor to be saved: {file_path}")
-        return
-
-    def load_tensor(self, file_path, target_device="cuda"):
-        """Load tensor from disk or prefetch cache with proper synchronization"""
-        # Wait for tensor to be saved if it's still in progress
-        self.wait_for_save(file_path)
-
-        tensor = None
-
-        # Try to get from cache first
-        with self.manager_lock:
-            # Check if tensor is already in cache
-            if file_path in self.prefetch_cache:
-                tensor = self.prefetch_cache[file_path]
-                del self.prefetch_cache[file_path]
-                self.file_status[file_path] = "loaded"
-
-        if tensor is not None:
-            # Ensure tensor is on correct device
-            if target_device != "cpu" and tensor.device.type == "cpu":
-                tensor = tensor.to(target_device, non_blocking=True)
-            return tensor
-
-        # If not in cache, load directly from disk
-        try:
-            if not os.path.exists(file_path):
-                logger.error(f"File not found for loading: {file_path}")
-                raise FileNotFoundError(f"File not found: {file_path}")
-
-            tensor = torch.load(file_path, weights_only=True)
-
-            with self.manager_lock:
-                self.file_status[file_path] = "loaded"
-
-            if target_device != "cpu":
-                tensor = tensor.to(target_device, non_blocking=True)
-
-            return tensor
-
-        except Exception as e:
-            logger.error(f"Error loading tensor from {file_path}: {e}")
-            raise
-
-    def _safe_delete_file(self, file_path):
-        """Safely delete a file with proper synchronization"""
-        with self.manager_lock:
-            # Make sure any save operation is completed
-            if file_path in self.save_futures:
-                future = self.save_futures[file_path]
-                try:
-                    if not future.done():
-                        future.cancel()
-                    del self.save_futures[file_path]
-                except FileNotFoundError as e:
-                    logger.warning(
-                        f"Error canceling save operation for {file_path}: {e}"
-                    )
-
-            # Only delete if file exists and is not being prefetched
-            status = self.file_status.get(file_path)
-            if status in ["ready", "loaded", "error", "missing"]:
-                try:
-                    if os.path.exists(file_path):
-                        os.remove(file_path)
-                    self.file_status[file_path] = "deleted"
-                    return True
-                except FileNotFoundError as e:
-                    logger.warning(f"Error deleting file {file_path}: {e}")
-        return False
-
-    def trigger_prefetch(self, n=None):
-        """Trigger prefetching of the next N tensors with proper synchronization"""
-        if n is None:
-            n = self.max_prefetch
-
-        prefetch_paths = []
-        with self.manager_lock:
-            # Find files that are ready to be prefetched (not already in cache or being prefetched)
-            for path in reversed(self.tensor_paths):
-                if (
-                    path not in self.prefetch_cache
-                    and self.file_status.get(path) == "ready"
-                ):
-                    prefetch_paths.append(path)
-                    if len(prefetch_paths) >= n:
-                        break
-
-        # Queue files for prefetching
-        for path in prefetch_paths:
-            self.prefetch_queue.put(path)
-
-    def cleanup_tensor(self, file_path: str):
-        """Clean up a specific tensor file after it's been used"""
-        with self.manager_lock:
-            if file_path in self.tensor_paths:
-                self.tensor_paths.remove(file_path)
-
-            # Remove from prefetch cache if present
-            if file_path in self.prefetch_cache:
-                del self.prefetch_cache[file_path]
-
-            # Remove from save futures if present
-            if file_path in self.save_futures:
-                future = self.save_futures[file_path]
-                if not future.done():
-                    future.cancel()
-                del self.save_futures[file_path]
-
-        # Try to delete the file
-        self._safe_delete_file(file_path)
-
-    def cleanup(self):
-        """Clean up all temp files and stop prefetch thread with proper synchronization"""
-        self.stop_event.set()
-
-        # Cancel all pending save operations
-        with self.manager_lock:
-            for _, future in self.save_futures.items():
-                if not future.done():
-                    future.cancel()
-            self.save_futures.clear()
-
-        # Drain the save queue
-        while not self.save_queue.empty():
-            try:
-                self.save_queue.get_nowait()
-                self.save_queue.task_done()
-            except queue.Empty:
-                break
-
-        # Shutdown the save pool
-        self.save_pool.shutdown(wait=False)
-
-        # Join the save worker thread
-        if self.save_worker.is_alive():
-            self.save_worker.join(timeout=2.0)
-
-        # Join the prefetch worker threads
-        for thread in self.prefetch_workers:
-            if thread.is_alive():
-                thread.join(timeout=2.0)
-
-        # Clear cache and remove all temporary files
-        with self.manager_lock:
-            self.prefetch_cache.clear()
-            paths_to_delete = list(self.tensor_paths)
-            self.tensor_paths.clear()
-
-        # Delete all temporary files
-        for path in paths_to_delete:
-            self._safe_delete_file(path)
-
-        # Remove temp directory
-        try:
-            if os.path.exists(self.temp_dir):
-                shutil.rmtree(self.temp_dir, ignore_errors=True)
-        except FileNotFoundError as e:
-            logger.warning(f"Error removing temporary directory {self.temp_dir}: {e}")
-
-
-class Disco(torch.autograd.Function):
-    """
-    Disco: DIsk-based Storage and Checkpointing with Optimized prefetching
-    Advanced disk-based gradient checkpointer with prefetching.
-    """
-
-    # Shared manager instance across all checkpointing operations
-    _manager = None
-
-    @staticmethod
-    def get_instance(prefetch_size=1, prefetch_to_gpu=True, save_workers=4):
-        """Get or create the offload manager"""
-        if Disco._manager is None:
-            Disco._manager = DiskOffloadManager(
-                prefetch_size=prefetch_size,
-                prefetch_to_gpu=prefetch_to_gpu,
-                save_workers=save_workers,
-            )
-        return Disco._manager
-
-    @staticmethod
-    @torch_cuda_amp_custom_fwd
-    def forward(
-        ctx,
-        forward_function,
-        hidden_states,
-        *args,
-        prefetch_size=1,
-        prefetch_to_gpu=True,
-        save_workers=4,
-    ):
-        """Forward pass that offloads activations to disk asynchronously"""
-        # Get or create the manager
-        manager = Disco.get_instance(
-            prefetch_size=prefetch_size,
-            prefetch_to_gpu=prefetch_to_gpu,
-            save_workers=save_workers,
-        )
-
-        # Save tensor to disk asynchronously
-        file_path = manager.save_tensor(hidden_states)
-
-        # Run forward pass immediately without waiting for save to complete
-        with torch.no_grad():
-            output = forward_function(hidden_states, *args)
-
-        # Store what we need for backward
-        ctx.save_for_backward(torch.tensor([0]))  # Dummy tensor
-        ctx.file_path = file_path
-        ctx.forward_function = forward_function
-        ctx.args = args
-
-        return output
-
-    @staticmethod
-    @torch_cuda_amp_custom_bwd
-    def backward(ctx, *grad_outputs):
-        """Backward pass that loads activations from disk with prefetching"""
-        # Get the manager
-        manager = Disco._manager
-
-        # Trigger prefetching for future tensors
-        # This happens at the start of backward, so should have time to complete
-        manager.trigger_prefetch()
-
-        # Load hidden states from disk or prefetch cache
-        file_path = ctx.file_path
-        try:
-            # Ensure the file is saved before we try to load it
-            manager.wait_for_save(file_path)
-
-            hidden_states = manager.load_tensor(file_path)
-            hidden_states.requires_grad = True
-
-            # Compute gradients
-            with torch.enable_grad():
-                output = ctx.forward_function(hidden_states, *ctx.args)
-
-            # Handle tuple outputs properly
-            if isinstance(output, tuple):
-                if len(grad_outputs) == len(output):
-                    torch.autograd.backward(output, grad_outputs)
-                else:
-                    torch.autograd.backward(output, grad_outputs[0])
-            else:
-                torch.autograd.backward(output, grad_outputs[0])
-
-            # Clean up the file after we're done with it
-            manager.cleanup_tensor(file_path)
-
-            return (
-                (
-                    None,  # forward_function
-                    hidden_states.grad,  # hidden_states grad
-                )
-                + (None,) * len(ctx.args)  # for each arg
-                + (
-                    None,  # prefetch_size
-                    None,  # prefetch_to_gpu
-                    None,  # save_workers
-                )
-            )
-
-        except Exception as e:
-            logger.error(f"Error in backward pass: {e}")
-            # Clean up the file even on error
-            manager.cleanup_tensor(file_path)
-            raise
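Illustrative aside (not part of the diff): the manager the Disco checkpointer builds on can be exercised in isolation. A minimal CPU-only round-trip sketch, assuming a checkout that still ships this module (it is removed on the target branch above):

# Hedged sketch of the DiskOffloadManager save/load round-trip; prefetch_to_gpu=False
# keeps everything on CPU so no GPU is required.
import torch

from axolotl.monkeypatch.gradient_checkpointing.offload_disk import DiskOffloadManager

manager = DiskOffloadManager(prefetch_size=1, prefetch_to_gpu=False, save_workers=1)

activation = torch.randn(2, 16)
path = manager.save_tensor(activation)   # queued for an async save to a temp .pt file
manager.wait_for_save(path)               # block until the background save completes

restored = manager.load_tensor(path, target_device="cpu")
assert torch.equal(activation, restored)

manager.cleanup_tensor(path)               # delete the backing file
manager.cleanup()                          # stop worker threads, remove the temp dir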
@@ -18,8 +18,6 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
     "mixtral",
     "qwen2",
     "qwen2_moe",
-    "qwen3",
-    "qwen3_moe",
     "falcon",
     "phi",
     "phi3",
@@ -1,78 +0,0 @@
-"""
-Patch prepare_model_for_kbit_training to not upcast everything
-"""
-
-import inspect
-import logging
-
-import peft
-
-import axolotl
-from axolotl.monkeypatch.utils import detab_code
-
-LOG = logging.getLogger(__name__)
-
-ORIGINAL_PREPARE_CODE = """
-    for param in model.parameters():
-        if (
-            (param.dtype == torch.float16) or (param.dtype == torch.bfloat16)
-        ) and param.__class__.__name__ != "Params4bit":
-            param.data = param.data.to(torch.float32)
-"""
-
-PATCHED_PREPARE_CODE = """
-    for name, param in model.named_parameters():
-        if (
-            (param.dtype == torch.float16) or (param.dtype == torch.bfloat16)
-        ) and param.__class__.__name__ != "Params4bit" and all(embed_name not in name for embed_name in ["embed_tokens", "lm_head"]):
-            param.data = param.data.to(torch.float32)
-"""
-
-
-def get_peft_prep_code() -> str:
-    prepare = inspect.getsource(peft.utils.other.prepare_model_for_kbit_training)
-    return prepare
-
-
-def check_peft_prep_code_is_patchable() -> bool:
-    prep_code = get_peft_prep_code()
-    prep_code, _ = detab_code(prep_code)
-    return ORIGINAL_PREPARE_CODE in prep_code
-
-
-def patch_peft_prep_code():
-    """
-    monkeypatch create_accelerator_and_postprocess so it checks for additional kwargs
-    """
-
-    try:
-        prep_code = get_peft_prep_code()
-    except OSError:
-        return
-    peft.utils.other._original_create_accelerator_and_postprocess = (  # pylint: disable=protected-access
-        prep_code
-    )
-    prep_code, _ = detab_code(prep_code)
-    if ORIGINAL_PREPARE_CODE not in prep_code:
-        return
-
-    prep_code = prep_code.replace(ORIGINAL_PREPARE_CODE, PATCHED_PREPARE_CODE)
-    prep_code = prep_code.replace(
-        "def prepare_model_for_kbit_training(",
-        "def fixed_prepare_model_for_kbit_training(",
-        1,
-    )
-
-    items_to_import = []
-    for item in dir(peft.utils.other):
-        if item in prep_code:
-            items_to_import.append(item)
-
-    exec(  # pylint: disable=exec-used  # nosec B102
-        "from peft.utils.other import (" + ", ".join(x for x in items_to_import) + ")",
-        globals(),
-    )
-    exec(prep_code, globals())  # pylint: disable=exec-used  # nosec B102
-    LOG.info("patching prepare_model_for_kbit_training to allow for overrides")
-    peft.utils.other.prepare_model_for_kbit_training = fixed_prepare_model_for_kbit_training  # pylint: disable=protected-access  # pylint: disable=undefined-variable  # noqa: F821
-    axolotl.loaders.model.prepare_model_for_kbit_training = fixed_prepare_model_for_kbit_training  # pylint: disable=protected-access  # pylint: disable=undefined-variable  # noqa: F821
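Illustrative aside (not part of the diff): the effect of PATCHED_PREPARE_CODE, shown as a standalone loop on a hypothetical toy model. Embedding and LM-head weights keep their half-precision dtype while other small parameters are upcast; quantized Params4bit weights are untouched.

# Hedged, self-contained sketch of the patched upcast rule.
import torch
from torch import nn

model = nn.Module()
model.embed_tokens = nn.Embedding(10, 8, dtype=torch.bfloat16)
model.lm_head = nn.Linear(8, 10, dtype=torch.bfloat16)
model.norm = nn.LayerNorm(8, dtype=torch.bfloat16)

for name, param in model.named_parameters():
    if (
        param.dtype in (torch.float16, torch.bfloat16)
        and param.__class__.__name__ != "Params4bit"
        and all(embed_name not in name for embed_name in ["embed_tokens", "lm_head"])
    ):
        param.data = param.data.to(torch.float32)

assert model.embed_tokens.weight.dtype == torch.bfloat16  # left in half precision
assert model.norm.weight.dtype == torch.float32           # upcast as before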
@@ -1,22 +0,0 @@
-"""Init for ring attention monkeypatch module"""
-
-# pylint: disable=unused-import
-# flake8: noqa
-
-from .patch import (
-    get_ring_attn_group,
-    patch_prepare_data_loader,
-    patch_prepare_device_mesh,
-    register_ring_attn,
-    set_ring_attn_group,
-    update_ring_attn_params,
-)
-
-__all__ = (
-    "get_ring_attn_group",
-    "patch_prepare_data_loader",
-    "patch_prepare_device_mesh",
-    "register_ring_attn",
-    "set_ring_attn_group",
-    "update_ring_attn_params",
-)
@@ -1,225 +0,0 @@
-"""Ring attention group registration and flash attention patching.
-
-Make use of the `ring-flash-attn` (https://github.com/zhuzilin/ring-flash-attention)
-package, specifically the `hf_adapter.substitute_hf_flash_attn` function to patch in
-their sequence parallel version of Flash Attention 2.
-
-We also provide some patches for accelerate functions to prepare the dataloader for
-sequence parallelism training.
-"""
-
-import inspect
-
-import accelerate
-import torch
-import torch.distributed as dist
-from accelerate.logging import get_logger
-
-from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
-from axolotl.utils.schemas.enums import RingAttnFunc
-
-LOG = get_logger(__name__)
-
-
-RING_ATTN_GROUP = None
-
-ORIGINAL_PREPARE_DATALOADER_CODE = """            submesh_fsdp_size = 1
-            submesh_dp_size = 1
-            submesh_tp_size = 1
-            if "tp" in torch_device_mesh.mesh_dim_names:
-                submesh_tp_size = torch_device_mesh["tp"].size()
-            if "dp" in torch_device_mesh.mesh_dim_names:
-                submesh_dp_size = torch_device_mesh["dp"].size()
-            if "fsdp" in torch_device_mesh.mesh_dim_names:
-                submesh_fsdp_size = torch_device_mesh["fsdp"].size()
-            process_index = process_index // submesh_tp_size"""
-
-NEW_PREPARE_DATALOADER_CODE = """            submesh_fsdp_size = 1
-            submesh_dp_size = 1
-            submesh_tp_size = 1
-            submesh_cp_size = 1
-            if "cp" in torch_device_mesh.mesh_dim_names:
-                submesh_cp_size = torch_device_mesh["cp"].size()
-            if "tp" in torch_device_mesh.mesh_dim_names:
-                submesh_tp_size = torch_device_mesh["tp"].size()
-            if "dp" in torch_device_mesh.mesh_dim_names:
-                submesh_dp_size = torch_device_mesh["dp"].size()
-            if "fsdp" in torch_device_mesh.mesh_dim_names:
-                submesh_fsdp_size = torch_device_mesh["fsdp"].size()
-            process_index = process_index // (submesh_tp_size * submesh_cp_size)"""
-
-
-def get_ring_attn_group() -> dist.ProcessGroup:
-    """Getter for ring attention group on this rank."""
-    if RING_ATTN_GROUP is None:
-        raise RuntimeError("register_ring_attn() not yet called")
-    return RING_ATTN_GROUP
-
-
-def set_ring_attn_group(ring_attn_group: dist.ProcessGroup | None):
-    """Setter for ring attention group on this rank."""
-    global RING_ATTN_GROUP  # pylint: disable=global-statement
-    RING_ATTN_GROUP = ring_attn_group
-
-
-def register_ring_attn(
-    sequence_parallel_degree: int,
-    heads_k_stride: int | None,
-    ring_attn_func: RingAttnFunc | None,
-):
-    """Create ring attention group and substitute flash attn with ring flash attn.
-
-    Args:
-        sequence_parallel_degree: Sequence parallelism factor.
-        heads_k_stride: Sequence parallelism K head stride size. Passed through to
-            `varlen_llama3` `ring_flash_attn` implementation.
-        ring_attn_func: `ring_flash_attn` ring attention implementation. If sample
-            packing is enabled, it must be a `varlen` function; otherwise, it must be a
-            `batch` function.
-    """
-    rank = dist.get_rank()
-    world_size = dist.get_world_size()
-
-    if rank == 0:
-        LOG.info(
-            "Enabling ring attention sequence parallelism: "
-            f"each sequence will be processed across {sequence_parallel_degree} GPUs"
-        )
-
-    assert sequence_parallel_degree <= world_size, (
-        f"sequence_parallel_degree ({sequence_parallel_degree}) "
-        f"must be less than or equal to world_size ({world_size})"
-    )
-    assert world_size % sequence_parallel_degree == 0, (
-        f"sequence_parallel_degree ({sequence_parallel_degree}) "
-        f"must evenly divide world_size ({world_size})"
-    )
-
-    # Assign ranks to sequence parallel groups
-    group_assignments = {}
-    for i in range(world_size // sequence_parallel_degree):
-        ring_attn_ranks = list(
-            range(
-                i * sequence_parallel_degree,
-                (i + 1) * sequence_parallel_degree,
-            )
-        )
-        group = dist.new_group(ranks=ring_attn_ranks, backend="nccl")
-
-        # Track which GPUs are in which groups
-        for r in ring_attn_ranks:
-            group_assignments[r] = i
-
-        if rank in ring_attn_ranks:
-            set_ring_attn_group(group)
-
-    # Log the GPU group assignments
-    if rank == 0:
-        LOG.info(f"Sequence parallel group assignments: {group_assignments}")
-
-    if ring_attn_func is RingAttnFunc.VARLEN_LLAMA3:
-        from ring_flash_attn import substitute_hf_flash_attn
-
-        substitute_hf_flash_attn(
-            process_group=get_ring_attn_group(), heads_k_stride=heads_k_stride or 1
-        )
-    elif ring_attn_func is RingAttnFunc.BATCH_RING:
-        from axolotl.monkeypatch.ring_attn.adapters.batch import (
-            substitute_hf_flash_attn,
-        )
-
-        substitute_hf_flash_attn(
-            process_group=get_ring_attn_group(),
-            ring_attn_func=ring_attn_func,
-        )
-
-
-def update_ring_attn_params(position_ids: torch.Tensor | None):
-    """
-    Calculate the cumulative sequence lengths for the current forward pass and pass the
-    value to the substituted `ring_flash_attn`.
-
-    Args:
-        position_ids: Optional tensor of position IDs (for sample packed data).
-    """
-    from ring_flash_attn import update_ring_flash_attn_params
-
-    cu_seqlens, _ = get_cu_seqlens_from_pos_ids(position_ids)
-    cu_seqlens = cu_seqlens.squeeze().to(device=torch.cuda.current_device())
-    update_ring_flash_attn_params(cu_seqlens, get_ring_attn_group())
-
-
-def patch_prepare_data_loader():
-    """Patch `accelerate.data_loader.prepare_data_loader` to respect the SP degree.
-
-    Raises:
-        RuntimeError: If source code to patch does not exist.
-    """
-    original_fn = accelerate.data_loader.prepare_data_loader
-    original_source = inspect.getsource(original_fn)
-
-    if ORIGINAL_PREPARE_DATALOADER_CODE not in original_source:
-        raise RuntimeError(
-            "SP patch failed - target snippet not found. "
-            "Check accelerate's version or update the patch."
-        )
-
-    patched_source = original_source.replace(
-        ORIGINAL_PREPARE_DATALOADER_CODE, NEW_PREPARE_DATALOADER_CODE
-    )
-
-    # Create a new function from the patched source
-    namespace = {}
-    exec(  # pylint: disable=exec-used  # nosec B102
-        patched_source, accelerate.data_loader.__dict__, namespace
-    )
-    patched_function = namespace["prepare_data_loader"]
-
-    accelerate.data_loader.prepare_data_loader = patched_function
-    LOG.info("Patched accelerate.data_loader.prepare_data_loader for SP support")
-
-
-def patch_prepare_device_mesh(sequence_parallel_degree: int):
-    """Patches the `Accelerator._prepare_device_mesh` method to create a device mesh
-    that includes sequence parallelism with the specified degree.
-
-    Args:
-        sequence_parallel_degree (int): The degree of sequence parallelism to use.
-    """
-
-    def _prepare_device_mesh(self):
-        """Prepare the device mesh for distributed training. The dataloader will
-        determine how to load data based on the device mesh.
-        """
-        if self.state.torch_tp_plugin:
-            return self.state.torch_tp_plugin.torch_device_mesh
-        if (
-            self.distributed_type == accelerate.accelerator.DistributedType.DEEPSPEED
-            and hasattr(self.state, "ds_device_mesh")
-        ):
-            return self.state.ds_device_mesh
-
-        # Create device mesh with sequence parallelism
-        world_size = dist.get_world_size()
-        mesh_shape = (
-            world_size // sequence_parallel_degree,
-            sequence_parallel_degree,
-        )
-        device_ids = list(range(world_size))
-
-        # Note that we use "cp" instead of "sp" to match the PyTorch native "context
-        # parallelism" implementation naming
-        return dist.DeviceMesh(
-            "cuda",
-            torch.tensor(device_ids).reshape(mesh_shape),
-            mesh_dim_names=("dp", "cp"),
-        )
-
-    # Replace the original method with our new method
-    # pylint: disable=protected-access
-    accelerate.accelerator.Accelerator._prepare_device_mesh = _prepare_device_mesh
-
-    LOG.info(
-        "Successfully patched Accelerator._prepare_device_mesh "
-        f"with sequence_parallel_degree={sequence_parallel_degree}"
-    )
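Illustrative aside (not part of the diff): the rank-grouping arithmetic in `register_ring_attn` can be checked without torch.distributed. With world_size=8 and sequence_parallel_degree=4, ranks split into two ring-attention groups, [0..3] and [4..7]:

# Hedged, dependency-free sketch of the group assignment logic shown above.
world_size = 8
sequence_parallel_degree = 4

group_assignments = {}
for i in range(world_size // sequence_parallel_degree):
    ring_attn_ranks = list(
        range(i * sequence_parallel_degree, (i + 1) * sequence_parallel_degree)
    )
    for r in ring_attn_ranks:
        group_assignments[r] = i

assert group_assignments == {0: 0, 1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 6: 1, 7: 1}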
@@ -424,20 +424,6 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):

 LOG.debug(f"Should train: {should_train}")

-# turn not trainable, skip having to find the turn indices
-# unless last turn and train_on_eos/train_on_eot is all
-if not should_train and (
-    self.train_on_eos != "all" and self.train_on_eot != "all"
-):
-    if index == len(turns) - 1:
-        LOG.warning(
-            "Last turn is not trainable, skipping having to find the turn indices. "
-            "This may cause incorrect last EOT/EOS token to be unmasked."
-            "This is likely a dataset design issue. Please ensure last turn is trainable."
-        )
-
-    continue
-
 turn_start_idx, turn_end_idx = self.find_turn(turns=turns, turn_idx=index)

 LOG.debug(f"Turn indices: start={turn_start_idx}, end={turn_end_idx}")
@@ -2,17 +2,17 @@

 import importlib
 import inspect
-import logging
 import os
 import signal
 import sys
 import weakref
-from contextlib import ExitStack
+from contextlib import nullcontext
 from pathlib import Path
 from typing import Any, Dict

 import torch
 import transformers.modelcard
+from accelerate.logging import get_logger
 from accelerate.utils import save_fsdp_model
 from datasets import Dataset
 from huggingface_hub.errors import OfflineModeIsEnabled
@@ -21,23 +21,19 @@ from transformers import PreTrainedModel, PreTrainedTokenizer, ProcessorMixin
 from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
 from transformers.trainer import Trainer

-from axolotl.cli.art import print_axolotl_text_art
 from axolotl.common.datasets import TrainDatasetMeta
 from axolotl.contribs.lgpl import (  # pylint: disable = no-name-in-module
     fix_untrained_tokens,
 )
 from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder
-from axolotl.integrations.base import PluginManager
-from axolotl.loaders import (
-    ModelLoader,
-    load_processor,
-    load_tokenizer,
-)
-from axolotl.utils.ctx_managers.sequence_parallel import SequenceParallelContextManager
+from axolotl.core.trainers.mixins.sequence_parallel import (
+    SequenceParallelContextManager,
+)
+from axolotl.integrations.base import PluginManager
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import cleanup_distributed
 from axolotl.utils.freeze import freeze_layers_except
-from axolotl.utils.schemas.enums import RLType
+from axolotl.utils.models import load_model, load_processor, load_tokenizer
 from axolotl.utils.trainer import setup_trainer

 try:
@@ -45,7 +41,7 @@ try:
 except ImportError:
     BetterTransformer = None

-LOG = logging.getLogger(__name__)
+LOG = get_logger(__name__)


 def setup_model_and_tokenizer(
@@ -66,6 +62,7 @@ def setup_model_and_tokenizer(
     # Load tokenizer
     LOG.debug(
         f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
+        main_process_only=True,
     )
     tokenizer = load_tokenizer(cfg)
@@ -80,8 +77,7 @@ def setup_model_and_tokenizer(
        msg += " and peft_config..."
     LOG.debug(msg)

-    model_loader = ModelLoader(cfg, tokenizer, processor=processor)
-    model, peft_config = model_loader.load()
+    model, peft_config = load_model(cfg, tokenizer, processor=processor)
     if model.generation_config is not None:
         model.generation_config.do_sample = True
@@ -111,15 +107,14 @@ def setup_reference_model(
         Reference model if needed for RL training, `None` otherwise.
     """
     model_ref = None
-    if cfg.rl and cfg.rl != RLType.ORPO:
+    if cfg.rl and cfg.rl != "orpo":
         if cfg.adapter and not cfg.rl_adapter_ref_model:
             # use built-in trl autounwrap
             LOG.debug("Passing model_ref: None to RL trainer")
             model_ref = None  # explicit setting to None
         else:
             # load the model again for model_ref/baseline
-            model_loader = ModelLoader(cfg, tokenizer, reference_model=True)
-            model_ref, _ = model_loader.load()
+            model_ref, _ = load_model(cfg, tokenizer, reference_model=True)
     return model_ref
@@ -193,33 +188,28 @@ def execute_training(
         trainer: The configured trainer object.
         resume_from_checkpoint: Path to checkpoint to resume from, if applicable.
     """
-    with ExitStack() as stack:
-        # Define the context managers to use
-        if cfg.flash_optimum:
-            stack.enter_context(
-                torch.backends.cuda.sdp_kernel(
-                    enable_flash=True,
-                    enable_math=True,
-                    enable_mem_efficient=True,
-                )
-            )
-
-        if cfg.sequence_parallel_degree > 1:
-            models = [trainer.model]
-            if hasattr(trainer, "ref_model"):
-                models.append(trainer.ref_model)
-
-            stack.enter_context(
-                SequenceParallelContextManager(
-                    models=models,
-                    sequence_parallel_degree=cfg.sequence_parallel_degree,
-                    gradient_accumulation_steps=cfg.gradient_accumulation_steps,
-                    ring_attn_func=cfg.ring_attn_func,
-                    heads_k_stride=cfg.heads_k_stride,
-                )
-            )
-
-        LOG.info("Starting trainer...")
-        trainer.train(resume_from_checkpoint=resume_from_checkpoint)
+    # Define the context managers to use
+    flash_context = (
+        torch.backends.cuda.sdp_kernel(
+            enable_flash=True,
+            enable_math=True,
+            enable_mem_efficient=True,
+        )
+        if cfg.flash_optimum
+        else nullcontext()
+    )
+    sequence_parallel_context = (
+        SequenceParallelContextManager(
+            model=trainer.model,
+            sequence_parallel_degree=cfg.sequence_parallel_degree,
+            ring_attn_func=cfg.ring_attn_func,
+        )
+        if cfg.sequence_parallel_degree > 1
+        else nullcontext()
+    )
+
+    LOG.info("Starting trainer...")
+    with flash_context, sequence_parallel_context:
+        trainer.train(resume_from_checkpoint=resume_from_checkpoint)
@@ -526,8 +516,6 @@ def train(
     Returns:
         Tuple of (model, tokenizer) after training
     """
-    print_axolotl_text_art()
-
     # Setup model, tokenizer, (causal or RLHF) trainer, etc.
     (
         trainer,
@@ -537,9 +525,6 @@ def train(
         processor,
     ) = setup_model_and_trainer(cfg, dataset_meta)

-    plugin_manager = PluginManager.get_instance()
-    plugin_manager.post_trainer_create(cfg, trainer)
-
     # Handle untrained tokens if configured
     safe_serialization = cfg.save_safetensors is True
     train_dataset = dataset_meta.train_dataset
@@ -562,6 +547,7 @@ def train(
     if not cfg.use_ray:
         cleanup_distributed()

+    plugin_manager = PluginManager.get_instance()
     plugin_manager.post_train(cfg, model)

     return model, tokenizer, trainer
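Illustrative aside (not part of the diff): the right-hand side of `execute_training` above relies on `contextlib.nullcontext()` as a no-op stand-in so the `with` statement stays unconditional. A minimal sketch of that pattern, with `flag` as a hypothetical placeholder for the config options involved:

# Hedged sketch of the conditional-context pattern.
from contextlib import contextmanager, nullcontext

@contextmanager
def feature_context():
    print("feature enabled")
    yield

flag = False
ctx = feature_context() if flag else nullcontext()

with ctx:
    pass  # trainer.train(...) would run here either way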
@@ -43,12 +43,3 @@ def set_pytorch_cuda_alloc_conf():
     os.environ["PYTORCH_CUDA_ALLOC_CONF"] = (
         "expandable_segments:True,roundup_power2_divisions:16"
     )
-
-
-def patch_optimized_env():
-    """
-    Patch environment variables to improve VRAM usage and increase download speed
-    """
-    if os.getenv("HF_HUB_ENABLE_HF_TRANSFER") is None:
-        os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-    set_pytorch_cuda_alloc_conf()
@@ -868,28 +868,3 @@ class GCCallback(TrainerCallback):
         ):
             torch.cuda.empty_cache()
             gc.collect()
-
-
-def colab_inference_post_train_callback(trainer: Trainer):
-    class ColabCallback(TrainerCallback):
-        """Callback to prep model for inference on Google Colab"""
-
-        def __init__(self, cfg):
-            self.gpu_name = torch.cuda.get_device_name(0)
-            self.cfg = cfg
-
-        def on_train_end(
-            self, args, state, control, **kwargs
-        ):  # pylint: disable=unused-argument
-            """
-            handle T4 gpu, we need to convert attention to eager for inference
-            """
-            if "Tesla T4" in self.gpu_name and self.cfg.xformers_attention:
-                trainer.model.config._attn_implementation = (  # pylint: disable=protected-access
-                    "eager"
-                )
-            trainer.model.gradient_checkpointing_disable()
-            trainer.model.config.use_cache = True
-            trainer.model.eval()
-
-    return ColabCallback
@@ -1,7 +1,6 @@
 """MLFlow module for trainer callbacks"""

 import logging
-import os
 from shutil import copyfile
 from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING
@@ -17,11 +16,6 @@ if TYPE_CHECKING:
 LOG = logging.getLogger("axolotl.callbacks")


-def should_log_artifacts() -> bool:
-    truths = ["TRUE", "1", "YES"]
-    return os.getenv("HF_MLFLOW_LOG_ARTIFACTS", "FALSE").upper() in truths
-
-
 class SaveAxolotlConfigtoMlflowCallback(TrainerCallback):
     # pylint: disable=duplicate-code
     """Callback to save axolotl config to mlflow"""
@@ -38,7 +32,6 @@ class SaveAxolotlConfigtoMlflowCallback(TrainerCallback):
     ):
         if is_main_process():
             try:
-                if should_log_artifacts():
                 with NamedTemporaryFile(
                     mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
                 ) as temp_file:
@@ -47,10 +40,6 @@ class SaveAxolotlConfigtoMlflowCallback(TrainerCallback):
                     LOG.info(
                         "The Axolotl config has been saved to the MLflow artifacts."
                     )
-                else:
-                    LOG.info(
-                        "Skipping logging artifacts to MLflow (hf_mlflow_log_artifacts is false)"
-                    )
             except (FileNotFoundError, ConnectionError) as err:
                 LOG.warning(f"Error while saving Axolotl config to MLflow: {err}")
             return control
@@ -11,10 +11,9 @@ from transformers.utils.import_utils import is_torch_npu_available

 from axolotl.integrations.base import PluginManager
 from axolotl.integrations.config import merge_input_args
-from axolotl.loaders import MULTIMODAL_AUTO_MODEL_MAPPING
-from axolotl.loaders.utils import load_model_config
 from axolotl.utils.bench import log_gpu_memory_usage
 from axolotl.utils.dict import DictDefault
+from axolotl.utils.models import MULTIMODAL_AUTO_MODEL_MAPPING, load_model_config
 from axolotl.utils.schemas.config import (
     AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase,
 )
@@ -60,7 +59,7 @@ def choose_device(cfg):

 def resolve_dtype(cfg):
     if (
-        not cfg.fp16 and cfg.bf16 == "auto" and not cfg.use_ray
+        cfg.bf16 == "auto" and not cfg.use_ray
     ):  # if we use ray we want to defer this check to the worker node
         if is_torch_bf16_gpu_available():
             LOG.debug("bf16 support detected, enabling for this configuration.")
@@ -71,9 +70,6 @@ def resolve_dtype(cfg):
         if cfg.fp16 is None and not cfg.float16:
             cfg.fp16 = True

-    if cfg.fp16 and cfg.bf16 == "auto":
-        cfg.bf16 = False
-
     if cfg.device == "mps":
         cfg.load_in_8bit = False
         cfg.tf32 = False
@@ -1,6 +0,0 @@
-"""Init for context manager submodule"""
-
-# pylint: disable=unused-import
-# flake8: noqa
-
-from .sequence_parallel import SequenceParallelContextManager
@@ -1,376 +0,0 @@
|
|||||||
"""Module for Axolotl trainer sequence parallelism manager and utilities"""
|
|
||||||
|
|
||||||
import functools
|
|
||||||
import inspect
|
|
||||||
|
|
||||||
import torch
|
|
||||||
import torch.distributed as dist
|
|
||||||
from torch import nn
|
|
||||||
from torch.utils.hooks import RemovableHandle
|
|
||||||
from transformers.modeling_outputs import CausalLMOutputWithPast
|
|
||||||
from transformers.utils import ModelOutput
|
|
||||||
|
|
||||||
from axolotl.monkeypatch.ring_attn import (
|
|
||||||
get_ring_attn_group,
|
|
||||||
patch_prepare_data_loader,
|
|
||||||
patch_prepare_device_mesh,
|
|
||||||
register_ring_attn,
|
|
||||||
update_ring_attn_params,
|
|
||||||
)
|
|
||||||
from axolotl.utils.schemas.enums import RingAttnFunc
|
|
||||||
|
|
||||||
|
|
||||||
# TODO(djsaunde): implement zigzag, stripe patterns here (and elsewhere) in this
|
|
||||||
# module. Currently, we just focus on batch ring and varlen llama3 for simplicity.
|
|
||||||
def apply_sequence_parallelism(
|
|
||||||
batch: dict[str, torch.Tensor],
|
|
||||||
local_rank: int,
|
|
||||||
local_world_size: int,
|
|
||||||
gradient_accumulation_steps: int,
|
|
||||||
ring_attn_func: RingAttnFunc, # pylint: disable=unused-argument
|
|
||||||
) -> tuple[dict[str, torch.Tensor], int, int]:
|
|
||||||
"""
|
|
||||||
Apply sequence parallelism slicing to a batch.
|
|
||||||
|
|
||||||
Special handling is implemented for integer logits_to_keep, which indicates
|
|
||||||
to only keep the last N tokens in the sequence during generation.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
batch: Batch dictionary (e.g., input_ids, attention_mask, etc.).
|
|
||||||
local_rank: Local rank in the sequence parallel group.
|
|
||||||
local_world_size: World size of the sequence parallel group.
|
|
||||||
gradient_accumulation_steps: Number of steps to accumulate gradients over.
|
|
||||||
ring_attn_func: Which ring attention function to use. Currently unused, but
|
|
||||||
related to above TODO.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
tuple of:
|
|
||||||
- Batch dictionary with sliced tensors.
|
|
||||||
- The original sequence length before padding.
|
|
||||||
- The number of padding tokens added.
|
|
||||||
"""
|
|
||||||
original_seq_len = batch["input_ids"].size(1)
|
|
||||||
|
|
||||||
# Update ring attention params if needed
|
|
||||||
if batch.get("position_ids") is not None:
|
|
||||||
update_ring_attn_params(position_ids=batch["position_ids"])
|
|
||||||
else:
|
|
||||||
# If position_ids aren't already in the batch, create them
|
|
||||||
batch["position_ids"] = torch.arange(
|
|
||||||
0,
|
|
||||||
original_seq_len,
|
|
||||||
dtype=torch.long,
|
|
||||||
device=batch["input_ids"].device,
|
|
||||||
).expand(batch["input_ids"].size(0), -1)
|
|
||||||
|
|
||||||
if "logits_to_keep" in batch and isinstance(batch["logits_to_keep"], int):
|
|
||||||
logits_to_keep = batch["logits_to_keep"]
|
|
||||||
|
|
||||||
# Calculate which positions in the full sequence contain the last N tokens
|
|
||||||
start_position = max(0, original_seq_len - logits_to_keep)
|
|
||||||
chunk_size = original_seq_len // local_world_size
|
|
||||||
rank_start = local_rank * chunk_size
|
|
||||||
rank_end = rank_start + chunk_size
|
|
||||||
|
|
||||||
# Create a boolean mask tensor for this rank's chunk
|
|
||||||
mask = torch.zeros(
|
|
||||||
chunk_size,
|
|
||||||
dtype=torch.bool,
|
|
||||||
device=batch["input_ids"].device,
|
|
||||||
)
|
|
||||||
|
|
||||||
if rank_end > start_position:
|
|
||||||
# Calculate how many of the last N tokens fall within this rank's range
|
|
||||||
tokens_in_rank = min(rank_end, original_seq_len) - max(
|
|
||||||
rank_start, start_position
|
|
||||||
)
|
|
||||||
|
|
||||||
# Calculate where these tokens start in the local chunk
|
|
||||||
local_start_idx = max(0, start_position - rank_start)
|
|
||||||
|
|
||||||
# Set the appropriate positions in the mask to True
|
|
||||||
mask[local_start_idx : local_start_idx + tokens_in_rank] = True
|
|
||||||
|
|
||||||
# Replace the integer with the boolean mask
|
|
||||||
batch["logits_to_keep"] = mask
|
|
||||||
|
|
||||||
# Add padding to make sequence length divisible by local_world_size
|
|
||||||
total_seq_len = original_seq_len
|
|
||||||
pad_len = 0
|
|
||||||
divisor = min(local_world_size, 64)
|
|
||||||
if total_seq_len % divisor != 0:
|
|
||||||
pad_len = divisor - (total_seq_len % divisor)
|
|
||||||
|
|
||||||
# Apply padding to all relevant tensors
|
|
||||||
for key in batch:
|
|
||||||
if (
|
|
||||||
isinstance(batch[key], torch.Tensor)
|
|
||||||
and batch[key].dim() > 1
|
|
||||||
and batch[key].size(1) == total_seq_len
|
|
||||||
):
|
|
||||||
# Create padding tensor
|
|
||||||
pad_value = -100 if key == "labels" else 0
|
|
||||||
padding = torch.full(
|
|
||||||
(batch[key].size(0), pad_len, *batch[key].shape[2:]),
|
|
||||||
pad_value,
|
|
||||||
dtype=batch[key].dtype,
|
|
||||||
device=batch[key].device,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Concatenate padding to the right side of the tensor
|
|
||||||
batch[key] = torch.cat([batch[key], padding], dim=1)
|
|
||||||
if key == "logits_to_keep":
|
|
||||||
# Create padding tensor
|
|
||||||
padding = torch.ones(
|
|
||||||
1,
|
|
||||||
dtype=batch[key].dtype,
|
|
||||||
device=batch[key].device,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Concatenate padding to the right side of the tensor
|
|
||||||
batch[key] = torch.cat([batch[key], padding], dim=0)
|
|
||||||
|
|
||||||
# Update the total sequence length after padding
|
|
||||||
total_seq_len = batch["input_ids"].size(1)
|
|
||||||
|
|
||||||
# Slice batch for sequence parallel
|
|
||||||
for key in batch:
|
|
||||||
if not isinstance(batch[key], torch.Tensor) or batch[key].dim() <= 1:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Split in sequential fashion and grab this rank's chunk
|
|
||||||
if batch[key].size(1) == total_seq_len:
|
|
||||||
batch[key] = (
|
|
||||||
batch[key].chunk(local_world_size, dim=1)[local_rank].contiguous()
|
|
||||||
)
|
|
||||||
elif key == "logits_to_keep":
|
|
||||||
batch[key] = (
|
|
||||||
batch[key].chunk(local_world_size, dim=0)[local_rank].contiguous()
|
|
||||||
)
|
|
||||||
|
|
||||||
# Handle num_items_in_batch
|
|
||||||
if "num_items_in_batch" in batch:
|
|
||||||
# Approximation; this needed since num_items_in_batch may be counted across
|
|
||||||
# all samples in a gradient accumulated batch, not on a per-step basis.
|
|
||||||
batch["num_items_in_batch"] = (
|
|
||||||
batch["labels"] != -100
|
|
||||||
).sum() * gradient_accumulation_steps
|
|
||||||
|
|
||||||
return batch, original_seq_len, pad_len
|
|
||||||
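
A minimal sketch (not part of the module) of the padding and chunking arithmetic above, assuming a toy single-sample batch and four sequence-parallel ranks; the real function additionally updates ring-attention params and rewrites `logits_to_keep`:

# Hypothetical illustration of how each rank's shard is derived.
import torch

local_world_size = 4
seq_len = 10  # not divisible by 4
divisor = min(local_world_size, 64)
pad_len = (divisor - seq_len % divisor) % divisor  # 2 padding tokens

input_ids = torch.arange(seq_len).unsqueeze(0)  # shape (1, 10)
input_ids = torch.cat(
    [input_ids, torch.zeros(1, pad_len, dtype=input_ids.dtype)], dim=1
)  # shape (1, 12)

for local_rank in range(local_world_size):
    shard = input_ids.chunk(local_world_size, dim=1)[local_rank]
    print(local_rank, shard.shape)  # each rank holds a (1, 3) slice
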

class SequenceParallelContextManager:
    """Context manager for sequence parallelism operations.

    This class provides a context that will automatically apply sequence parallelism
    during model forward passes using a pre-forward hook, and gather outputs from
    across the sequence parallelism group using a post-forward hook.

    Args:
        models: List of models to attach the sequence parallelism pre- and
            post-forward hooks to.
        sequence_parallel_degree: Number of processes to split sequences over.
        gradient_accumulation_steps: Number of steps to accumulate gradients over.
        ring_attn_func: Which ring attention function to use. Currently unused.
        heads_k_stride: Sequence parallelism K head stride size. Passed through to
            the `varlen_llama3` `ring_flash_attn` implementation.
    """

    def __init__(
        self,
        models: list[nn.Module],
        sequence_parallel_degree: int,
        gradient_accumulation_steps: int,
        ring_attn_func: RingAttnFunc,
        heads_k_stride: int | None,
    ):
        self.models = models
        self.sequence_parallel_degree = sequence_parallel_degree
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.ring_attn_func = ring_attn_func
        self.heads_k_stride = heads_k_stride
        self._register_ring_attn()

        # Set distributed info for local rank
        self.process_group = get_ring_attn_group()
        self.local_rank = dist.get_rank(self.process_group)
        self.local_world_size = dist.get_world_size(self.process_group)

        # Will store hook handles for removal
        self.hook_handles: list[RemovableHandle] = []

        # Store original sequence length and padding information
        self.original_seq_len = 0
        self.pad_len = 0

        # Create a partially applied version of the apply_sequence_parallelism function
        self.apply_sequence_parallelism = functools.partial(
            apply_sequence_parallelism,
            local_rank=self.local_rank,
            local_world_size=self.local_world_size,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            ring_attn_func=self.ring_attn_func,
        )

    def __enter__(self):
        self._register_model_hooks()

        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Remove all hooks
        for handle in self.hook_handles:
            handle.remove()
        self.hook_handles = []

        # TODO(djsaunde): Un-patch attention and accelerate functions (low priority)

    def _register_ring_attn(self):
        # Initialize ring attn for sequence parallelism
        register_ring_attn(
            sequence_parallel_degree=self.sequence_parallel_degree,
            heads_k_stride=self.heads_k_stride,
            ring_attn_func=self.ring_attn_func,
        )

        # Patches for accelerate functionality
        patch_prepare_data_loader()
        patch_prepare_device_mesh(
            sequence_parallel_degree=self.sequence_parallel_degree
        )

    def _register_model_hooks(self):
        # Forward pre-hook to apply sequence parallelism
        def sequence_parallel_pre_hook(_, args, kwargs):
            # Get parameter names from the model's forward function
            forward_params = list(
                inspect.signature(self.models[0].forward).parameters.keys()
            )

            updated_kwargs = kwargs.copy()
            for i, arg in enumerate(args):
                if i < len(forward_params):
                    updated_kwargs[forward_params[i]] = arg

            # Any excess positional arguments are kept as-is
            remaining_args = args[len(forward_params) :]

            # Apply sequence parallelism to updated kwargs
            updated_kwargs, self.original_seq_len, self.pad_len = (
                self.apply_sequence_parallelism(updated_kwargs)
            )

            return remaining_args, updated_kwargs

        # Forward post-hook to gather outputs
        def sequence_parallel_post_hook(_, __, output: ModelOutput) -> ModelOutput:
            # Gather the sharded outputs
            output = self._gather_outputs(output)

            # Remove padding if it was added
            if self.pad_len > 0:
                for key, value in output.items():
                    if isinstance(value, torch.Tensor) and value.dim() > 1:
                        if value.size(1) == self.original_seq_len + self.pad_len:
                            # Slice to remove padding
                            output[key] = value[:, : self.original_seq_len].contiguous()

            return output

        # Register both hooks
        for model in self.models:
            self.hook_handles.append(
                model.register_forward_pre_hook(
                    sequence_parallel_pre_hook, with_kwargs=True
                )
            )
            self.hook_handles.append(
                model.register_forward_hook(sequence_parallel_post_hook)
            )

    def _gather_outputs(self, output: CausalLMOutputWithPast) -> CausalLMOutputWithPast:
        """Gather sharded outputs from all ranks and reconstruct the full tensor."""
        for key, value in output.items():
            if isinstance(value, torch.Tensor) and value.dim() > 1:
                output[key] = AllGatherWithGrad.apply(value, self.process_group)

        return output
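
A hedged usage sketch for the context manager, assuming `torch.distributed` is already initialized (e.g. via `torchrun`), that a causal LM `model` and a tokenized `batch` already exist on this rank, and that `RingAttnFunc.VARLEN_LLAMA3` is the enum member matching the varlen llama3 path mentioned above (the concrete values here are illustrative, not prescribed by this file):

from axolotl.utils.schemas.enums import RingAttnFunc

# Hypothetical values; sequence_parallel_degree must divide the world size.
sp_manager = SequenceParallelContextManager(
    models=[model],  # assumed: a causal LM already placed on this rank
    sequence_parallel_degree=4,
    gradient_accumulation_steps=1,
    ring_attn_func=RingAttnFunc.VARLEN_LLAMA3,  # assumed enum member name
    heads_k_stride=1,
)

with sp_manager:
    # The pre-forward hook shards the batch along the sequence dimension; the
    # post-forward hook all-gathers the sharded outputs and strips any padding.
    outputs = model(**batch)  # assumed: `batch` is a dict of tokenized tensors
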

class AllGatherWithGrad(torch.autograd.Function):
    """Custom autograd function for all-gather to preserve gradients."""

    @staticmethod
    def forward(
        ctx: torch.autograd.function.FunctionCtx,
        input_tensor: torch.Tensor,
        group: dist.ProcessGroup,
    ) -> torch.Tensor:
        """
        Forward pass of all-gather of data with sequence dimension.

        Args:
            ctx: `torch.autograd` function context.
            input_tensor: Tensor from model output with sequence dimension.
            group: `torch.distributed` process group.

        Returns:
            Tensor from gathering the `input_tensor` from across the process group and
            concatenating along the sequence dimension.
        """
        ctx.group = group
        ctx.rank = dist.get_rank(group)
        world_size = dist.get_world_size(group)

        # Gather shape metadata
        local_shape = torch.tensor(list(input_tensor.shape), device=input_tensor.device)
        all_shapes = [torch.zeros_like(local_shape) for _ in range(world_size)]
        dist.all_gather(all_shapes, local_shape, group=group)

        # Store sequence lengths for backward pass
        seq_lens = [int(shape[1].item()) for shape in all_shapes]
        ctx.seq_lens = seq_lens

        # Perform all_gather operation
        gathered = [
            torch.zeros(
                tuple(shape.tolist()),
                dtype=input_tensor.dtype,
                device=input_tensor.device,
            )
            for shape in all_shapes
        ]
        dist.all_gather(gathered, input_tensor, group=group)

        # Concatenate tensors along sequence dimension
        result = torch.cat(gathered, dim=1)

        return result

    @staticmethod
    def backward(
        ctx: torch.autograd.function.FunctionCtx, grad_output: torch.Tensor
    ) -> tuple[torch.Tensor, None]:
        """
        Backward pass for all-gather operation.

        Extracts the gradient slice corresponding to this rank's original input
        from the full gradient tensor.

        Args:
            ctx: `torch.autograd` function context.
            grad_output: Gradient from subsequent layers with respect to the
                concatenated output tensor.

        Returns:
            Tuple containing the gradient slice for this rank's input tensor and `None`
            for the process group parameter, which doesn't require gradients.
        """
        rank = ctx.rank
        seq_lens = ctx.seq_lens

        # Extract gradient for this rank's chunk
        offset = sum(seq_lens[:rank])
        grad_slice = grad_output[:, offset : offset + seq_lens[rank]].contiguous()

        return grad_slice, None
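
A small sketch of the autograd behavior, assuming an initialized default process group: each rank contributes a (1, 4, 32) shard, the forward concatenates along the sequence dimension, and the backward hands each rank only its own gradient slice.

import torch
import torch.distributed as dist

group = dist.group.WORLD  # assumes dist.init_process_group() has already run
local_logits = torch.randn(1, 4, 32, requires_grad=True)  # this rank's shard

# Forward: (1, 4, 32) per rank -> (1, 4 * world_size, 32) on every rank
full_logits = AllGatherWithGrad.apply(local_logits, group)

# Backward: only the slice belonging to this rank flows back to local_logits
full_logits.sum().backward()
assert local_logits.grad.shape == local_logits.shape
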
@@ -10,7 +10,6 @@ import yaml
 from datasets import Dataset, DatasetDict, concatenate_datasets, load_from_disk

 from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
-from axolotl.loaders import load_tokenizer
 from axolotl.prompt_strategies.dpo import load as load_dpo
 from axolotl.prompt_strategies.kto import load as load_kto
 from axolotl.prompt_strategies.orpo import load as load_orpo
@@ -18,9 +17,9 @@ from axolotl.utils.data.shared import datasets_w_name_generator, load_dataset_w_
 from axolotl.utils.data.utils import deduplicate_and_log_datasets, md5
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import is_main_process, zero_first
-from axolotl.utils.schemas.enums import RLType
+from axolotl.utils.models import load_tokenizer

-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("axolotl")


 def _get_path(ds_hash, cfg):
@@ -72,7 +71,6 @@ def map_dataset(cfg, data_set, ds_transform_fn, tokenizer, **map_kwargs):
     data_set = data_set.map(
         ds_transform_fn,
         desc="Mapping RL Dataset",
-        num_proc=cfg.dataset_processes,
         **map_kwargs,
     )

@@ -82,7 +80,7 @@ def map_dataset(cfg, data_set, ds_transform_fn, tokenizer, **map_kwargs):
 def drop_long_rl_seq(
     sample, rl, tokenizer, sequence_len  # pylint: disable=invalid-name
 ):
-    if rl in (RLType.DPO, RLType.IPO, RLType.ORPO, RLType.SIMPO):
+    if rl in ("dpo", "ipo", "orpo", "simpo"):
         if not (
             sample.get("prompt") and sample.get("chosen") and sample.get("rejected")
         ):
@@ -102,7 +100,7 @@ def drop_long_rl_seq(
             len_prompt + len_rejected
         ) <= sequence_len

-    if rl is RLType.KTO:
+    if rl == "kto":
         if not (sample.get("prompt") and sample.get("completion")):
             raise ValueError("Prompt and completion keys are required for KTO datasets")

@@ -116,7 +114,7 @@ def drop_long_rl_seq(

         return (len_prompt + len_completion) <= sequence_len

-    if rl is RLType.GRPO:
+    if rl == "grpo":
         return True

     raise ValueError("Unknown RL type")
@@ -139,9 +137,9 @@ def load_prepare_preference_datasets(cfg):
         if _type:
             if isinstance(_type, DictDefault):
                 _type = "user_defined.default"
-            if _cfg.rl is RLType.ORPO:
+            if _cfg.rl == "orpo":
                 ds_transform_fn = load_orpo(_type, _cfg, dataset_idx=i)
-            elif _cfg.rl is RLType.KTO:
+            elif _cfg.rl == "kto":
                 ds_transform_fn = load_kto(_type, _cfg, dataset_idx=i)
             else:
                 ds_transform_fn = load_dpo(_type, _cfg, dataset_idx=i)
@@ -152,7 +150,7 @@ def load_prepare_preference_datasets(cfg):
             split_datasets[i] = map_dataset(
                 cfg, data_set, ds_transform_fn, tokenizer, **map_kwargs
             )
-        elif _cfg.rl is RLType.KTO:
+        elif _cfg.rl == "kto":
             ds_transform_fn = load_kto(_type, _cfg, dataset_idx=i)
             map_kwargs = {}
             if isinstance(ds_transform_fn, tuple):
@@ -187,7 +185,7 @@ def load_prepare_preference_datasets(cfg):
             )

     combined_datasets = concatenate_datasets(split_datasets)
-    combined_datasets = combined_datasets.shuffle(seed=cfg.seed or 42)
+    combined_datasets = combined_datasets.shuffle(seed=cfg.seed)

     return combined_datasets

@@ -207,8 +205,6 @@ def load_prepare_preference_datasets(cfg):
     eval_dataset = load_split(cfg.test_datasets, cfg)
     if not eval_dataset:
         if cfg.val_set_size:
-            seed = cfg.seed if cfg.seed is not None else 42
-
             # ensure we end up with the same fingerprint by doing rank0 first and being able to cache
             to_hash_train = (
                 train_dataset._fingerprint  # pylint: disable=protected-access
@@ -217,7 +213,7 @@ def load_prepare_preference_datasets(cfg):
                 + "|"
                 + "train"
                 + "|"
-                + str(seed)
+                + str(cfg.seed or 42)
             )
             to_hash_test = (
                 train_dataset._fingerprint  # pylint: disable=protected-access
@@ -226,13 +222,13 @@ def load_prepare_preference_datasets(cfg):
                 + "|"
                 + "test"
                 + "|"
-                + str(seed)
+                + str(cfg.seed or 42)
             )
             train_fingerprint = md5(to_hash_train)
             test_fingerprint = md5(to_hash_test)
             ds_w_test_split = train_dataset.train_test_split(
                 test_size=cfg.val_set_size,
-                seed=seed,
+                seed=cfg.seed,
                 shuffle=False,
                 train_new_fingerprint=train_fingerprint,
                 test_new_fingerprint=test_fingerprint,
Some files were not shown because too many files have changed in this diff.