improve handling and error if fa3 requested but not installed

handle args to drop dropout
move fa3 tests to multigpu since we only run those on hopper
2025-05-19 10:11:14 -07:00 · 2025-05-18 15:17:40 -07:00 · 2025-05-18 15:17:39 -07:00 · 2025-05-18 15:17:39 -07:00 · 2025-05-18 15:17:39 -07:00 · 2025-05-18 15:17:39 -07:00
246 changed files with 5924 additions and 7787 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -17,7 +17,7 @@ jobs:
  build-base:
    if: github.repository_owner == 'axolotl-ai-cloud'
    # this job needs to be run on self-hosted GPU runners...
-    runs-on: ubuntu-latest-m
+    runs-on: axolotl-gpu-runner
    strategy:
      fail-fast: false
      matrix:
@@ -28,50 +28,49 @@ jobs:
            python_version: "3.11"
            pytorch: 2.5.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
          - cuda: "124"
            cuda_version: 12.4.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.6.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
          - cuda: "126"
            cuda_version: 12.6.3
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.6.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
          - cuda: "126"
            cuda_version: 12.6.3
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.7.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
          - cuda: "128"
-            cuda_version: 12.6.3
+            cuda_version: 12.8.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.7.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
+          - cuda: "126"
+            cuda_version: 12.6.3
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.6.0
+            suffix: "-hopper"
+            torch_cuda_arch_list: "9.0+PTX"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: nightly
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base-nightly"
-#          # "next" is for release candidates of pytorch
-#          - cuda: "128"
-#            cuda_version: 12.8.1
-#            cudnn_version: ""
-#            python_version: "3.11"
-#            pytorch: next
-#            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-#            dockerfile: "Dockerfile-base-next"
+          - cuda: "128"
+            cuda_version: 12.8.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: next
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -93,61 +92,9 @@ jobs:
        uses: docker/build-push-action@v4
        with:
          context: .
-          file: ./docker/${{ matrix.dockerfile }}
+          file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || matrix.pytorch == 'next' && './docker/Dockerfile-base-next' || './docker/Dockerfile-base' }}
          push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-          labels: ${{ steps.metadata.outputs.labels }}
-          build-args: |
-            CUDA_VERSION=${{ matrix.cuda_version }}
-            CUDNN_VERSION=${{ matrix.cudnn_version }}
-            CUDA=${{ matrix.cuda }}
-            PYTHON_VERSION=${{ matrix.python_version }}
-            PYTORCH_VERSION=${{ matrix.pytorch }}
-            TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
-  build-base-uv:
-    if: github.repository_owner == 'axolotl-ai-cloud'
-    runs-on: ubuntu-latest-m
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: "126"
-            cuda_version: 12.6.3
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.6.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.7.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Docker metadata
-        id: metadata
-        uses: docker/metadata-action@v5
-        with:
-          images: |
-            axolotlai/axolotl-base-uv
-      - name: Login to Docker Hub
-        uses: docker/login-action@v2
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Build
-        uses: docker/build-push-action@v4
-        with:
-          context: .
-          file: ./docker/${{ matrix.dockerfile }}
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}${{ matrix.suffix || '' }}
          labels: ${{ steps.metadata.outputs.labels }}
          build-args: |
            CUDA_VERSION=${{ matrix.cuda_version }}
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -9,7 +9,6 @@ on:
       - '.github/workflows/*.yml'
       - "*.[q]md"
       - "examples/**/*.y[a]?ml"
-       - ".pre-commit-config.yaml"
  workflow_dispatch:

 jobs:
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -8,7 +8,7 @@ on:
      - 'setup.py'
      - 'pyproject.toml'
      - '.github/workflows/multi-gpu-e2e.yml'
-      - 'src/axolotl/core/trainers/mixins/context_parallel.py'
+      - 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
      - 'src/axolotl/utils/distributed.py'
  workflow_dispatch:
  schedule:
@@ -32,21 +32,25 @@ jobs:
            pytorch: 2.6.0
            axolotl_extras: vllm
            num_gpus: 2
-            nightly_build: "true"
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.6.0
+            axolotl_extras:
+            suffix: "-hopper"
+            num_gpus: 2
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.5.1
            axolotl_extras:
            num_gpus: 2
-            nightly_build: "true"
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.7.0
            axolotl_extras:
            num_gpus: 2
-            nightly_build: "true"
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
    steps:
@@ -59,7 +63,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==0.71.8 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
@@ -68,7 +72,6 @@ jobs:
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
--- a/.github/workflows/precommit-autoupdate.yml
+++ b/.github/workflows/precommit-autoupdate.yml
@@ -25,6 +25,7 @@ jobs:
          pre-commit autoupdate
          if [[ -n $(git status --porcelain) ]]; then
            echo "changes=true" >> $GITHUB_OUTPUT
+            git diff .pre-commit-config.yaml > pre-commit-update.diff
          fi

      - name: Create Pull Request
@@ -38,3 +39,11 @@ jobs:
          commit-message: "chore: update pre-commit hooks"
          body: |
            Automated PR to update pre-commit hooks to their latest versions.
+
+            <details>
+            <summary>Changes:</summary>
+
+            ```diff
+            ${{ steps.update.outputs.diff }}
+            ```
+            </details>
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -44,6 +44,98 @@ jobs:
        env:
          SKIP: no-commit-to-branch

+#  preload-cache:
+#    name: Preload HF cache
+#    runs-on: ubuntu-latest
+#    strategy:
+#      fail-fast: false
+#      matrix:
+#        python_version: ["3.11"]
+#        pytorch_version: ["2.6.0"]
+#    timeout-minutes: 20
+#
+#    env:
+#      AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
+#
+#    steps:
+#      - name: Check out repository code
+#        uses: actions/checkout@v4
+#
+#      - name: Restore HF cache
+#        id: hf-cache-restore
+#        uses: actions/cache/restore@v4
+#        with:
+#          path: |
+#            /home/runner/.cache/huggingface/hub/datasets--*
+#            /home/runner/.cache/huggingface/hub/models--*
+#          key: ${{ runner.os }}-hf-hub-cache-v2
+#
+#      - name: Restore Cache from S3
+#        id: hf-cache-restore-s3
+#        run: |
+#          mkdir -p /home/runner/.cache/huggingface/hub
+#          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/  --use-compress-program unzstd
+#
+#      - name: Setup Python
+#        uses: actions/setup-python@v5
+#        with:
+#          python-version: ${{ matrix.python_version }}
+#          cache: 'pip' # caching pip dependencies
+#
+#      - name: upgrade pip
+#        run: |
+#          pip3 install --upgrade pip
+#          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
+#
+#      - name: Install PyTorch
+#        run: |
+#          pip3 install torch==${{ matrix.pytorch_version }}
+#
+#      - name: Install dependencies
+#        run: |
+#          pip3 show torch
+#          pip3 install --no-build-isolation -U -e .
+#          python scripts/unsloth_install.py | sh
+#          python scripts/cutcrossentropy_install.py | sh
+#          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+#
+#      - name: Make sure PyTorch version wasn't clobbered
+#        run: |
+#          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
+#
+#      - name: Ensure axolotl CLI was installed
+#        run: |
+#          axolotl --help
+#
+#      - name: Pre-Download dataset fixture
+#        run: |
+#          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
+#
+#      - name: Run tests
+#        run: |
+#          pytest -v tests/conftest.py
+#
+#      - name: Upload coverage to Codecov
+#        uses: codecov/codecov-action@v5
+#        with:
+#          token: ${{ secrets.CODECOV_TOKEN }}
+#          files: ./coverage.xml
+#          flags: unittests,pytorch-${{ matrix.pytorch_version }}
+#          fail_ci_if_error: false
+#
+#      - name: cleanup pip cache
+#        run: |
+#          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
+#
+#      - name: Save HF cache
+#        id: hf-cache
+#        uses: actions/cache/save@v4
+#        with:
+#          path: |
+#            /home/runner/.cache/huggingface/hub/datasets--*
+#            /home/runner/.cache/huggingface/hub/models--*
+#          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
+
  pytest:
    name: PyTest
    runs-on: ubuntu-latest
@@ -59,6 +151,15 @@ jobs:
      - name: Check out repository code
        uses: actions/checkout@v4

+#      - name: Restore HF cache
+#        id: hf-cache-restore
+#        uses: actions/cache/restore@v4
+#        with:
+#          path: |
+#            /home/runner/.cache/huggingface/hub/datasets--*
+#            /home/runner/.cache/huggingface/hub/models--*
+#          key: ${{ runner.os }}-hf-hub-cache-v2
+
      - name: Restore Cache from S3
        id: hf-cache-restore-s3
        run: |
@@ -121,6 +222,7 @@ jobs:
  pytest-sdist:
    name: PyTest from Source Dist
    runs-on: ubuntu-latest
+#    needs: [preload-cache]
    strategy:
      fail-fast: false
      matrix:
@@ -132,6 +234,15 @@ jobs:
      - name: Check out repository code
        uses: actions/checkout@v4

+#      - name: Restore HF cache
+#        id: hf-cache-restore
+#        uses: actions/cache/restore@v4
+#        with:
+#          path: |
+#            /home/runner/.cache/huggingface/hub/datasets--*
+#            /home/runner/.cache/huggingface/hub/models--*
+#          key: ${{ runner.os }}-hf-hub-cache-v2
+
      - name: Restore Cache from S3
        id: hf-cache-restore-s3
        run: |
@@ -201,13 +312,6 @@ jobs:
            pytorch: 2.6.0
            num_gpus: 1
            axolotl_extras: vllm
-          - cuda: 126
-            cuda_version: 12.6.3
-            python_version: "3.11"
-            pytorch: 2.6.0
-            num_gpus: 1
-            axolotl_extras:
-            dockerfile: "Dockerfile-uv.jinja"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -218,7 +322,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==0.71.8 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
@@ -229,7 +333,6 @@ jobs:
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
          modal run cicd.e2e_tests
@@ -281,7 +384,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==0.71.8 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
@@ -292,7 +395,6 @@ jobs:
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
          modal run cicd.e2e_tests
@@ -322,7 +424,7 @@ jobs:
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==0.71.8 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,15 +19,15 @@ repos:
    hooks:
      - id: isort
 -   repo: https://github.com/PyCQA/flake8
-    rev: 7.2.0
+    rev: 7.1.2
    hooks:
    - id: flake8
 -   repo: https://github.com/pylint-dev/pylint
-    rev: v3.3.7
+    rev: v3.3.6
    hooks:
    - id: pylint
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.16.0
+    rev: v1.15.0
    hooks:
    - id: mypy
      additional_dependencies:
--- a/.runpod/src/config/config.yaml
+++ b/.runpod/src/config/config.yaml
@@ -242,12 +242,16 @@
 # early_stopping_patience: 3

 # # Specify a scheduler and kwargs to use with the optimizer
-# lr_scheduler: # 'one_cycle' | empty for cosine
+# lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
 # lr_scheduler_kwargs:

 # # For one_cycle optim
 # lr_div_factor: # Learning rate div factor

+# # For log_sweep optim
+# log_sweep_min_lr:
+# log_sweep_max_lr:
+
 # # Specify optimizer
 # # Valid values are driven by the Transformers OptimizerNames class, see:
 # # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
--- a/README.md
+++ b/README.md
@@ -51,7 +51,7 @@ Features:

 - NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
 - Python 3.11
- PyTorch ≥2.5.1
+- PyTorch ≥2.4.1

 ### Installation

--- a/_quarto.yml
+++ b/_quarto.yml
@@ -17,9 +17,7 @@ quartodoc:
        - convert
        - prompt_tokenizers
        - logging_config
-        - core.builders.base
-        - core.builders.causal
-        - core.builders.rl
+        - core.trainer_builder
        - core.training_args
        - core.chat.messages
        - core.chat.format.chatml
@@ -45,7 +43,6 @@ quartodoc:
        - cli.vllm_serve
        - cli.cloud.base
        - cli.cloud.modal_
-        - cli.quantize
    - title: Trainers
      desc: Training implementations
      contents:
@@ -57,25 +54,17 @@ quartodoc:
        - core.trainers.grpo.trainer
        - core.trainers.grpo.sampler
        - core.trainers.utils
-    - title: Model Loading
-      desc: Functionality for loading and patching models, tokenizers, etc.
-      contents:
-        - loaders.model
-        - loaders.tokenizer
-        - loaders.processor
-        - loaders.adapter
-        - loaders.patch_manager
-        - loaders.constants
    - title: Mixins
      desc: Mixin classes for augmenting trainers
      contents:
        - core.trainers.mixins.optimizer
        - core.trainers.mixins.rng_state_loader
        - core.trainers.mixins.scheduler
+        - core.trainers.mixins.sequence_parallel
    - title: Context Managers
      desc: Context managers for altering trainer behaviors
      contents:
-        - utils.ctx_managers.context_parallel
+        - utils.ctx_managers.sequence_parallel
    - title: Prompt Strategies
      desc: Prompt formatting strategies
      contents:
@@ -129,16 +118,17 @@ quartodoc:
        - monkeypatch.trainer_fsdp_optim
        - monkeypatch.transformers_fa_utils
        - monkeypatch.unsloth_
+        - monkeypatch.attention.mllama
        - monkeypatch.data.batch_dataset_fetcher
        - monkeypatch.mixtral
-        - monkeypatch.gradient_checkpointing.offload_cpu
-        - monkeypatch.gradient_checkpointing.offload_disk
    - title: Utils
      desc: Utility functions
      contents:
+        - utils.models
        - utils.tokenization
        - utils.chat_templates
        - utils.lora
+        - utils.lora_embeddings
        - utils.model_shard_quant
        - utils.bench
        - utils.freeze
@@ -149,7 +139,8 @@ quartodoc:
        - utils.optimizers.adopt
        - utils.data.pretraining
        - utils.data.sft
-        - utils.quantization
+        - utils.gradient_checkpointing.offload_cpu
+        - utils.gradient_checkpointing.offload_disk
    - title: Schemas
      desc: Pydantic data models for Axolotl config
      contents:
@@ -199,14 +190,12 @@ quartodoc:
        - utils.callbacks.lisa
        - utils.callbacks.mlflow_
        - utils.callbacks.comet_
-        - utils.callbacks.qat
+
 website:
  title: "Axolotl"
  description: "We make fine-tuning accessible, scalable, and fun"
  favicon: favicon.jpg

-  google-analytics: "G-9KYCVJBNMQ"
-
  navbar:
    logo: image/axolotl_logo_digital_white.svg
    title: false
@@ -259,8 +248,6 @@ website:
            - docs/lr_groups.qmd
            - docs/lora_optims.qmd
            - docs/dataset_loading.qmd
-            - docs/qat.qmd
-            - docs/quantize.qmd

        - section: "Core Concepts"
          contents:
@@ -274,7 +261,7 @@ website:
            - docs/unsloth.qmd
            - docs/torchao.qmd
            - docs/custom_integrations.qmd
-            - docs/context_parallelism.qmd
+            - docs/sequence_parallelism.qmd

        - section: "Troubleshooting"
          contents:
--- a/cicd/Dockerfile-uv.jinja
+++ b/cicd/Dockerfile-uv.jinja
@@ -1,52 +0,0 @@
-FROM axolotlai/axolotl-base-uv:{{ BASE_TAG }}
-
-ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
-ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
-ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
-ENV CUDA="{{ CUDA }}"
-ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
-ENV GITHUB_REF="{{ GITHUB_REF }}"
-ENV GITHUB_SHA="{{ GITHUB_SHA }}"
-ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
-ENV HF_HOME="{{ HF_HOME }}"
-
-RUN apt-get update && \
-    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
-
-WORKDIR /workspace
-
-RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
-
-WORKDIR /workspace/axolotl
-
-RUN git fetch origin +$GITHUB_REF && \
-    git checkout FETCH_HEAD
-
-# If AXOLOTL_EXTRAS is set, append it in brackets
-RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
-        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
-        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
-        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
-        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
-        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
-    fi
-
-RUN uv pip install packaging==23.2 setuptools==75.8.0
-RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
-    else \
-        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
-    fi
-
-RUN python scripts/unsloth_install.py --uv | sh
-RUN python scripts/cutcrossentropy_install.py --uv | sh
-
-# So we can test the Docker image
-RUN uv pip install -r requirements-dev.txt -r requirements-tests.txt
-
-# fix so that git fetch/pull from remote works
-RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
-    git config --get remote.origin.fetch
-
-# helper for huggingface-login cli
-RUN git config --global credential.helper store
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -32,6 +32,11 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
    fi

 RUN pip install packaging==23.2 setuptools==75.8.0
+RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "126" ] ; then \
+        curl -L -O https://d1dttdx32dkk5p.cloudfront.net/fa3/cu${CUDA}/torch-${PYTORCH_VERSION}/flash_attn_3-3.0.0b1-cp311-cp311-linux_x86_64.whl; \
+        pip3 install --no-cache-dir flash_attn_3-3.0.0b1-cp311-cp311-linux_x86_64.whl; \
+        rm flash_attn_3-3.0.0b1-cp311-cp311-linux_x86_64.whl; \
+    fi
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -24,9 +24,9 @@ df_template = template_env.get_template("Dockerfile.jinja")
 df_args = {
    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.5.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu124-2.5.1"),
-    "CUDA": os.environ.get("CUDA", "124"),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
+    "CUDA": os.environ.get("CUDA", "121"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
@@ -55,7 +55,7 @@ VOLUME_CONFIG = {
 }

 N_GPUS = int(os.environ.get("N_GPUS", 2))
-GPU_CONFIG = f"H100:{N_GPUS}"
+GPU_CONFIG = modal.gpu.H100(count=N_GPUS)


 def run_cmd(cmd: str, run_folder: str):
--- a/cicd/single_gpu.py
+++ b/cicd/single_gpu.py
@@ -8,9 +8,8 @@ import tempfile

 import jinja2
 import modal
-import modal.experimental
 from jinja2 import select_autoescape
-from modal import App
+from modal import App, Image

 cicd_path = pathlib.Path(__file__).parent.resolve()

@@ -18,15 +17,14 @@ template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
 template_env = jinja2.Environment(
    loader=template_loader, autoescape=select_autoescape()
 )
-dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
-df_template = template_env.get_template(dockerfile)
+df_template = template_env.get_template("Dockerfile.jinja")

 df_args = {
    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.5.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu124-2.5.1"),
-    "CUDA": os.environ.get("CUDA", "124"),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
+    "CUDA": os.environ.get("CUDA", "121"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
@@ -40,11 +38,11 @@ temp_dir = tempfile.mkdtemp()
 with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
    f.write(dockerfile_contents)

-cicd_image = modal.experimental.raw_dockerfile_image(
+cicd_image = Image.from_dockerfile(
    pathlib.Path(temp_dir) / "Dockerfile",
-    # context_mount=None,
+    context_mount=None,
    force_build=True,
-    # gpu="A10G",
+    gpu="A10G",
 ).env(df_args)

 app = App("Axolotl CI/CD", secrets=[])
@@ -57,7 +55,7 @@ VOLUME_CONFIG = {
 }

 N_GPUS = int(os.environ.get("N_GPUS", 1))
-GPU_CONFIG = f"L40S:{N_GPUS}"
+GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)


 def run_cmd(cmd: str, run_folder: str):
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -1,5 +1,5 @@
-ARG CUDA_VERSION="11.8.0"
-ARG CUDNN_VERSION="8"
+ARG CUDA_VERSION="12.4.1"
+ARG CUDNN_VERSION=""
 ARG UBUNTU_VERSION="22.04"
 ARG MAX_JOBS=4

@@ -7,16 +7,16 @@ FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION A

 ENV PATH="/root/miniconda3/bin:${PATH}"

-ARG PYTHON_VERSION="3.10"
-ARG PYTORCH_VERSION="2.1.2"
-ARG CUDA="118"
+ARG PYTHON_VERSION="3.11"
+ARG PYTORCH_VERSION="2.5.1"
+ARG CUDA="124"
 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

 ENV PYTHON_VERSION=$PYTHON_VERSION
 ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST

 RUN apt-get update \
-    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
+    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config curl && rm -rf /var/lib/apt/lists/* \
    && wget \
    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
    && mkdir /root/.conda \
@@ -38,6 +38,10 @@ RUN git lfs install --skip-repo && \
    # The base image ships with `pydantic==1.8.2` which is not working
    pip3 install -U --no-cache-dir pydantic==1.10.10

-RUN if [ "$PYTORCH_VERSION" = "2.7.0" ] ; then \
+RUN if [ "$TORCH_CUDA_ARCH_LIST" = "9.0+PTX" ] ; then \
+        curl -L -O https://d1dttdx32dkk5p.cloudfront.net/fa3/cu${CUDA}/torch-${PYTORCH_VERSION}/flash_attn_3-3.0.0b1-cp311-cp311-linux_x86_64.whl; \
+        pip3 install --no-cache-dir flash_attn_3-3.0.0b1-cp311-cp311-linux_x86_64.whl; \
+        rm flash_attn_3-3.0.0b1-cp311-cp311-linux_x86_64.whl; \
+    elif [ "$PYTORCH_VERSION" = "2.7.0" ] ; then \
        pip3 install flash-attn==2.7.4.post1; \
    fi
--- a/docker/Dockerfile-uv-base
+++ b/docker/Dockerfile-uv-base
@@ -1,36 +0,0 @@
-ARG CUDA_VERSION="12.6.3"
-ARG CUDNN_VERSION=""
-ARG UBUNTU_VERSION="22.04"
-ARG MAX_JOBS=4
-
-FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
-
-ARG PYTHON_VERSION="3.11"
-ARG PYTORCH_VERSION="2.6.0"
-ARG CUDA="126"
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
-
-ENV PYTHON_VERSION=$PYTHON_VERSION
-ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
-ENV UV_TORCH_BACKEND="cu${CUDA}"
-
-RUN apt-get update \
-    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config curl && rm -rf /var/lib/apt/lists/* \
-    && git lfs install --skip-repo \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh
-
-ENV PATH="/root/.local/bin:${PATH}"
-
-RUN uv python install ${PYTHON_VERSION}
-
-WORKDIR /workspace
-
-RUN uv venv --no-project --relocatable axolotl-venv
-
-ENV PATH="/workspace/axolotl-venv/bin:${PATH}"
-
-RUN uv pip install packaging setuptools wheel \
-    && uv pip install torch==${PYTORCH_VERSION} \
-    && uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \
-    && uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \
-    && uv pip install awscli pydantic
--- a/docs/cli.qmd
+++ b/docs/cli.qmd
@@ -209,16 +209,6 @@ axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir

 This would be necessary to use with other frameworks. If you have an adapter, merge it with the non-quantized linearized model before delinearizing.

-### quantize
-
-Quantizes a model using the quantization configuration specified in your YAML file.
-
-```bash
-axolotl quantize config.yml
-```
-
-See [Quantization](./quantize.qmd) for more details.
-

 ## Legacy CLI Usage

--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -65,20 +65,6 @@ bnb_config_kwargs:
  bnb_4bit_quant_type: nf4
  bnb_4bit_use_double_quant: true

-# quantization aware training
-qat:
-  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
-  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8"
-  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
-  fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after
-
-# post-training quantization
-quantization:
-  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8
-  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
-  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
-  quantize_embedding: # Optional[bool] = False. Whether to quantize the embedding layer.
-

 # Whether you are training a 4-bit GPTQ quantized model
 gptq: true
@@ -112,10 +98,8 @@ plugins:
  # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

 # A list of one or more datasets to finetune the model with
-# See https://docs.axolotl.ai/docs/dataset_loading.html for guide on loading datasets
-# See https://docs.axolotl.ai/docs/dataset-formats/ for guide on dataset formats
 datasets:
-  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory
+  # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
  - path: vicgalle/alpaca-gpt4
    # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
@@ -237,7 +221,7 @@ datasets:
 # The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
 shuffle_merged_datasets: true

-# Deduplicates datasets and test_datasets with identical entries.
+# Deduplicates datasets and test_datasets with identical entries.
 dataset_exact_deduplication: true

 # A list of one or more datasets to eval the model with.
@@ -286,25 +270,10 @@ trl:

  num_generations: # Optional[int]. Number of generations to sample.
  log_completions: # Optional[bool]. Whether to log completions.
-  num_completions_to_print: # Optional[int]. Number of completions to print when log_completions is True.

  sync_ref_model: # Optional[bool]. Whether to sync the reference model.
  ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.
  ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.
-  scale_rewards: # Optional[bool]. Whether to scale rewards by their standard deviation.
-
-  temperature: # Optional[float]. Sampling temperature for the GRPO policy.
-  top_p: # Optional[float]. Top-p sampling probability for the generation policy.
-  top_k: # Optional[int]. Top-k sampling for the generation policy.
-  min_p: # Optional[float]. Minimum probability for the generation policy.
-  repetition_penalty: # Optional[float]. Penalty for tokens that appear in prompt and generated text.
-
-  num_iterations: # Optional[int]. Number of iterations per batch (μ) for GRPO.
-  epsilon: # Optional[float]. Epsilon value for clipping in the GRPO algorithm.
-  epsilon_high: # Optional[float]. Upper-bound epsilon value for clipping in the GRPO algorithm.
-  use_liger_loss: # Optional[bool]. Whether to use Liger loss for GRPO.
-  loss_type: # Optional[str]. Loss formulation to use. Supported values: grpo, bnpo, dr_grpo.
-  mask_truncated_completions: # Optional[bool]. Whether to exclude truncated completions from loss calculation.


 # reward modelling: `True` or `False`
@@ -514,7 +483,6 @@ output_dir: ./completed-model
 # setting to `auto` will enable torch compile when torch>=2.5.1
 torch_compile:  # Optional[Union[Literal["auto"], bool]]
 torch_compile_backend:  # Optional[str]
-torch_compile_mode:  # 'default' | 'reduce-overhead' | 'max-autotune'

 # Training hyperparameters

@@ -561,7 +529,7 @@ profiler_steps: # enable the pytorch profiler to capture the first N steps of tr
 loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
 loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)

-# Save model as safetensors (require safetensors package). Default True
+# Save model as safetensors (require safetensors package)
 save_safetensors:

 # Whether to mask out or include the human's prompt from the training labels
@@ -583,24 +551,7 @@ gradient_checkpointing: false
 early_stopping_patience: 3

 # Specify a scheduler and kwargs to use with the optimizer
-# Valid values are driven by the Transformers SchedulerType class, see:
-# https://github.com/huggingface/transformers/blob/5f4ecf2d9f867a1255131d2461d75793c0cf1db2/src/transformers/trainer_utils.py#L420
-# Valid values include
-# - 'linear'
-# - 'cosine' (default)
-# - 'cosine_with_restarts'
-# - 'polynomial'
-# - 'constant'
-# - 'constant_with_warmup'
-# - 'inverse_sqrt'
-# - 'reduce_lr_on_plateau'
-# - 'cosine_with_min_lr'
-# - 'warmup_stable_decay'
-
-# Additional schedulers include:
-# - 'one_cycle'
-# - 'rex'
-lr_scheduler:
+lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | 'linear' | 'cosine_with_restarts' | 'polynomial' | 'constant' | 'constant_with_warmup' | 'inverse_sqrt' | 'reduce_lr_on_plateau' | 'cosine_with_min_lr' | 'warmup_stable_decay' | empty for cosine
 lr_scheduler_kwargs:
 cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
 cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
@@ -618,7 +569,7 @@ lr_div_factor: # Learning rate div factor
 #
 # Valid values for 'optimizer' include:
 # - adamw_torch
-# - adamw_torch_fused (default)
+# - adamw_torch_fused
 # - adamw_torch_xla
 # - adamw_torch_npu_fused
 # - adamw_apex_fused
@@ -764,13 +715,13 @@ ddp_timeout:
 ddp_bucket_cap_mb:
 ddp_broadcast_buffers:

-# Context parallelism
+# Sequence parallelism
 # Set to a divisor of the number of GPUs available to split sequences into chunks of equal size.
 # Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM.
 # E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized
 # subsequences, or set to 4 to split into four equal-sized subsequences.
-# See https://docs.axolotl.ai/docs/context_parallelism.html for more details.
-context_parallel_degree:
+# See https://docs.axolotl.ai/docs/sequence_parallelism.html for more details.
+sequence_parallel_degree:
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 # Must evenly divide the number of KV heads in your model.
 heads_k_stride: 1
--- a/docs/dataset-formats/index.qmd
+++ b/docs/dataset-formats/index.qmd
@@ -36,6 +36,10 @@ It is typically recommended to save your dataset as `.jsonl` due to its flexibil

 Axolotl supports loading from a Hugging Face hub repo or from local files.

+::: {.callout-important}
+For pre-training only, Axolotl splits texts that exceed the context length into multiple smaller prompts.
+:::
+
 ### Pre-training from Hugging Face hub datasets

 As an example, to train using a Hugging Face dataset `hf_org/name`, you can pass the following config:
@@ -73,21 +77,18 @@ datasets:
    type: completion
 ```

-From local files:
+From local files (either example works):

 ```yaml
 datasets:
  - path: A.jsonl
    type: completion

-  - path: B.jsonl
+  - path: json
+    data_files: ["A.jsonl", "B.jsonl", "C.jsonl"]
    type: completion
 ```

-::: {.callout-important}
-For `completion` only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts. If you are interested in having this for `pretraining_dataset` too, please let us know or help make a PR!
-:::
-
 ### Pre-training dataset configuration tips

 #### Setting max_steps
--- a/docs/dataset_loading.qmd
+++ b/docs/dataset_loading.qmd
@@ -54,7 +54,7 @@ datasets:

 #### Files

-To load a JSON file, you would do something like this:
+Usually, to load a JSON file, you would do something like this:

 ```python
 from datasets import load_dataset
@@ -66,11 +66,19 @@ Which translates to the following config:

 ```yaml
 datasets:
-  - path: data.json
-    ds_type: json
+  - path: json
+    data_files: /path/to/your/file.jsonl
 ```

-In the example above, it can be seen that we can just point the `path` to the file or directory along with the `ds_type` to load the dataset.
+However, to make things easier, we have added a few shortcuts for loading local dataset files.
+
+You can just point the `path` to the file or directory along with the `ds_type` to load the dataset. The example below shows this for a JSON file:
+
+```yaml
+datasets:
+  - path: /path/to/your/file.jsonl
+    ds_type: json
+```

 This works for CSV, JSON, Parquet, and Arrow files.

--- a/docs/docker.qmd
+++ b/docs/docker.qmd
@@ -8,10 +8,6 @@ format:

 This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).

-::: {.callout-important}
-For Blackwell GPUs, please use the tags with Pytorch 2.7.0 and CUDA 12.8.
-:::
-
 ## Base

 The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.
@@ -36,6 +32,7 @@ Tags examples:
 - `main-base-py3.11-cu126-2.7.0`
 - `main-base-py3.11-cu124-2.6.0`
 - `main-base-py3.11-cu124-2.5.1`
+- `main-base-py3.11-cu124-2.4.1`

 ## Main

@@ -76,10 +73,12 @@ Tags examples:
 - `main-py3.11-cu126-2.7.0`
 - `main-py3.11-cu124-2.6.0`
 - `main-py3.11-cu124-2.5.1`
+- `main-py3.11-cu124-2.4.1`
 - `main-latest`
 - `main-20250303-py3.11-cu124-2.6.0`
 - `main-20250303-py3.11-cu124-2.5.1`
- `0.9.2`
+- `main-20250303-py3.11-cu124-2.4.1`
+- `0.7.1`

 ## Cloud

--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -110,17 +110,3 @@ description: Frequently asked questions
 > A: If `eot_tokens: ` is not provided, the default behavior is the same as before. EOS tokens used to delimit turns are masked/unmasked depending on whether the turn is trainable.

 > Internally, `eot_tokens: tokenizer.eos_token` and `train_on_eot: train_on_eos` (which defaults to `turn`). This transition helps clarify the naming and behavior of EOT/EOS tokens.
-
-**Q: `Data processing error: CAS service error`**
-
-> A: Try disabling XET with `export HF_HUB_DISABLE_XET=1`
-
-**Q: `torch._inductor.exc.LoweringException: NoValidChoicesError: No choices to select, please consider adding ATEN into max_autotune_gemm_backends config (defined in torch/_inductor/config.py) to allow at least one choice. `**
-
-> A: Depending on the version of torch, you may need to include this in your YAML:
-
-> ```yaml
-> flex_attn_compile_kwargs:
->   dynamic: false
->   mode: max-autotune-no-cudagraphs
-> ```
--- a/docs/getting-started.qmd
+++ b/docs/getting-started.qmd
@@ -180,7 +180,7 @@ Now that you have the basics, you might want to:
 Check our other guides for details on these topics:

 - [Configuration Guide](config.qmd) - Full configuration options
- [Dataset Loading](dataset_loading.qmd) - Loading datasets from various sources
+- [Dataset Loading](dataset_loading.qmd) - Loading datasets from various sources
 - [Dataset Formats](dataset-formats) - Working with different data formats
 - [Multi-GPU Training](multi-gpu.qmd)
 - [Multi-Node Training](multi-node.qmd)
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -15,7 +15,7 @@ This guide covers all the ways you can install and set up Axolotl for your envir

 - NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
 - Python ≥3.10
- PyTorch ≥2.5.1
+- PyTorch ≥2.4.1

 ## Installation Methods {#sec-installation-methods}

@@ -25,10 +25,6 @@ Please make sure to have Pytorch installed before installing Axolotl in your loc
 Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
 :::

-::: {.callout-important}
-For Blackwell GPUs, please use Pytorch 2.7.0 and CUDA 12.8.
-:::
-
 ### PyPI Installation (Recommended) {#sec-pypi}

 ```{.bash}
@@ -41,40 +37,6 @@ installed) in order not to clobber it, and so that we set the correct version of
 dependencies that are specific to the PyTorch version or other installed
 co-dependencies.

-### uv Installation {#sec-uv}
-
-uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments.
-
-Install uv if not already installed
-```{.bash}
-curl -LsSf https://astral.sh/uv/install.sh | sh
-source $HOME/.local/bin/env
-```
-
-Choose your CUDA version to use with PyTorch; e.g. `cu124`, `cu126`, `cu128`,
-then create the venv and activate
-```{.bash}
-export UV_TORCH_BACKEND=cu126
-uv venv --no-project --relocatable
-source .venv/bin/activate
-```
-
-Install PyTorch
- PyTorch 2.6.0 recommended
-```{.bash}
-uv pip install packaging setuptools wheel
-uv pip install torch==2.6.0
-uv pip install awscli pydantic
-```
-
-Install axolotl from PyPi
-```{.bash}
-uv pip install --no-build-isolation axolotl[deepspeed,flash-attn]
-
-# optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO
-uv pip install --no-build-isolation axolotl[deepspeed,flash-attn,vllm]
-```
-
 ### Edge/Development Build {#sec-edge-build}

 For the latest features between releases:
@@ -110,10 +72,6 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
 ```
 :::

-::: {.callout-important}
-For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.7.0` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.7.0`.
-:::
-
 Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.

 ## Cloud Environments {#sec-cloud}
--- a/docs/lora_optims.qmd
+++ b/docs/lora_optims.qmd
@@ -84,10 +84,6 @@ lora_qkv_kernel: true
 lora_o_kernel: true
 ```

-::: {.callout-note}
-Currently, LoRA kernels are not supported for RLHF training, only SFT.
-:::
-
 ## Requirements

 - One or more NVIDIA or AMD GPUs (in order to use the Triton kernels)
--- a/docs/multi-gpu.qmd
+++ b/docs/multi-gpu.qmd
@@ -18,7 +18,7 @@ Axolotl supports several methods for multi-GPU training:

 - DeepSpeed (recommended)
 - FSDP (Fully Sharded Data Parallel)
- Context parallelism
+- Sequence parallelism
 - FSDP + QLoRA

 ## DeepSpeed {#sec-deepspeed}
@@ -80,14 +80,27 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
 ```

-## Context parallelism {#sec-sequence-parallelism}
+## Sequence parallelism {#sec-sequence-parallelism}

-We support context parallelism (SP) via the
+We support sequence parallelism (SP) via the
 [ring-flash-attention](https://github.com/zhuzilin/ring-flash-attention) project. This
 allows one to split up sequences across GPUs, which is useful in the event that a
 single sequence causes OOM errors during model training.

-See our [dedicated guide](context_parallelism.qmd) for more information.
+First, install `ring-flash-attn`, recommended via `pip install axolotl[ring-flash-attn]`,
+or from source with `pip install .[ring-flash-attn]`.
+
+Your Axolotl YAML config should contain the following lines:
+
+```{.yaml}
+sequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU
+flash_attention: true  # Required with sequence parallelism
+
+# Optional; strides across the key dimension. Larger values use more memory but will make training faster.
+heads_k_stride: 1
+```
+
+See our [dedicated guide](sequence_parallelism.qmd) for more details.

 ### FSDP + QLoRA {#sec-fsdp-qlora}

--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -43,7 +43,7 @@ datasets:
 # leave the vision model and vision tower frozen
 # load_in_8bit: true
 adapter: lora
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 # (optional) if you want to resize images to a set size
 image_size: 512
--- a/docs/qat.qmd
+++ b/docs/qat.qmd
@@ -1,32 +0,0 @@
---
-title: "Quantization Aware Training (QAT)"
-back-to-top-navigation: true
-toc: true
-toc-expand: 2
-toc-depth: 4
---
-
-## Overview
-
-[Quantization Aware Training](https://pytorch.org/blog/introduction-to-quantization-on-pytorch/#quantization-aware-training) (QAT) is a technique for improving the accuracy of models which are quantized
-by applying "fake" quantizations to the model's weights (and optionally, activations) during training. This fake
-quantization allows for the model to adjust for noise introduced by the quantization, so when the model is eventually
-quantized, the accuracy loss is minimized. We use the quantization techniques implemented in [torchao](https://github.com/pytorch/ao) to provide
-support for QAT and post-training quantization (PTQ) in axolotl.
-
-We recommend reviewing the excellent QAT tutorial in the [torchtune library](https://pytorch.org/torchtune/main/tutorials/qat_finetune.html#quantizing-the-qat-model),
-and the QAT documentation in the [torchao library](https://github.com/pytorch/ao/tree/main/torchao/quantization/qat), for more details.
-
-## Configuring QAT in Axolotl
-
-To enable QAT in axolotl, add the following to your configuration file:
-
-```yaml
-qat:
-  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
-  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8"
-  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
-  fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after
-```
-
-Once you have finished training, you must quantize your model by using the same quantization configuration which you used to train the model with. You can use the [`quantize` command](./quantize.md) to do this.
--- a/docs/quantize.qmd
+++ b/docs/quantize.qmd
@@ -1,53 +0,0 @@
---
-title: "Quantization with torchao"
-back-to-top-navigation: true
-toc: true
-toc-expand: 2
-toc-depth: 4
---
-
-Quantization is a technique to lower the memory footprint of your model, potentially at the cost of accuracy or model performance. We support quantizing your model using the [torchao](https://github.com/pytorch/ao) library. Quantization is supported for both post-training quantization (PTQ) and quantization-aware training (QAT).
-
-
-::: {.callout-note}
-
-We do not currently support quantization techniques such as GGUF/GPTQ,EXL2 at the moment.
-
-:::
-
-## Configuring Quantization in Axolotl
-
-Quantization is configured using the `quantization` key in your configuration file.
-
-```yaml
-base_model: # The path to the model to quantize.
-quantization:
-  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8
-  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
-  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
-  quantize_embedding: # Optional[bool] = False. Whether to quantize the embedding layer.
-
-output_dir:  # The path to the output directory.
-```
-
-Once quantization is complete, your quantized model will be saved in the `{output_dir}/quantized` directory.
-
-You may also use the `quantize` command to quantize a model which has been trained with [QAT](./qat.md) - you can do this by using the existing QAT configuration file which
-you used to train the model:
-
-```yaml
-# qat.yml
-qat:
-  activation_dtype: int8
-  weight_dtype: int8
-  group_size: 256
-  quantize_embedding: true
-
-output_dir: # The path to the output directory used during training where the final checkpoint has been saved.
-```
-
-```bash
-axolotl quantize qat.yml
-```
-
-This ensures that an identical quantization configuration is used to quantize the model as was used to train it.
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -16,8 +16,7 @@ feedback. Various methods include, but not limited to:
 - [Identity Preference Optimization (IPO)](#ipo)
 - [Kahneman-Tversky Optimization (KTO)](#kto)
 - [Odds Ratio Preference Optimization (ORPO)](#orpo)
- [Group Relative Policy Optimization (GRPO)](#grpo)
- Proximal Policy Optimization (PPO) (not yet supported in axolotl, if you're interested in contributing, please reach out!)
+- Proximal Policy Optimization (PPO) (not yet supported in axolotl)


 ## RLHF using Axolotl
@@ -583,20 +582,7 @@ datasets:

 To see other examples of custom reward functions, please see [TRL GRPO Docs](https://github.com/huggingface/trl/blob/main/docs/source/grpo_trainer.md#using-a-custom-reward-function).

-To see all configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/v0.9.2/src/axolotl/utils/schemas/trl.py).
-
-#### GRPO with DAPO/Dr. GRPO loss
-
-The DAPO paper and subsequently Dr. GRPO paper proposed an alternative loss function for GRPO to remediate the penalty in longer responses.
-
-```yaml
-trl:
-  loss_type: dr_grpo
-  # Normalizes loss based on max completion length (default: 256)
-  max_completion_length:
-```
-
-For more information, see [GRPO docs](https://huggingface.co/docs/trl/v0.17.0/en/grpo_trainer#loss-types).
+For a description of the configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/config/models/input/v0_4_1/trl.py).

 ### SimPO

--- a/docs/sequence_parallelism.qmd
+++ b/docs/sequence_parallelism.qmd
@@ -1,16 +1,16 @@
 ---
-title: Context Parallelism
+title: Sequence Parallelism
 description: Train with long sequences split across multiple GPUs.
 ---

-Context parallelism is a technique that splits sequences across multiple GPUs,
+Sequence parallelism is a technique that splits sequences across multiple GPUs,
 allowing you to train with very long sequences that wouldn't fit on a single GPU. Each
 GPU processes a different portion of the sequence, and the results are aggregated
 through a ring communication pattern.

-## When to Use Context Parallelism
+## When to Use Sequence Parallelism

-Use context parallelism when:
+Use sequence parallelism when:

 - You need to train with sequence lengths that don't fit into a single GPU's memory
 - You have multiple GPUs available
@@ -18,11 +18,11 @@ Use context parallelism when:

 ## Configuration

-To enable context parallelism, add the following to your configuration file:
+To enable sequence parallelism, add the following to your configuration file:

 ```yaml
 # Set to a divisor (> 1) of the number of GPUs available
-context_parallel_degree: 4  # Split sequences across 4 GPUs
+sequence_parallel_degree: 4  # Split sequences across 4 GPUs
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 heads_k_stride: 1
 # Optional; one of "varlen_llama3" or "batch_ring". Defaults to
@@ -30,23 +30,23 @@ heads_k_stride: 1
 ring_attn_func:
 ```

-The `context_parallel_degree` should be a divisor of the total number of GPUs. For example:
+The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:

 - With 8 GPUs, valid values would be 2, 4, or 8
 - With 4 GPUs, valid values would be 2 or 4

 ## Implementation Details

-When context parallelism is enabled:
+When sequence parallelism is enabled:

-1. Each sequence is divided into equal chunks across the GPUs in a context parallel group
+1. Each sequence is divided into equal chunks across the GPUs in a sequence parallel group
 2. The data collator handles the chunking of input_ids, attention_mask, labels, and position_ids
-3. Position IDs are adjusted to maintain proper relative positions
+3. Position IDs are adjusted to maintain proper relative positions, especially for packed sequences
 4. The trainer uses special ring communication patterns for attention operations

 ## Requirements

-To use context parallelism, you need:
+To use sequence parallelism, you need:

 - Multiple GPUs (at least 2)
 - The `ring-flash-attn` package. Install with:
@@ -66,12 +66,10 @@ sequence_len: 8192

 ...

-context_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU
+sequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU
+flash_attention: true  # Required with sequence parallelism
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 heads_k_stride: 1
-# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
-# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
-ring_attn_func:

 ...
 ```
@@ -79,22 +77,22 @@ ring_attn_func:
 This will train the Llama 3 8B model with 8K context length, with each sequence split
 into 2 subsequences of length 4096 across 2 GPUs.

-## Sample Packing with Context Parallelism
+## Sample Packing with Sequence Parallelism

-Context parallelism is compatible with Axolotl's sample packing functionality. When using both features together:
+Sequence parallelism is compatible with Axolotl's sample packing functionality. When using both features together:

 1. Samples are first packed together
-2. The packed sequences are then divided across GPUs in the context parallel group
+2. The packed sequences are then divided across GPUs in the sequence parallel group
 3. Position IDs are automatically adjusted to maintain proper relative positions

 ## Effect on Batch Size

-When using context parallelism, your effective global batch size is **divided** by the `context_parallel_degree`. This happens because:
+When using sequence parallelism, your effective global batch size is **divided** by the `sequence_parallel_degree`. This happens because:

- Each group of `context_parallel_degree` GPUs works on the same batch (just different parts of each sequence)
+- Each group of `sequence_parallel_degree` GPUs works on the same batch (just different parts of each sequence)
 - The number of batches processed per step decreases

 For example:
- With 8 GPUs and no context parallelism: 8 different batches processed per step
- With 8 GPUs and `context_parallel_degree=4`: Only 2 different batches processed per step (each split across 4 GPUs)
+- With 8 GPUs and no sequence parallelism: 8 different batches processed per step
+- With 8 GPUs and `sequence_parallel_degree=4`: Only 2 different batches processed per step (each split across 4 GPUs)
 - If your per-GPU `micro_batch_size` is 2, the global batch size decreases from 16 to 4
--- a/examples/gemma3/gemma-3-4b-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-qlora.yml
@@ -28,7 +28,7 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
--- a/examples/gemma3/gemma-3-4b-vision-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-vision-qlora.yml
@@ -30,7 +30,7 @@ pad_to_sequence_len: false
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
--- a/examples/llama-3-vision/lora-11b.yaml
+++ b/examples/llama-3-vision/lora-11b.yaml
@@ -29,7 +29,7 @@ pad_to_sequence_len: false
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
--- a/examples/llama-3/3b-qat-fsdp2.yaml
+++ b/examples/llama-3/3b-qat-fsdp2.yaml
@@ -1,79 +0,0 @@
-base_model: meta-llama/Llama-3.2-3B
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: false
-strict: false
-
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-
-liger_rope: true
-liger_rms_norm: true
-liger_glu_activation: true
-liger_layer_norm: true
-liger_fused_linear_cross_entropy: true
-
-datasets:
-  - path: yahma/alpaca-cleaned
-    type: alpaca
-
-output_dir: ./outputs/qat_out/
-
-sample_packing: true
-pad_to_sequence_len: true
-sequence_len: 512
-
-flex_attention: true
-flex_attn_compile_kwargs:
-  dynamic: false
-  mode: max-autotune-no-cudagraphs
-
-qat:
-  activation_dtype: int8
-  weight_dtype: int4
-  group_size: 32
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 1
-micro_batch_size: 16
-num_epochs: 1
-optimizer: adamw_torch_fused
-
-cosine_constant_lr_ratio: 0
-cosine_min_lr_ratio: 1.0
-learning_rate: 2e-5
-save_only_model: true
-bf16: true
-
-resume_from_checkpoint:
-logging_steps: 1
-
-evals_per_epoch: 1
-saves_per_epoch: 1
-
-warmup_steps: 10
-weight_decay: 0.0
-fsdp:
-  - full_shard
-  - auto_wrap
-
-fsdp_config:
-  fsdp_version: 2
-  fsdp_offload_params: false
-  fsdp_cpu_ram_efficient_loading: true
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
-  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_sharding_strategy: FULL_SHARD
-  fsdp_reshard_after_forward: true
-  fsdp_activation_checkpointing: true
-
-special_tokens:
-  pad_token: <|end_of_text|>
--- a/examples/llama-3/lora-1b.yml
+++ b/examples/llama-3/lora-1b.yml
@@ -5,7 +5,7 @@ base_model: NousResearch/Llama-3.2-1B
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
-
+dataset_prepared_path: last_run_prepared
 val_set_size: 0.1
 output_dir: ./outputs/lora-out

@@ -38,7 +38,6 @@ wandb_log_model:
 gradient_accumulation_steps: 2
 micro_batch_size: 2
 num_epochs: 1
-
 optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
--- a/examples/llava/lora-7b.yaml
+++ b/examples/llava/lora-7b.yaml
@@ -25,7 +25,7 @@ pad_to_sequence_len: false
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
--- a/examples/mistral/mistral-small-3.1-24B-lora.yml
+++ b/examples/mistral/mistral-small-3.1-24B-lora.yml
@@ -27,7 +27,7 @@ pad_to_sequence_len: false
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
--- a/examples/pixtral/lora-12b.yml
+++ b/examples/pixtral/lora-12b.yml
@@ -25,7 +25,7 @@ pad_to_sequence_len: false
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

 wandb_project:
 wandb_entity:
--- a/examples/qwen2/dpo.yaml
+++ b/examples/qwen2/dpo.yaml
@@ -2,6 +2,7 @@ base_model: Qwen/Qwen2.5-0.5B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name

+
 chat_template: qwen_25
 rl: dpo
 datasets:
--- a/examples/qwen3/8b-qat-fsdp2.yml
+++ b/examples/qwen3/8b-qat-fsdp2.yml
@@ -1,78 +0,0 @@
-base_model: Qwen/Qwen3-8B
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: false
-strict: false
-
-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-
-liger_rope: true
-liger_rms_norm: true
-liger_glu_activation: true
-liger_layer_norm: true
-liger_fused_linear_cross_entropy: true
-
-datasets:
-  - path: tatsu-lab/alpaca
-    type: alpaca
-
-output_dir: ./outputs/qat_out/
-
-sequence_len: 2048
-sample_packing: true
-flex_attention: true
-pad_to_sequence_len: true
-
-flex_attn_compile_kwargs:
-  dynamic: false
-  mode: max-autotune-no-cudagraphs
-
-qat:
-  activation_dtype: int8
-  weight_dtype: int4
-  group_size: 256
-  fake_quant_after_n_steps: 1000
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 1
-micro_batch_size: 2
-max_steps: 2000
-optimizer: adamw_torch_fused
-lr_scheduler: cosine
-learning_rate: 2e-5
-
-bf16: true
-tf32: true
-
-resume_from_checkpoint:
-logging_steps: 1
-
-evals_per_epoch: 1
-saves_per_epoch: 1
-
-warmup_steps: 10
-weight_decay: 0.0
-fsdp:
-  - full_shard
-  - auto_wrap
-
-fsdp_config:
-  fsdp_version: 2
-  fsdp_offload_params: false
-  fsdp_cpu_ram_efficient_loading: true
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer
-  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_sharding_strategy: FULL_SHARD
-  fsdp_reshard_after_forward: true
-  fsdp_activation_checkpointing: true
-
-special_tokens:
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,20 +6,21 @@ triton>=3.0.0
 mamba-ssm==1.2.0.post1
 xformers>=0.0.23.post1
 autoawq==0.2.7.post3
-liger-kernel==0.5.10
+liger-kernel==0.5.9
 # END section

 packaging==23.2

-huggingface_hub==0.32.2
+huggingface_hub==0.31.0
 peft==0.15.2
-transformers==4.52.3
+transformers==4.51.3
 tokenizers>=0.21.1
-accelerate==1.7.0
-datasets==3.6.0
-deepspeed>=0.17.0
-trl==0.18.1
-hf_xet==1.1.2
+accelerate==1.6.0
+datasets==3.5.1
+deepspeed>=0.15.4
+trl==0.17.0
+hf_xet==1.1.0
+hqq==0.2.5

 optimum==1.16.2
 hf_transfer
@@ -62,7 +63,7 @@ langdetect==1.0.9
 immutabledict==4.2.0
 antlr4-python3-runtime==4.13.2

-torchao==0.10.0
+torchao==0.9.0
 schedulefree==1.4.1

 axolotl-contribs-lgpl==0.0.6
--- a/scripts/cutcrossentropy_install.py
+++ b/scripts/cutcrossentropy_install.py
@@ -9,8 +9,6 @@ except ImportError as exc:
    raise ImportError("Install torch via `pip install torch`") from exc
 from packaging.version import Version as V

-USE_UV = "--uv" in sys.argv[1:]
-
 v = V(torch.__version__)

 # no cut-cross-entropy support for torch < 2.4.0
@@ -25,9 +23,7 @@ if cce_spec:
    if not importlib.util.find_spec("cut_cross_entropy.transformers"):
        UNINSTALL_PREFIX = "pip uninstall -y cut-cross-entropy && "

-UV_PREFIX = "uv " if USE_UV else ""
-
 print(
    UNINSTALL_PREFIX
-    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@a1174ca"'
+    + 'pip install "cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@bad6f7b49c75fdec69471abb71b4cddd0f0c6438"'
 )
--- a/scripts/motd
+++ b/scripts/motd
@@ -11,7 +11,7 @@
                                 =@#       @#  #@=     #@   =#@@@@#=    +#@@=  +#@@@@#=    .##@@+   @@
    @@@@  @@@@@@@@@@@@@@@@

-Welcome to the axolotl cloud image! If the you've mounted a disk to /workspace and the axolotl directory is empty, run the following commands:
+Welcome to the axolotl cloud image! If you've mounted a disk to /workspace and the axolotl directory is empty, run the following commands:

 ```
 cd /workspace
--- a/scripts/unsloth_install.py
+++ b/scripts/unsloth_install.py
@@ -1,15 +1,11 @@
 # noqa
 # pylint: skip-file
-import sys
-
 try:
    import torch
 except ImportError:
    raise ImportError("Install torch via `pip install torch`")
 from packaging.version import Version as V

-use_uv = "--uv" in sys.argv[1:]
-
 v = V(torch.__version__)
 cuda = str(torch.version.cuda)
 try:
@@ -35,7 +31,6 @@ elif v < V("2.6.0"):
 else:
    raise RuntimeError(f"Torch = {v} too new!")
 x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
-uv_prefix = "uv " if use_uv else ""
 print(
-    f'{uv_prefix}pip install unsloth-zoo==2024.12.1 && {uv_prefix}pip install --no-deps "unsloth[{x}]==2024.12.4"'
+    f'pip install unsloth-zoo==2024.12.1 && pip install --no-deps "unsloth[{x}]==2024.12.4"'
 )
--- a/setup.py
+++ b/setup.py
@@ -118,7 +118,7 @@ extras_require = {
        "yunchang==0.6.0",
    ],
    "deepspeed": [
-        "deepspeed==0.17.0",
+        "deepspeed==0.15.4",
        "deepspeed-kernels",
    ],
    "mamba-ssm": [
--- a/src/axolotl/cli/args.py
+++ b/src/axolotl/cli/args.py
@@ -28,6 +28,7 @@ class TrainerCliArgs:
    debug: bool = field(default=False)
    debug_text_only: bool = field(default=False)
    debug_num_examples: int = field(default=0)
+    merge_lora: bool = field(default=False)
    prompter: Optional[str] = field(default=None)
    shard: bool = field(default=False)
    main_process_port: Optional[int] = field(default=None)
@@ -88,26 +89,6 @@ class VllmServeCliArgs:
        },
    )

-    enable_reasoning: Optional[bool] = field(
-        default=None,
-    )
-
-    reasoning_parser: Optional[str] = field(
-        default=None,
-    )
-
-
-@dataclass
-class QuantizeCliArgs:
-    """Dataclass with CLI arguments for `axolotl quantize` command."""
-
-    base_model: Optional[str] = field(default=None)
-    weight_dtype: Optional[str] = field(default=None)
-    activation_dtype: Optional[str] = field(default=None)
-    quantize_embedding: Optional[bool] = field(default=None)
-    group_size: Optional[int] = field(default=None)
-    output_dir: Optional[str] = field(default=None)
-

@dataclass
 class EvaluateCliArgs:
--- a/src/axolotl/cli/checks.py
+++ b/src/axolotl/cli/checks.py
@@ -1,5 +1,6 @@
 """Various checks for Axolotl CLI."""

+import logging
 import os
 from pathlib import Path

@@ -7,9 +8,7 @@ from accelerate.commands.config import config_args
 from huggingface_hub import HfApi
 from huggingface_hub.utils import LocalTokenNotFoundError

-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)


 def check_accelerate_default_config() -> None:
--- a/src/axolotl/cli/cloud/modal_.py
+++ b/src/axolotl/cli/cloud/modal_.py
@@ -82,7 +82,7 @@ class ModalCloud(Cloud):
        return res

    def get_image(self):
-        docker_tag = "main-py3.11-cu124-2.6.0"
+        docker_tag = "main-py3.11-cu124-2.5.1"
        if self.config.docker_tag:
            docker_tag = self.config.docker_tag
        docker_image = f"axolotlai/axolotl:{docker_tag}"
--- a/src/axolotl/cli/config.py
+++ b/src/axolotl/cli/config.py
@@ -1,6 +1,7 @@
 """Configuration loading and processing."""

 import json
+import logging
 import os
 import tempfile
 from pathlib import Path
@@ -21,12 +22,11 @@ from axolotl.utils.config import (
    validate_config,
 )
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.logging import get_logger
 from axolotl.utils.mlflow_ import setup_mlflow_env_vars
 from axolotl.utils.trainer import prepare_opinionated_env, prepare_optim_env
 from axolotl.utils.wandb_ import setup_wandb_env_vars

-LOG = get_logger(__name__, use_environ=True)
+LOG = logging.getLogger(__name__)


 def check_remote_config(config: Union[str, Path]) -> Union[str, Path]:
@@ -119,12 +119,12 @@ def choose_config(path: Path) -> str:
        )

    if len(yaml_files) == 1:
-        LOG.info(f"Using default YAML file '{yaml_files[0]}'")
+        print(f"Using default YAML file '{yaml_files[0]}'")
        return str(yaml_files[0])

-    LOG.info("Choose a YAML file:")
+    print("Choose a YAML file:")
    for idx, file in enumerate(yaml_files):
-        LOG.info(f"{idx + 1}. {file}")
+        print(f"{idx + 1}. {file}")

    chosen_file = None
    while chosen_file is None:
@@ -133,9 +133,9 @@ def choose_config(path: Path) -> str:
            if 1 <= choice <= len(yaml_files):
                chosen_file = str(yaml_files[choice - 1])
            else:
-                LOG.info("Invalid choice. Please choose a number from the list.")
+                print("Invalid choice. Please choose a number from the list.")
        except ValueError:
-            LOG.info("Invalid input. Please enter a number.")
+            print("Invalid input. Please enter a number.")

    return chosen_file

--- a/src/axolotl/cli/evaluate.py
+++ b/src/axolotl/cli/evaluate.py
@@ -1,5 +1,6 @@
 """CLI to run evaluation on a model."""

+import logging
 import os
 from pathlib import Path
 from typing import Union
@@ -16,9 +17,8 @@ from axolotl.common.datasets import load_datasets, load_preference_datasets
 from axolotl.evaluate import evaluate
 from axolotl.utils import patch_optimized_env
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.logging import get_logger

-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)


 def do_evaluate(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
--- a/src/axolotl/cli/inference.py
+++ b/src/axolotl/cli/inference.py
@@ -1,6 +1,7 @@
 """CLI to run inference on a trained model."""

 import importlib
+import logging
 import sys
 from pathlib import Path
 from threading import Thread
@@ -21,9 +22,8 @@ from axolotl.utils.chat_templates import (
    get_chat_template_from_config,
 )
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.logging import get_logger

-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)


 def get_multi_line_input() -> str:
--- a/src/axolotl/cli/main.py
+++ b/src/axolotl/cli/main.py
@@ -2,6 +2,7 @@

 # pylint: disable=redefined-outer-name

+import logging
 import os
 import subprocess  # nosec B404
 import tempfile
@@ -16,7 +17,6 @@ import axolotl
 from axolotl.cli.args import (
    EvaluateCliArgs,
    PreprocessCliArgs,
-    QuantizeCliArgs,
    TrainerCliArgs,
    VllmServeCliArgs,
 )
@@ -30,11 +30,8 @@ from axolotl.cli.utils import (
 )
 from axolotl.integrations.lm_eval.cli import lm_eval
 from axolotl.utils import patch_optimized_env
-from axolotl.utils.logging import get_logger
 from axolotl.utils.schemas.config import AxolotlInputConfig

-LOG = get_logger(__name__)
-

@click.group()
@click.version_option(version=axolotl.__version__, prog_name="axolotl")
@@ -179,7 +176,7 @@ def train(

                    do_cli(config=cfg_file, **kwargs)
        except subprocess.CalledProcessError as exc:
-            LOG.error(f"Failed to train/fine-tune config '{cfg_file}': {exc}")
+            logging.error(f"Failed to train/fine-tune config '{cfg_file}': {exc}")
            if not sweep:
                raise exc

@@ -336,16 +333,6 @@ def vllm_serve(config: str, **cli_args: VllmServeCliArgs):
    do_vllm_serve(config, cli_args)


-@cli.command()
-@click.argument("config", type=click.Path(exists=True, path_type=str))
-@add_options_from_dataclass(QuantizeCliArgs)
-@filter_none_kwargs
-def quantize(config: str, **cli_args: QuantizeCliArgs):
-    from axolotl.cli.quantize import do_quantize
-
-    do_quantize(config, cli_args)
-
-
@cli.command()
@click.argument("model", type=click.Path(exists=True, path_type=str))
@click.argument("output", type=click.Path(exists=False, path_type=str))
--- a/src/axolotl/cli/merge_lora.py
+++ b/src/axolotl/cli/merge_lora.py
@@ -1,18 +1,20 @@
 """CLI to merge a trained LoRA into a base model."""

+import logging
 from pathlib import Path
 from typing import Union

 import fire
+import transformers
 from dotenv import load_dotenv

+from axolotl.cli.args import TrainerCliArgs
 from axolotl.cli.art import print_axolotl_text_art
 from axolotl.cli.config import load_cfg
 from axolotl.cli.utils import load_model_and_tokenizer
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.logging import get_logger

-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)


 def do_merge_lora(*, cfg: DictDefault) -> None:
@@ -66,6 +68,12 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs) -> None:
    Raises:
        ValueError: If target directory for LoRA merged model does not exist.
    """
+    # pylint: disable=duplicate-code
+    parser = transformers.HfArgumentParser(TrainerCliArgs)
+    parsed_cli_args, _ = parser.parse_args_into_dataclasses(
+        return_remaining_strings=True
+    )
+    parsed_cli_args.merge_lora = True

    parsed_cfg = load_cfg(
        config,
@@ -73,7 +81,7 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs) -> None:
        load_in_8bit=False,
        load_in_4bit=False,
        flash_attention=False,
-        context_parallel_degree=None,
+        sequence_parallel_degree=None,
        deepspeed=None,
        fsdp=None,
        fsdp_config=None,
--- a/src/axolotl/cli/merge_sharded_fsdp_weights.py
+++ b/src/axolotl/cli/merge_sharded_fsdp_weights.py
@@ -1,6 +1,7 @@
 """CLI to merge sharded FSDP model checkpoints into a single combined checkpoint."""

 import json
+import logging
 import os
 import shutil
 from pathlib import Path
@@ -10,6 +11,7 @@ import fire
 import torch
 import torch.distributed.checkpoint as dist_cp
 import torch.distributed.checkpoint.format_utils as dist_cp_format_utils
+import transformers
 from accelerate.utils import (
    SAFE_WEIGHTS_INDEX_NAME,
    SAFE_WEIGHTS_NAME,
@@ -22,11 +24,11 @@ from huggingface_hub import split_torch_state_dict_into_shards
 from safetensors.torch import save_file as safe_save_file
 from torch.distributed.checkpoint.format_utils import _EmptyStateDictLoadPlanner

+from axolotl.cli.args import TrainerCliArgs
 from axolotl.cli.art import print_axolotl_text_art
 from axolotl.cli.config import load_cfg
-from axolotl.utils.logging import get_logger

-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)


 class BFloat16CastPlanner(_EmptyStateDictLoadPlanner):
@@ -195,6 +197,11 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
    """
    # pylint: disable=duplicate-code
    print_axolotl_text_art()
+    parser = transformers.HfArgumentParser(TrainerCliArgs)
+    parsed_cli_args, _ = parser.parse_args_into_dataclasses(
+        return_remaining_strings=True
+    )
+    parsed_cli_args.merge_lora = True
    parsed_cfg = load_cfg(config, **kwargs)

    fsdp_dir = Path(parsed_cfg.output_dir) / "pytorch_model_fsdp_0"
--- a/src/axolotl/cli/preprocess.py
+++ b/src/axolotl/cli/preprocess.py
@@ -1,5 +1,6 @@
 """CLI to run preprocessing of a dataset."""

+import logging
 import warnings
 from pathlib import Path
 from typing import Union
@@ -19,10 +20,9 @@ from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
 from axolotl.common.datasets import load_datasets, load_preference_datasets
 from axolotl.integrations.base import PluginManager
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.logging import get_logger
 from axolotl.utils.trainer import disable_datasets_caching

-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)


 def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
--- a/src/axolotl/cli/quantize.py
+++ b/src/axolotl/cli/quantize.py
@@ -1,90 +0,0 @@
-"""
-CLI to post-training quantize a model using torchao
-"""
-
-from pathlib import Path
-from typing import Union
-
-from transformers import AutoModelForCausalLM
-
-from axolotl.cli.art import print_axolotl_text_art
-from axolotl.cli.config import load_cfg
-from axolotl.loaders import load_tokenizer
-from axolotl.utils.logging import get_logger
-from axolotl.utils.quantization import TorchIntDType, quantize_model_for_ptq
-
-LOG = get_logger(__name__)
-
-
-def do_quantize(
-    config: Union[Path, str],
-    cli_args: dict,
-):
-    """
-    Quantizes a model's model's weights
-
-    Args:
-        config (Union[Path, str]): The path to the config file
-        cli_args (dict): Additional command-line arguments
-    """
-    print_axolotl_text_art()
-
-    cfg = load_cfg(config)
-
-    if cfg.qat and cfg.quantization:
-        raise ValueError(
-            "QAT and quantization cannot be used together. Please specify only one of qat or quantization in your config file."
-        )
-
-    if cfg.qat:
-        quantize_cfg = cfg.qat
-    elif cfg.quantization:
-        quantize_cfg = cfg.quantization
-    else:
-        raise ValueError(
-            "No quantization configuration found. Please specify either qat or quantization in your config file."
-        )
-
-    model_path = cli_args.get("model_path") or cfg.output_dir
-    if weight_dtype := cli_args.get("weight_dtype"):
-        weight_dtype = TorchIntDType[weight_dtype]
-    else:
-        weight_dtype = quantize_cfg.weight_dtype
-    if activation_dtype := cli_args.get("activation_dtype"):
-        activation_dtype = TorchIntDType[activation_dtype]
-    else:
-        activation_dtype = quantize_cfg.activation_dtype
-    group_size = cli_args.get("group_size") or quantize_cfg.group_size
-    quantize_embedding = (
-        cli_args.get("quantize_embedding") or quantize_cfg.quantize_embedding
-    )
-    output_dir = cli_args.get("output_dir") or cfg.output_dir
-
-    LOG.info(f"Loading model from {model_path}...")
-    tokenizer = load_tokenizer(cfg)
-    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
-
-    LOG.info(
-        f"Quantizing model with configuration: \n"
-        f"\tweight_dtype: {weight_dtype}\n"
-        f"\tactivation_dtype: {activation_dtype}\n"
-        f"\tgroup_size: {group_size}\n"
-        f"\tquantize_embedding: {quantize_embedding}"
-    )
-
-    quantize_model_for_ptq(
-        model, weight_dtype, group_size, activation_dtype, quantize_embedding
-    )
-
-    LOG.info(f"Saving quantized model to: {str(Path(output_dir) / 'quantized')}...")
-    model.save_pretrained(
-        str(Path(output_dir) / "quantized"),
-        safe_serialization=False,
-        progressbar=True,
-    )
-    tokenizer.save_pretrained(
-        str(Path(output_dir) / "quantized"),
-        safe_serialization=False,
-        progressbar=True,
-    )
-    LOG.info(f"Quantized model saved to: {str(Path(output_dir) / 'quantized')}...")
--- a/src/axolotl/cli/train.py
+++ b/src/axolotl/cli/train.py
@@ -1,6 +1,7 @@
 """CLI to run training on a model."""

 import gc
+import logging
 import os
 from pathlib import Path
 from typing import Union
@@ -21,6 +22,8 @@ from axolotl.utils import patch_optimized_env
 from axolotl.utils.config import normalize_config, resolve_dtype
 from axolotl.utils.dict import DictDefault

+LOG = logging.getLogger(__name__)
+

 def do_train(cfg: DictDefault, cli_args: TrainerCliArgs):
    """
--- a/src/axolotl/cli/utils.py
+++ b/src/axolotl/cli/utils.py
@@ -4,6 +4,7 @@ import concurrent.futures
 import dataclasses
 import hashlib
 import json
+import logging
 from functools import wraps
 from pathlib import Path
 from types import NoneType
@@ -19,12 +20,10 @@ from transformers import (
    ProcessorMixin,
 )

-from axolotl.loaders import load_processor, load_tokenizer
-from axolotl.loaders.model import ModelLoader
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.logging import get_logger
+from axolotl.utils.models import load_model, load_processor, load_tokenizer

-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)


 def strip_optional_type(field_type: type | str | None):
@@ -319,8 +318,7 @@ def load_model_and_tokenizer(
    tokenizer = load_tokenizer(cfg)

    LOG.info("loading model...")
-    model_loader = ModelLoader(cfg, tokenizer, inference=inference)
-    model, _ = model_loader.load()
+    model, _ = load_model(cfg, tokenizer, inference=inference)

    processor = None
    if cfg.is_multimodal:
--- a/src/axolotl/cli/vllm_serve.py
+++ b/src/axolotl/cli/vllm_serve.py
@@ -2,27 +2,14 @@
 CLI to start the vllm server for online RL
 """

-import os
-from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Union

-import trl
 from trl.scripts.vllm_serve import ScriptArguments

 from axolotl.cli.config import load_cfg


-@dataclass
-class AxolotlScriptArguments(ScriptArguments):
-    """
-    Additional arguments for the VLLM server
-    """
-
-    reasoning_parser: str = field(default="", kw_only=True)
-    enable_reasoning: bool | None = field(default=None, kw_only=True)
-
-
 def do_vllm_serve(
    config: Union[Path, str],
    cli_args: dict,
@@ -37,7 +24,6 @@ def do_vllm_serve(
    Returns:
        process_id: the process id of the started VLLM server
    """
-    patch_vllm_worker()
    cfg = load_cfg(config)
    model = cfg.base_model

@@ -57,16 +43,9 @@ def do_vllm_serve(
    enable_prefix_caching = (
        cli_args.get("enable_prefix_caching") or cfg.vllm.enable_prefix_caching
    )
-    reasoning_parser = (
-        cli_args.get("reasoning_parser") or cfg.vllm.reasoning_parser or ""
-    )
-    enable_reasoning = (
-        cli_args.get("enable_reasoning") or cfg.vllm.enable_reasoning or False
-    )

-    # pylint: disable=unexpected-keyword-arg
-    vllm_script_args = AxolotlScriptArguments(
-        model=model,
+    vllm_script_args = ScriptArguments(
+        model,
        tensor_parallel_size=tensor_parallel_size,
        host=host,
        port=port,
@@ -74,67 +53,5 @@ def do_vllm_serve(
        dtype=dtype,
        max_model_len=max_model_len,
        enable_prefix_caching=enable_prefix_caching,
-        reasoning_parser=reasoning_parser,
-        enable_reasoning=enable_reasoning,
    )
    vllm_serve_main(vllm_script_args)
-
-
-def patch_vllm_worker():
-    from multiprocessing.connection import Connection
-
-    from vllm import LLM
-
-    def llm_worker(
-        script_args: AxolotlScriptArguments,
-        data_parallel_rank: int,
-        master_port: int,
-        connection: Connection,
-    ) -> None:
-        # Set required environment variables for DP to work with vLLM
-        os.environ["VLLM_DP_RANK"] = str(data_parallel_rank)
-        os.environ["VLLM_DP_RANK_LOCAL"] = str(data_parallel_rank)
-        os.environ["VLLM_DP_SIZE"] = str(script_args.data_parallel_size)
-        os.environ["VLLM_DP_MASTER_PORT"] = str(master_port)
-
-        llm = LLM(
-            model=script_args.model,
-            revision=script_args.revision,
-            tensor_parallel_size=script_args.tensor_parallel_size,
-            gpu_memory_utilization=script_args.gpu_memory_utilization,
-            enforce_eager=script_args.enforce_eager,
-            dtype=script_args.dtype,
-            # Automatic Prefix Caching caches the KV cache of existing queries, so that a new query can
-            # directly reuse the KV cache if it shares the same prefix with one of the existing queries.
-            # This is particularly useful here because we generate completions from the same prompts.
-            enable_prefix_caching=script_args.enable_prefix_caching,
-            kv_cache_dtype=script_args.kv_cache_dtype,
-            max_model_len=script_args.max_model_len,
-            worker_extension_cls="trl.scripts.vllm_serve.WeightSyncWorkerExtension",
-            enable_reasoning=script_args.enable_reasoning,
-            reasoning_parser=script_args.reasoning_parser,
-        )
-
-        # Send ready signal to parent process
-        connection.send({"status": "ready"})
-
-        while True:
-            # Wait for commands from the parent process
-            try:
-                command = connection.recv()
-            except KeyboardInterrupt:
-                llm.collective_rpc(method="close_communicator")
-                break
-
-            # Handle commands
-            if command["type"] in ["call", "fire_and_forget"]:
-                method_name = command["method"]
-                args, kwargs = command.get("args", ()), command.get("kwargs", {})
-                method = getattr(llm, method_name)
-                result = method(*args, **kwargs)
-                if command["type"] == "call":
-                    connection.send(result)
-            elif command["type"] == "shutdown":
-                break
-
-    trl.scripts.vllm_serve.llm_worker = llm_worker
--- a/src/axolotl/common/datasets.py
+++ b/src/axolotl/common/datasets.py
@@ -1,5 +1,6 @@
 """Dataset loading utilities."""

+import logging
 import math
 import random
 from dataclasses import dataclass
@@ -9,15 +10,14 @@ from datasets import Dataset

 import axolotl.monkeypatch.data.batch_dataset_fetcher  # pylint: disable=unused-import  # noqa: F401
 from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs
-from axolotl.loaders import load_processor, load_tokenizer
 from axolotl.utils.data import prepare_dataset
 from axolotl.utils.data.rl import load_prepare_preference_datasets
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.logging import get_logger
+from axolotl.utils.models import load_processor, load_tokenizer
 from axolotl.utils.schemas.enums import RLType
 from axolotl.utils.tokenization import check_dataset_labels

-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)


@dataclass
--- a/src/axolotl/core/builders/init.py
+++ b/src/axolotl/core/builders/init.py
@@ -1,6 +0,0 @@
-"""Trainer builder classes"""
-
-from .causal import HFCausalTrainerBuilder
-from .rl import HFRLTrainerBuilder
-
-__all__ = ["HFCausalTrainerBuilder", "HFRLTrainerBuilder"]
--- a/src/axolotl/core/builders/base.py
+++ b/src/axolotl/core/builders/base.py
@@ -1,503 +0,0 @@
-# Copyright 2024 Axolotl AI. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Base class for trainer builder"""
-
-import abc
-import importlib
-import logging
-import sys
-from abc import abstractmethod
-from contextlib import suppress
-from pathlib import Path
-from typing import Any
-
-import torch
-from transformers import (
-    TrainerCallback,
-)
-from transformers.training_args import OptimizerNames
-
-from axolotl.integrations.base import PluginManager
-from axolotl.monkeypatch.trainer.lr import patch_trainer_get_lr
-from axolotl.utils import is_comet_available, is_mlflow_available
-from axolotl.utils.callbacks import (
-    GCCallback,
-    GPUStatsCallback,
-    SaveAxolotlConfigtoWandBCallback,
-)
-from axolotl.utils.callbacks.profiler import PytorchProfilerCallback
-from axolotl.utils.schemas.enums import CustomSupportedOptimizers
-
-LOG = logging.getLogger(__name__)
-
-with suppress(ImportError):
-    import torch._dynamo  # pylint: disable=ungrouped-imports
-
-
-class TrainerBuilderBase(abc.ABC):
-    """Base class for trainer builder."""
-
-    def __init__(self, cfg, model, tokenizer, processor=None):
-        self.cfg = cfg
-        self.model = model
-        self.tokenizer = tokenizer
-        self.processor = processor
-
-        self._train_dataset = None
-        self._eval_dataset = None
-        self._model_ref = None
-        self._peft_config = None
-
-        # If the model supports tagging, add the axolotl tag.
-        # This makes sure the tag is correctly pushed even if a user calls
-        # model.push_to_hub instead of trainer.push_to_hub.
-        if hasattr(model, "add_model_tags"):
-            model.add_model_tags(["axolotl"])
-
-        patch_trainer_get_lr()
-
-    @property
-    def model_ref(self):
-        return self._model_ref
-
-    @model_ref.setter
-    def model_ref(self, model):
-        self._model_ref = model
-
-    @property
-    def train_dataset(self):
-        return self._train_dataset
-
-    @train_dataset.setter
-    def train_dataset(self, dataset):
-        self._train_dataset = dataset
-
-    @property
-    def eval_dataset(self):
-        return self._eval_dataset
-
-    @eval_dataset.setter
-    def eval_dataset(self, dataset):
-        self._eval_dataset = dataset
-
-    @property
-    def peft_config(self):
-        return self._peft_config
-
-    @peft_config.setter
-    def peft_config(self, peft_config):
-        self._peft_config = peft_config
-
-    @abstractmethod
-    def build(self, total_num_steps):
-        pass
-
-    def get_callbacks(self) -> list[TrainerCallback]:
-        callbacks = []
-
-        plugin_manager = PluginManager.get_instance()
-        callbacks.extend(
-            plugin_manager.add_callbacks_pre_trainer(cfg=self.cfg, model=self.model)
-        )
-
-        if self.cfg.profiler_steps:
-            callbacks.append(
-                PytorchProfilerCallback(
-                    steps_to_profile=self.cfg.profiler_steps,
-                )
-            )
-
-        if self.cfg.gc_steps:
-            callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))
-
-        if self.cfg.use_wandb:
-            callbacks.append(
-                SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
-            )
-        if self.cfg.use_mlflow and is_mlflow_available():
-            from axolotl.utils.callbacks.mlflow_ import (
-                SaveAxolotlConfigtoMlflowCallback,
-            )
-
-            callbacks.extend(
-                [
-                    SaveAxolotlConfigtoMlflowCallback(self.cfg.axolotl_config_path),
-                ]
-            )
-        if self.cfg.use_comet and is_comet_available():
-            from axolotl.utils.callbacks.comet_ import SaveAxolotlConfigtoCometCallback
-
-            callbacks.append(
-                SaveAxolotlConfigtoCometCallback(self.cfg.axolotl_config_path)
-            )
-
-        callbacks.append(GPUStatsCallback(cfg=self.cfg))
-
-        return callbacks
-
-    def get_post_trainer_create_callbacks(self, trainer):
-        """
-        Callbacks added after the trainer is created, usually b/c these need access to the trainer
-        """
-        callbacks = []
-        if self.cfg.plugins:
-            plugin_manager = PluginManager.get_instance()
-            callbacks.extend(
-                [
-                    cb
-                    for cb in plugin_manager.add_callbacks_post_trainer(
-                        self.cfg, trainer
-                    )
-                    if cb
-                ]
-            )
-        return callbacks
-
-    def hook_pre_create_training_args(self, training_arguments_kwargs):
-        # TODO
-        return training_arguments_kwargs
-
-    def hook_post_create_training_args(self, training_arguments):
-        # TODO
-        return training_arguments
-
-    def hook_pre_create_trainer(self, trainer_kwargs, trainer_cls):
-        # TODO
-        return trainer_kwargs, trainer_cls
-
-    def hook_post_create_trainer(self, trainer):
-        # TODO
-        return trainer
-
-    def _configure_warmup_and_logging(
-        self, total_num_steps: int, training_args_kwargs: dict
-    ):
-        warmup_steps = 0
-        warmup_ratio = 0.0
-        if self.cfg.warmup_steps:
-            warmup_steps = self.cfg.warmup_steps
-        elif self.cfg.warmup_ratio:
-            if total_num_steps:
-                warmup_steps = max(int(self.cfg.warmup_ratio * total_num_steps), 0)
-            else:
-                warmup_ratio = self.cfg.warmup_ratio
-        elif total_num_steps:
-            warmup_steps = min(int(0.03 * total_num_steps), 100)
-        else:
-            warmup_ratio = 0.03
-
-        if warmup_steps == 1:
-            warmup_steps = 2
-
-        if self.cfg.logging_steps is not None:
-            training_args_kwargs["logging_steps"] = self.cfg.logging_steps
-        else:
-            training_args_kwargs["logging_steps"] = (
-                500  # transformers defaults to 500
-                if not total_num_steps
-                else max(min(int(0.005 * total_num_steps), 10), 1)
-            )
-
-        training_args_kwargs["warmup_ratio"] = warmup_ratio
-        training_args_kwargs["warmup_steps"] = warmup_steps
-
-    def _configure_precision_settings(self, training_args_kwargs: dict):
-        training_args_kwargs["fp16"] = (self.cfg.fp16 and not self.cfg.bf16) or False
-        training_args_kwargs["tf32"] = self.cfg.tf32
-        if self.cfg.bf16 == "full":
-            training_args_kwargs["bf16_full_eval"] = True
-        else:
-            training_args_kwargs["bf16"] = self.cfg.bf16 or self.cfg.bfloat16
-
-    def _configure_scheduler(self, training_args_kwargs: dict):
-        if self.cfg.lr_scheduler in ["one_cycle", "rex"]:
-            training_args_kwargs["lr_scheduler_type"] = "cosine"
-            training_args_kwargs["alternate_lr_scheduler_type"] = self.cfg.lr_scheduler
-        else:
-            training_args_kwargs["lr_scheduler_type"] = (
-                self.cfg.lr_scheduler if self.cfg.lr_scheduler else "cosine"
-            )
-        training_args_kwargs["lr_scheduler_kwargs"] = (
-            self.cfg.lr_scheduler_kwargs if self.cfg.lr_scheduler_kwargs else {}
-        )
-
-    def _configure_optimizer(self, training_args_kwargs: dict, trainer_kwargs: dict):
-        def _configure_custom_optimizer(
-            training_args_kwargs: dict, trainer_kwargs: dict
-        ):
-            # Common optimizer kwargs
-            optimizer_kwargs = {
-                "lr": training_args_kwargs["learning_rate"],
-                "weight_decay": training_args_kwargs["weight_decay"],
-            }
-
-            # Adam-specific kwargs
-            adam_kwargs: dict = {}
-            if training_args_kwargs.get("adam_beta1") and training_args_kwargs.get(
-                "adam_beta2"
-            ):
-                adam_kwargs["betas"] = (
-                    training_args_kwargs.get("adam_beta1"),
-                    training_args_kwargs.get("adam_beta2"),
-                )
-            if training_args_kwargs.get("adam_epsilon"):
-                adam_kwargs["eps"] = training_args_kwargs.get("adam_epsilon")
-
-            if self.cfg.optimizer == "muon":
-                from axolotl.contribs.mit.muon import (  # pylint: disable=no-name-in-module
-                    MuonOptimizerFactory,
-                )
-
-                optimizer_cls = MuonOptimizerFactory
-                optimizer_kwargs.update(adam_kwargs)
-            elif self.cfg.optimizer == "optimi_adamw":
-                from optimi import AdamW
-
-                optimizer_kwargs["foreach"] = False
-                optimizer_cls = AdamW
-                optimizer_kwargs.update(adam_kwargs)
-            elif self.cfg.optimizer == "ao_adamw_4bit":
-                # TODO remove 20250401
-                from torchao.prototype.low_bit_optim import AdamW4bit
-
-                optimizer_cls = AdamW4bit
-                optimizer_kwargs.update(adam_kwargs)
-
-                LOG.warning(
-                    f"`ao_adamw_4bit` will be deprecated soon. Please use `{OptimizerNames.ADAMW_TORCH_4BIT}` instead."
-                )
-            elif self.cfg.optimizer == "ao_adamw_8bit":
-                from torchao.prototype.low_bit_optim import AdamW8bit
-
-                optimizer_cls = AdamW8bit
-                optimizer_kwargs.update(adam_kwargs)
-            elif self.cfg.optimizer == "ao_adamw_fp8":
-                from torchao.prototype.low_bit_optim import AdamWFp8
-
-                optimizer_cls = AdamWFp8
-                optimizer_kwargs.update(adam_kwargs)
-            elif self.cfg.optimizer == "adopt_adamw":
-                from axolotl.utils.optimizers.adopt import ADOPT
-
-                optimizer_cls = ADOPT
-                adam_kwargs["decouple"] = True
-                optimizer_kwargs.update(adam_kwargs)
-            elif self.cfg.optimizer == "came_pytorch":
-                from came_pytorch import CAME
-
-                optimizer_cls = CAME
-
-                beta1 = training_args_kwargs.get("adam_beta1", 0.9)
-                beta2 = training_args_kwargs.get("adam_beta2", 0.999)
-                beta3 = training_args_kwargs.get("adam_beta3", 0.9999)
-                eps1 = training_args_kwargs.get("adam_epsilon", 1e-30)
-                eps2 = training_args_kwargs.get("adam_epsilon2", 1e-16)
-                adam_kwargs["betas"] = (beta1, beta2, beta3)
-                adam_kwargs["eps"] = (eps1, eps2)
-
-                optimizer_kwargs.update(adam_kwargs)
-            else:
-                raise ValueError(
-                    f"Unhandled optimizer: {self.cfg.optimizer}. Please raise an Issue."
-                )
-
-            # Parse any additional optimizer args from config
-            if self.cfg.optim_args:
-                if isinstance(self.cfg.optim_args, dict):
-                    optimizer_kwargs.update(self.cfg.optim_args)
-                else:
-                    # Parse string format "key1=value1,key2=value2"
-                    for mapping in self.cfg.optim_args.replace(" ", "").split(","):
-                        key, value = mapping.split("=")
-                        optimizer_kwargs[key] = value
-
-            # Note: This is not used in training_args_kwargs, but in trainer_kwargs
-            trainer_kwargs["optimizer_cls_and_kwargs"] = (
-                optimizer_cls,
-                optimizer_kwargs,
-            )
-
-        # Handle custom optimizer
-        custom_supported_optimizers = [opt.value for opt in CustomSupportedOptimizers]
-        if self.cfg.optimizer in custom_supported_optimizers:
-            _configure_custom_optimizer(training_args_kwargs, trainer_kwargs)
-        else:
-            # Use transformers' optimizer
-            training_args_kwargs["optim"] = self.cfg.optimizer
-
-            # Parse any additional optimizer args from config
-            if self.cfg.optim_args:
-                if isinstance(self.cfg.optim_args, dict):
-                    optim_args = ",".join(
-                        [f"{key}={value}" for key, value in self.cfg.optim_args.items()]
-                    )
-                else:
-                    optim_args = self.cfg.optim_args
-                training_args_kwargs["optim_args"] = optim_args
-
-            if (
-                self.cfg.optimizer == "adamw_anyprecision"
-                and Path(self.cfg.torchdistx_path).exists()
-            ):
-                sys.path.append(self.cfg.torchdistx_path)
-                importlib.import_module("torchdistx")
-
-    def _configure_hub_parameters(self, training_args_kwargs: dict):
-        if self.cfg.hub_model_id:
-            training_args_kwargs["hub_model_id"] = self.cfg.hub_model_id
-            training_args_kwargs["push_to_hub"] = True
-            training_args_kwargs["hub_private_repo"] = True
-            training_args_kwargs["hub_always_push"] = True
-
-            if self.cfg.hub_strategy:
-                training_args_kwargs["hub_strategy"] = self.cfg.hub_strategy
-
-    def _configure_save_and_eval_strategy(self, training_args_kwargs: dict):
-        # save_strategy and save_steps
-        if self.cfg.save_steps:
-            training_args_kwargs["save_strategy"] = "steps"
-            training_args_kwargs["save_steps"] = self.cfg.save_steps
-        elif self.cfg.save_strategy:
-            training_args_kwargs["save_strategy"] = self.cfg.save_strategy
-        else:
-            # default to saving each epoch if not defined
-            training_args_kwargs["save_strategy"] = "epoch"
-
-        training_args_kwargs["save_total_limit"] = (
-            self.cfg.save_total_limit if self.cfg.save_total_limit else 4
-        )
-
-        # eval_strategy and eval_steps
-        if not self.eval_dataset or self.cfg.val_set_size == 0:
-            # do not eval if no eval_dataset or val_set_size=0
-            training_args_kwargs["eval_strategy"] = "no"
-        elif self.cfg.eval_steps:
-            training_args_kwargs["eval_strategy"] = "steps"
-            training_args_kwargs["eval_steps"] = self.cfg.eval_steps
-        elif self.cfg.eval_strategy:
-            training_args_kwargs["eval_strategy"] = self.cfg.eval_strategy
-
-    def _configure_reporting(self, training_args_kwargs: dict):
-        report_to = []
-        if self.cfg.use_wandb:
-            report_to.append("wandb")
-        if self.cfg.use_mlflow:
-            report_to.append("mlflow")
-        if self.cfg.use_tensorboard:
-            report_to.append("tensorboard")
-        if self.cfg.use_comet:
-            report_to.append("comet_ml")
-
-        training_args_kwargs["report_to"] = report_to
-
-        if self.cfg.use_wandb:
-            training_args_kwargs["run_name"] = self.cfg.wandb_name
-        elif self.cfg.use_mlflow:
-            training_args_kwargs["run_name"] = self.cfg.mlflow_run_name
-        else:
-            training_args_kwargs["run_name"] = None
-
-    def _configure_torch_compile(self, training_args_kwargs: dict):
-        if self.cfg.torch_compile and getattr(torch, "_dynamo", None):
-            torch._dynamo.config.suppress_errors = (  # pylint: disable=protected-access
-                True
-            )
-            training_args_kwargs["torch_compile"] = self.cfg.torch_compile
-            if self.cfg.torch_compile_backend:
-                training_args_kwargs["torch_compile_backend"] = (
-                    self.cfg.torch_compile_backend
-                )
-            if self.cfg.torch_compile_mode:
-                training_args_kwargs["torch_compile_mode"] = self.cfg.torch_compile_mode
-
-    def _configure_gradient_checkpointing(self, training_args_kwargs: dict):
-        if self.cfg.gradient_checkpointing:
-            training_args_kwargs["gradient_checkpointing"] = (
-                self.cfg.gradient_checkpointing
-            )
-            if self.cfg.gradient_checkpointing_kwargs is not None:
-                training_args_kwargs["gradient_checkpointing_kwargs"] = (
-                    self.cfg.gradient_checkpointing_kwargs
-                )
-            else:
-                training_args_kwargs["gradient_checkpointing_kwargs"] = {
-                    "use_reentrant": False
-                }
-
-    def _set_base_training_args(
-        self, total_num_steps
-    ) -> tuple[dict[str, Any], dict[str, Any]]:
-        training_args_kwargs: dict[str, Any] = {}
-        trainer_kwargs: dict[str, Any] = {}
-
-        self._configure_warmup_and_logging(total_num_steps, training_args_kwargs)
-        self._configure_precision_settings(training_args_kwargs)
-        self._configure_save_and_eval_strategy(training_args_kwargs)
-        self._configure_gradient_checkpointing(training_args_kwargs)
-
-        # set arg into trainer_args_kwargs with same name if value not None
-        for arg in [
-            # optim/scheduler
-            "adam_beta1",
-            "adam_beta2",
-            "adam_beta3",
-            "adam_epsilon",
-            "adam_epsilon2",
-            "cosine_min_lr_ratio",
-            "cosine_constant_lr_ratio",
-            "optim_target_modules",
-            # trainer
-            "max_grad_norm",
-            "dataloader_num_workers",
-            "dataloader_pin_memory",
-            "dataloader_prefetch_factor",
-            "gradient_accumulation_steps",
-            "learning_rate",
-            "embedding_lr",
-            "embedding_lr_scale",
-            "lr_groups",
-            "loraplus_lr_ratio",
-            "loraplus_lr_embedding",
-            "output_dir",
-            "save_safetensors",
-            "save_only_model",
-            "include_tokens_per_second",
-            "weight_decay",
-            "seed",
-        ]:
-            if hasattr(self.cfg, arg) and getattr(self.cfg, arg) is not None:
-                training_args_kwargs[arg] = getattr(self.cfg, arg)
-
-        training_args_kwargs["per_device_train_batch_size"] = self.cfg.micro_batch_size
-
-        if self.cfg.eval_batch_size:
-            training_args_kwargs["per_device_eval_batch_size"] = (
-                self.cfg.eval_batch_size
-            )
-
-        training_args_kwargs["max_steps"] = self.cfg.max_steps or total_num_steps or -1
-        training_args_kwargs["num_train_epochs"] = self.cfg.num_epochs
-
-        # max_length is not used in CausalTrainer
-        if self.cfg.reward_model or self.cfg.rl:
-            training_args_kwargs["max_length"] = self.cfg.sequence_len
-
-        self._configure_reporting(training_args_kwargs)
-        self._configure_hub_parameters(training_args_kwargs)
-        self._configure_scheduler(training_args_kwargs)
-        self._configure_optimizer(training_args_kwargs, trainer_kwargs)
-        self._configure_torch_compile(training_args_kwargs)
-
-        return training_args_kwargs, trainer_kwargs
--- a/src/axolotl/core/builders/causal.py
+++ b/src/axolotl/core/builders/causal.py
@@ -1,489 +0,0 @@
-"""Builder for causal trainers"""
-
-import inspect
-import math
-import os
-from pathlib import Path
-from typing import Type, Union
-
-import transformers
-from transformers import (
-    DataCollatorWithFlattening,
-    EarlyStoppingCallback,
-)
-from trl.trainer.utils import RewardDataCollatorWithPadding
-
-from axolotl.core.builders.base import TrainerBuilderBase
-from axolotl.core.trainers import (
-    AxolotlMambaTrainer,
-    AxolotlPRMTrainer,
-    AxolotlRewardTrainer,
-    AxolotlTrainer,
-    ReLoRATrainer,
-)
-from axolotl.core.training_args import (
-    AxolotlPRMConfig,
-    AxolotlRewardConfig,
-    AxolotlTrainingArguments,
-)
-from axolotl.integrations.base import PluginManager
-from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
-from axolotl.monkeypatch.relora import ReLoRACallback
-from axolotl.processing_strategies import get_processing_strategy
-from axolotl.utils import is_comet_available, is_mlflow_available
-from axolotl.utils.callbacks import (
-    EvalFirstStepCallback,
-    LossWatchDogCallback,
-    SaveBetterTransformerModelCallback,
-    bench_eval_callback_factory,
-    causal_lm_bench_eval_callback_factory,
-    colab_inference_post_train_callback,
-    log_prediction_callback_factory,
-)
-from axolotl.utils.callbacks.lisa import lisa_callback_factory
-from axolotl.utils.callbacks.qat import QATCallback
-from axolotl.utils.chat_templates import get_chat_template_from_config
-from axolotl.utils.collators import (
-    BatchSamplerDataCollatorForSeq2Seq,
-    DataCollatorForSeq2Seq,
-    MambaDataCollator,
-    V2BatchSamplerDataCollatorForSeq2Seq,
-)
-from axolotl.utils.collators.mm_chat import MultiModalChatDataCollator
-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
-
-
-class HFCausalTrainerBuilder(TrainerBuilderBase):
-    """
-    Build the HuggingFace training args/trainer for causal models and reward modeling
-    using TRL.
-    """
-
-    def get_callbacks(self):
-        callbacks = super().get_callbacks()
-        callbacks.append(EvalFirstStepCallback())
-
-        if self.cfg.relora_steps:
-            callbacks.append(ReLoRACallback(self.cfg))
-
-        if (
-            hasattr(self.model, "use_bettertransformer")
-            and self.model.use_bettertransformer is True
-        ):
-            callbacks.append(SaveBetterTransformerModelCallback())
-
-        # TODO: check if can move to base class
-        if self.cfg.loss_watchdog_threshold is not None:
-            callbacks.append(LossWatchDogCallback(self.cfg))
-
-        if self.cfg.qat:
-            callbacks.append(QATCallback(self.cfg.qat))
-
-        return callbacks
-
-    def get_post_trainer_create_callbacks(self, trainer):
-        callbacks = []
-        if self.cfg.use_wandb and self.cfg.eval_table_size > 0:
-            LogPredictionCallback = log_prediction_callback_factory(
-                trainer, self.tokenizer, "wandb"
-            )
-            callbacks.append(LogPredictionCallback(self.cfg))
-        if (
-            self.cfg.use_mlflow
-            and is_mlflow_available()
-            and self.cfg.eval_table_size > 0
-        ):
-            LogPredictionCallback = log_prediction_callback_factory(
-                trainer, self.tokenizer, "mlflow"
-            )
-            callbacks.append(LogPredictionCallback(self.cfg))
-        if self.cfg.use_comet and is_comet_available() and self.cfg.eval_table_size > 0:
-            LogPredictionCallback = log_prediction_callback_factory(
-                trainer, self.tokenizer, "comet_ml"
-            )
-            callbacks.append(LogPredictionCallback(self.cfg))
-
-        if self.cfg.do_bench_eval:
-            callbacks.append(bench_eval_callback_factory(trainer, self.tokenizer))
-        if self.cfg.do_causal_lm_eval:
-            CausalLMBenchEvalCallback = causal_lm_bench_eval_callback_factory(
-                trainer, self.tokenizer
-            )
-            callbacks.append(CausalLMBenchEvalCallback(self.cfg))
-
-        if self.cfg.early_stopping_patience:
-            early_stop_cb = EarlyStoppingCallback(
-                self.cfg.early_stopping_patience,
-            )
-            callbacks.append(early_stop_cb)
-
-        if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
-            callbacks.append(lisa_callback_factory(trainer))
-
-        if any("COLAB_" in key for key in os.environ):
-            ColabCallback = colab_inference_post_train_callback(trainer)
-            callbacks.append(ColabCallback(self.cfg))
-
-        callbacks.extend(super().get_post_trainer_create_callbacks(trainer=trainer))
-        return callbacks
-
-    def _get_trainer_cls(self):
-        if self.cfg.plugins:
-            plugin_manager = PluginManager.get_instance()
-            trainer_cls = plugin_manager.get_trainer_cls(self.cfg)
-            if trainer_cls:
-                return trainer_cls
-        if self.cfg.relora_steps:
-            return ReLoRATrainer
-        if self.cfg.model_config_type == "mamba":
-            return AxolotlMambaTrainer
-        if self.cfg.reward_model:
-            return AxolotlRewardTrainer
-        if self.cfg.process_reward_model:
-            return AxolotlPRMTrainer
-        return AxolotlTrainer
-
-    def build(self, total_num_steps):
-        training_arguments_kwargs, trainer_kwargs = self._set_base_training_args(
-            total_num_steps
-        )
-
-        if self.cfg.fsdp:
-            training_arguments_kwargs["fsdp"] = self.cfg.fsdp
-            if self.cfg.fsdp_config:
-                training_arguments_kwargs["fsdp_config"] = {
-                    k.lstrip("fsdp_"): v for k, v in dict(self.cfg.fsdp_config).items()
-                }
-
-        if self.cfg.adapter == "qlora":
-            training_arguments_kwargs["qlora"] = True
-
-        # deepspeed
-        if self.cfg.deepspeed:
-            training_arguments_kwargs["deepspeed"] = self.cfg.deepspeed
-
-        if self.cfg.lr_quadratic_warmup is not None:
-            training_arguments_kwargs["lr_quadratic_warmup"] = (
-                self.cfg.lr_quadratic_warmup
-            )
-
-        if self.cfg.dataloader_drop_last is not None:
-            training_arguments_kwargs["dataloader_drop_last"] = (
-                self.cfg.dataloader_drop_last
-            )
-        elif self.cfg.sample_packing and self.cfg.eval_sample_packing is False:
-            training_arguments_kwargs["dataloader_drop_last"] = True
-
-        if self.cfg.remove_unused_columns is not None:
-            training_arguments_kwargs["remove_unused_columns"] = (
-                self.cfg.remove_unused_columns
-            )
-
-        if self.cfg.do_bench_eval:
-            training_arguments_kwargs["do_bench_eval"] = self.cfg.do_bench_eval
-            if self.cfg.bench_dataset:
-                training_arguments_kwargs["bench_dataset"] = self.cfg.bench_dataset
-        if self.cfg.do_causal_lm_eval:
-            training_arguments_kwargs["do_causal_lm_eval"] = self.cfg.do_causal_lm_eval
-        if self.cfg.metric_for_best_model:
-            training_arguments_kwargs["metric_for_best_model"] = (
-                self.cfg.metric_for_best_model
-            )
-        if self.cfg.greater_is_better:
-            training_arguments_kwargs["greater_is_better"] = self.cfg.greater_is_better
-
-        # DDP Config
-        if self.cfg.ddp_timeout:
-            training_arguments_kwargs["ddp_timeout"] = self.cfg.ddp_timeout
-        # see https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html
-        if self.cfg.ddp_bucket_cap_mb:
-            training_arguments_kwargs["ddp_bucket_cap_mb"] = self.cfg.ddp_bucket_cap_mb
-        if self.cfg.ddp_broadcast_buffers is not None:
-            training_arguments_kwargs["ddp_broadcast_buffers"] = (
-                self.cfg.ddp_broadcast_buffers
-            )
-
-        # these are all the "standard" kwargs that are def used
-        training_arguments_kwargs["max_seq_length"] = self.cfg.sequence_len
-
-        if self.cfg.auto_find_batch_size is not None:
-            training_arguments_kwargs["auto_find_batch_size"] = (
-                self.cfg.auto_find_batch_size
-            )
-
-        training_arguments_kwargs["eval_accumulation_steps"] = (
-            self.cfg.gradient_accumulation_steps
-        )
-
-        training_arguments_kwargs["load_best_model_at_end"] = (
-            (
-                self.cfg.load_best_model_at_end is not False
-                or self.cfg.early_stopping_patience
-            )
-            and (
-                (not self.cfg.test_datasets and self.cfg.val_set_size > 0)
-                or (self.cfg.test_datasets and self.cfg.val_set_size == 0)
-            )
-            and self.cfg.save_steps
-            and self.cfg.eval_steps
-            and self.cfg.save_steps % self.cfg.eval_steps == 0
-        ) or False
-
-        # handle ddp
-        ddp_find_unused_parameters = None
-        if self.cfg.ddp:
-            ddp_find_unused_parameters = bool(self.cfg.ddp_find_unused_parameters)
-        training_arguments_kwargs["ddp_find_unused_parameters"] = (
-            ddp_find_unused_parameters
-        )
-
-        training_arguments_kwargs["group_by_length"] = self.cfg.group_by_length
-        training_arguments_kwargs["curriculum_sampling"] = self.cfg.curriculum_sampling
-
-        training_arguments_kwargs["sample_packing"] = bool(self.cfg.sample_packing)
-        training_arguments_kwargs["multipack_real_batches"] = (
-            self.cfg.multipack_real_batches
-            if self.cfg.multipack_real_batches is not None
-            else not self.cfg.flash_attention
-        )
-        training_arguments_kwargs["eval_sample_packing"] = bool(
-            self.cfg.eval_sample_packing
-        )
-        if self.cfg.sample_packing_bin_size is not None:
-            training_arguments_kwargs["sample_packing_bin_size"] = (
-                self.cfg.sample_packing_bin_size
-            )
-        if self.cfg.sample_packing_group_size is not None:
-            training_arguments_kwargs["sample_packing_group_size"] = (
-                self.cfg.sample_packing_group_size
-            )
-        if self.cfg.sample_packing_eff_est:
-            training_arguments_kwargs["sample_packing_efficiency"] = (
-                self.cfg.sample_packing_eff_est
-            )
-
-        if self.cfg.relora_steps:
-            training_arguments_kwargs["relora_steps"] = self.cfg.relora_steps
-            training_arguments_kwargs["relora_warmup_steps"] = (
-                self.cfg.relora_warmup_steps
-            )
-            if self.cfg.relora_anneal_steps:
-                training_arguments_kwargs["relora_anneal_steps"] = (
-                    self.cfg.relora_anneal_steps
-                )
-            if self.cfg.relora_prune_ratio:
-                training_arguments_kwargs["relora_prune_ratio"] = (
-                    self.cfg.relora_prune_ratio
-                )
-
-        if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
-            training_arguments_kwargs["lisa_n_layers"] = self.cfg.lisa_n_layers
-            training_arguments_kwargs["lisa_step_interval"] = (
-                self.cfg.lisa_step_interval
-            )
-            training_arguments_kwargs["lisa_layers_attribute"] = (
-                self.cfg.lisa_layers_attribute
-            )
-
-        training_arguments_kwargs = self.hook_pre_create_training_args(
-            training_arguments_kwargs
-        )
-        training_arguments_kwargs["model_type"] = self.cfg.model_config_type
-        training_arguments_kwargs["pretraining"] = bool(self.cfg.pretraining_dataset)
-        if self.cfg.chat_template:
-            training_arguments_kwargs["chat_template"] = get_chat_template_from_config(
-                cfg=self.cfg,
-                tokenizer=self.tokenizer,
-            )
-
-        if self.cfg.neftune_noise_alpha is not None:
-            training_arguments_kwargs["neftune_noise_alpha"] = (
-                self.cfg.neftune_noise_alpha
-            )
-
-        if self.cfg.accelerator_config:
-            training_arguments_kwargs["accelerator_config"] = (
-                self.cfg.accelerator_config
-            )
-
-        if self.cfg.image_size:
-            training_arguments_kwargs["image_size"] = self.cfg.image_size
-        if self.cfg.image_resize_algorithm:
-            training_arguments_kwargs["image_resize_algorithm"] = (
-                self.cfg.image_resize_algorithm
-            )
-        if self.cfg.kd_ce_alpha is not None:
-            training_arguments_kwargs["kd_ce_alpha"] = self.cfg.kd_ce_alpha
-        if self.cfg.kd_alpha is not None:
-            training_arguments_kwargs["kd_alpha"] = self.cfg.kd_alpha
-        if self.cfg.kd_temperature is not None:
-            training_arguments_kwargs["kd_temperature"] = self.cfg.kd_temperature
-        if self.cfg.kd_zscore_base_temp is not None:
-            training_arguments_kwargs["kd_zscore_base_temp"] = (
-                self.cfg.kd_zscore_base_temp
-            )
-        if self.cfg.kd_top_k_before_softmax is not None:
-            training_arguments_kwargs["kd_top_k_before_softmax"] = (
-                self.cfg.kd_top_k_before_softmax
-            )
-
-        if self.cfg.reward_model:
-            training_args_cls = AxolotlRewardConfig
-        elif self.cfg.process_reward_model:
-            training_args_cls = AxolotlPRMConfig
-        else:
-            training_args_cls = AxolotlTrainingArguments
-        training_args = training_args_cls(  # pylint: disable=unexpected-keyword-arg
-            **training_arguments_kwargs,
-        )
-        training_args = self.hook_post_create_training_args(training_args)
-
-        # unset run_name so wandb sets up experiment names
-        if self.cfg.use_wandb and training_args.run_name == training_args.output_dir:
-            training_args.run_name = (  # pylint: disable=attribute-defined-outside-init
-                None
-            )
-
-        data_collator_kwargs = {
-            "padding": True,  # True/"longest" is the default
-        }
-        multiple = 64
-        if self.cfg.pad_to_sequence_len:
-            data_collator_kwargs["pad_to_multiple_of"] = multiple * math.ceil(
-                self.cfg.sequence_len / multiple
-            )
-        else:
-            # A100 is best at 64, while others at 8. Let's use the larger so we don't have to check
-            # https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html
-            data_collator_kwargs["pad_to_multiple_of"] = multiple
-
-        trainer_cls = self._get_trainer_cls()
-
-        trainer_kwargs, trainer_cls = self.hook_pre_create_trainer(
-            trainer_kwargs, trainer_cls
-        )
-        if eval_data_collator := self.build_collator(
-            training_args, is_eval=True, **data_collator_kwargs
-        ):
-            if not (self.cfg.reward_model or self.cfg.process_reward_model):
-                trainer_kwargs["eval_data_collator"] = eval_data_collator
-        if not (self.cfg.reward_model or self.cfg.process_reward_model):
-            trainer_kwargs["bench_data_collator"] = transformers.DataCollatorForSeq2Seq(
-                self.tokenizer,
-                return_tensors="pt",
-                **data_collator_kwargs,
-            )
-        sig = inspect.signature(trainer_cls)
-        if "processing_class" in sig.parameters:
-            trainer_kwargs["processing_class"] = self.tokenizer
-        elif "tokenizer" in sig.parameters:
-            trainer_kwargs["tokenizer"] = self.tokenizer
-        if (
-            not (trainer_cls in [AxolotlRewardTrainer, AxolotlPRMTrainer])
-            and self.cfg.datasets is not None
-        ):
-            trainer_kwargs["dataset_tags"] = [
-                d["path"] for d in self.cfg.datasets if not Path(d["path"]).is_dir()
-            ]
-        trainer = trainer_cls(
-            model=self.model,
-            train_dataset=self.train_dataset,
-            eval_dataset=self.eval_dataset,
-            args=training_args,
-            data_collator=self.build_collator(training_args, **data_collator_kwargs),
-            callbacks=self.get_callbacks(),
-            **trainer_kwargs,
-        )
-        trainer = self.hook_post_create_trainer(trainer)
-        for callback in self.get_post_trainer_create_callbacks(trainer):
-            trainer.add_callback(callback)
-
-        if self.cfg.deepspeed and self.cfg.sample_packing:
-            trainer.accelerator.state.deepspeed_plugin.deepspeed_config[
-                "train_micro_batch_size_per_gpu"
-            ] = self.cfg.micro_batch_size
-
-        return trainer
-
-    def build_collator(
-        self, training_args: AxolotlTrainingArguments, is_eval=False, **kwargs
-    ):
-        if training_args.pretraining:
-            if (
-                self.cfg.pretraining_sample_concatenation is False
-                or self.cfg.micro_batch_size > 1
-            ):
-                return DataCollatorForSeq2Seq(self.tokenizer, **kwargs)
-            return None
-
-        if self.cfg.model_config_type == "mamba":
-            return MambaDataCollator(tokenizer=self.tokenizer)
-
-        use_batch_sampler_collator = False
-        if is_eval is False and training_args.sample_packing:
-            use_batch_sampler_collator = True
-        if is_eval and training_args.eval_sample_packing:
-            use_batch_sampler_collator = True
-
-        collator: Type[
-            Union[
-                V2BatchSamplerDataCollatorForSeq2Seq,
-                BatchSamplerDataCollatorForSeq2Seq,
-                DataCollatorForSeq2Seq,
-                DataCollatorWithFlattening,
-                RewardDataCollatorWithPadding,
-            ]
-        ]
-        collator_args = [self.tokenizer]
-        if self.cfg.reward_model:
-            collator = RewardDataCollatorWithPadding
-        elif use_batch_sampler_collator:
-            # Use V2BatchSamplerDataCollatorForSeq2Seq for flex attention,
-            # supported multipack models, or non-flash-attention llama
-            if (
-                self.cfg.flex_attention
-                or self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES
-                or (
-                    self.cfg.model_config_type in ["llama"]
-                    and self.cfg.flash_attention is not True
-                )
-            ):
-                collator = V2BatchSamplerDataCollatorForSeq2Seq
-            else:
-                collator = BatchSamplerDataCollatorForSeq2Seq
-        else:
-            if self.cfg.processor_type and self.processor:
-                collator = MultiModalChatDataCollator
-                kwargs["processing_strategy"] = get_processing_strategy(
-                    self.processor,
-                    training_args.chat_template,
-                    self.cfg.chat_template,
-                    image_size=training_args.image_size,
-                    image_resize_algorithm=training_args.image_resize_algorithm,
-                )
-            elif self.cfg.batch_flattening:
-                collator = DataCollatorWithFlattening
-                collator_args.pop(0)
-                kwargs.pop("pad_to_multiple_of", None)
-                kwargs.pop("padding", None)
-            elif self.cfg.kd_trainer:
-                from axolotl.integrations.kd.collator import (
-                    DataCollatorForKD,
-                    KDBatchSamplerDataCollatorForSeq2Seq,
-                )
-
-                if self.cfg.sample_packing:
-                    collator = KDBatchSamplerDataCollatorForSeq2Seq
-                else:
-                    collator = DataCollatorForKD
-            else:
-                collator = DataCollatorForSeq2Seq
-
-        kwargs["return_tensors"] = "pt"
-
-        return collator(
-            *collator_args,
-            **kwargs,
-        )
--- a/src/axolotl/core/builders/rl.py
+++ b/src/axolotl/core/builders/rl.py
@@ -1,246 +0,0 @@
-"""Builder for RLHF trainers"""
-
-import inspect
-from pathlib import Path
-
-from axolotl.core.builders.base import TrainerBuilderBase
-from axolotl.core.trainers import (
-    AxolotlCPOTrainer,
-    AxolotlKTOTrainer,
-    AxolotlORPOTrainer,
-)
-from axolotl.core.trainers.dpo import DPOStrategy
-from axolotl.core.trainers.dpo.args import AxolotlDPOConfig
-from axolotl.core.trainers.grpo import GRPOStrategy
-from axolotl.core.training_args import (
-    AxolotlCPOConfig,
-    AxolotlKTOConfig,
-    AxolotlORPOConfig,
-)
-from axolotl.integrations.base import PluginManager
-from axolotl.loaders.utils import ensure_dtype
-from axolotl.utils.logging import get_logger
-from axolotl.utils.schemas.enums import RLType
-
-LOG = get_logger(__name__)
-
-
-class HFRLTrainerBuilder(TrainerBuilderBase):
-    """Trainer factory class for TRL-based RLHF trainers (e.g. DPO)"""
-
-    def get_callbacks(self):
-        callbacks = super().get_callbacks()
-
-        return callbacks
-
-    def get_post_trainer_create_callbacks(self, trainer):
-        callbacks = super().get_post_trainer_create_callbacks(trainer=trainer)
-        return callbacks
-
-    def _get_trainer_cls(self, trainer_kwargs: dict):
-        """
-        Returns trainer_cls and trainer_cls_args
-        """
-        if self.cfg.plugins:
-            plugin_manager = PluginManager.get_instance()
-            trainer_cls = plugin_manager.get_trainer_cls(self.cfg)
-            trainer_cls_args = []  # type: ignore
-
-            if trainer_cls is not None:
-                return trainer_cls, trainer_cls_args
-
-        trainer_cls = None
-        trainer_cls_args = [self.model]
-
-        if self.cfg.rl is RLType.GRPO:
-            trainer_cls = GRPOStrategy.get_trainer_class(
-                context_parallel=self.cfg.context_parallel_degree > 1
-            )
-            trainer_cls_args.extend(GRPOStrategy.set_trainer_args(self.cfg))
-
-            trainer_kwargs.update(GRPOStrategy.set_trainer_kwargs(self.cfg))
-
-        elif self.cfg.rl in [RLType.DPO, RLType.IPO]:
-            trainer_cls = DPOStrategy.get_trainer_class()
-            trainer_cls_args.append(self.model_ref)
-
-        elif self.cfg.rl is RLType.ORPO:
-            trainer_cls = AxolotlORPOTrainer
-        elif self.cfg.rl is RLType.KTO:
-            trainer_cls = AxolotlKTOTrainer
-        elif self.cfg.rl is RLType.SIMPO:
-            trainer_cls = AxolotlCPOTrainer
-        else:
-            raise ValueError(f"Unsupported RL: {self.cfg.rl}")
-
-        return trainer_cls, trainer_cls_args
-
-    def _build_training_arguments(self, total_num_steps):
-        """
-        Returns training_args and trainer_kwargs
-        """
-        training_args_kwargs, trainer_kwargs = self._set_base_training_args(
-            total_num_steps=total_num_steps
-        )
-
-        if self.cfg.remove_unused_columns is not None:
-            training_args_kwargs["remove_unused_columns"] = (
-                self.cfg.remove_unused_columns
-            )
-        else:
-            training_args_kwargs["remove_unused_columns"] = False
-
-        # only rlhf
-        if self.cfg.dataset_processes:
-            training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
-
-        if self.cfg.trl and self.cfg.trl.beta is not None:
-            training_args_kwargs["beta"] = self.cfg.trl.beta
-        elif self.cfg.rl_beta is not None:
-            training_args_kwargs["beta"] = self.cfg.rl_beta
-        elif self.cfg.orpo_alpha is not None:
-            # trl does some odd mapping of alpha to beta to reuse the beta parameter ???
-            training_args_kwargs["beta"] = self.cfg.orpo_alpha
-
-        if self.cfg.rpo_alpha is not None:
-            training_args_kwargs["rpo_alpha"] = self.cfg.rpo_alpha
-
-        if self.cfg.use_wandb:
-            training_args_kwargs["run_name"] = self.cfg.wandb_name
-
-        training_args_cls = None
-        blocklist_args_kwargs = []
-        if self.cfg.rl is RLType.SIMPO:
-            training_args_cls = AxolotlCPOConfig
-            training_args_kwargs["loss_type"] = "simpo"
-            training_args_kwargs["simpo_gamma"] = self.cfg.simpo_gamma
-            if self.cfg.cpo_alpha is not None:
-                training_args_kwargs["cpo_alpha"] = self.cfg.cpo_alpha
-
-        elif self.cfg.rl is RLType.ORPO:
-            training_args_cls = AxolotlORPOConfig
-            if self.cfg.max_prompt_len:
-                training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len
-
-        elif self.cfg.rl is RLType.KTO:
-            training_args_cls = AxolotlKTOConfig
-
-            training_args_kwargs["desirable_weight"] = (
-                self.cfg.kto_desirable_weight or 1.0
-            )
-            training_args_kwargs["undesirable_weight"] = (
-                self.cfg.kto_undesirable_weight or 1.0
-            )
-
-            if self.cfg.max_prompt_len:
-                training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len
-
-        elif self.cfg.rl is RLType.GRPO:
-            training_args_cls = GRPOStrategy.get_training_args_class()
-            training_args_kwargs.update(GRPOStrategy.set_training_args_kwargs(self.cfg))
-            blocklist_args_kwargs = GRPOStrategy.get_blocklist_args_kwargs()
-
-        elif self.cfg.rl in [RLType.DPO, RLType.IPO]:
-            training_args_cls = AxolotlDPOConfig
-            if self.cfg.rl is RLType.IPO:
-                training_args_kwargs["loss_type"] = "ipo"
-
-            # Not compatible with IPO
-            if self.cfg.rl is RLType.DPO and self.cfg.dpo_label_smoothing:
-                training_args_kwargs["label_smoothing"] = self.cfg.dpo_label_smoothing
-
-            training_args_kwargs["max_completion_length"] = None
-            training_args_kwargs["max_prompt_length"] = self.cfg.sequence_len
-            training_args_kwargs["generate_during_eval"] = self.cfg.use_wandb
-            if self.cfg.dpo_use_weighting is not None:
-                training_args_kwargs["use_weighting"] = self.cfg.dpo_use_weighting
-            if self.cfg.dpo_use_logits_to_keep is not None:
-                training_args_kwargs["use_logits_to_keep"] = (
-                    self.cfg.dpo_use_logits_to_keep
-                )
-        else:
-            raise ValueError(f"Unsupported RL: {self.cfg.rl}")
-
-        for blocklist_key in blocklist_args_kwargs:
-            if blocklist_key in training_args_kwargs:
-                del training_args_kwargs[blocklist_key]
-
-        training_args = training_args_cls(  # pylint: disable=unexpected-keyword-arg
-            logging_first_step=True,
-            **training_args_kwargs,
-        )
-
-        # unset run_name so wandb sets up experiment names
-        if self.cfg.use_wandb and training_args.run_name == training_args.output_dir:
-            training_args.run_name = (  # pylint: disable=attribute-defined-outside-init
-                None
-            )
-
-        return training_args, trainer_kwargs
-
-    def build(self, total_num_steps):
-        training_args, trainer_kwargs = self._build_training_arguments(total_num_steps)
-
-        if self.eval_dataset:
-            trainer_kwargs["eval_dataset"] = self.eval_dataset
-        if self.cfg.adapter and self.peft_config and self.cfg.rl is not RLType.GRPO:
-            trainer_kwargs["peft_config"] = self.peft_config
-        if self.cfg.precompute_ref_log_probs is not None:
-            trainer_kwargs["precompute_ref_log_probs"] = (
-                self.cfg.precompute_ref_log_probs
-            )
-
-        trainer_cls, trainer_cls_args = self._get_trainer_cls(trainer_kwargs)
-
-        sig = inspect.signature(trainer_cls)
-        if "tokenizer" in sig.parameters:
-            trainer_kwargs["tokenizer"] = self.tokenizer
-        else:
-            trainer_kwargs["processing_class"] = self.tokenizer
-
-        if self.cfg.datasets is not None and (
-            trainer_cls is DPOStrategy.get_trainer_class()
-        ):
-            trainer_kwargs["dataset_tags"] = [
-                d["path"] for d in self.cfg.datasets if not Path(d["path"]).is_dir()
-            ]
-
-        trainer_kwargs, trainer_cls = self.hook_pre_create_trainer(
-            trainer_kwargs, trainer_cls
-        )
-
-        trainer = trainer_cls(
-            *trainer_cls_args,
-            args=training_args,
-            train_dataset=self.train_dataset,
-            callbacks=self.get_callbacks(),
-            **trainer_kwargs,
-        )
-        if self.cfg.fsdp:
-            ensure_dtype(trainer.model, dtype=self.cfg.torch_dtype)
-            if self.cfg.rl in [RLType.DPO, RLType.IPO] and trainer.ref_model:
-                ensure_dtype(trainer.ref_model, dtype=self.cfg.torch_dtype)
-
-        trainer = self.hook_post_create_trainer(trainer)
-        for callback in self.get_post_trainer_create_callbacks(trainer):
-            trainer.add_callback(callback)
-
-        return trainer
-
-
-class HFPPOTrainerBuilder(TrainerBuilderBase):
-    """
-    HF Factory class for PPO Trainer
-    """
-
-    def get_callbacks(self):
-        callbacks = super().get_callbacks()
-        return callbacks
-
-    def get_post_trainer_create_callbacks(self, trainer):
-        callbacks = super().get_post_trainer_create_callbacks(trainer=trainer)
-        return callbacks
-
-    def build(self, total_num_steps):
-        # TODO: build PPOConfig
-        raise NotImplementedError("PPO trainer builder is not implemented yet.")
--- a/src/axolotl/core/chat/messages.py
+++ b/src/axolotl/core/chat/messages.py
@@ -156,6 +156,7 @@ class Messages(BaseModel):
                        len(input_ids) : len(input_ids) + len(pending_input_ids)
                    ]
                    if new_pending_inputs != pending_input_ids:
+                        # logging.warning("tokenization mismatch from concatenation.")
                        pending_input_ids = new_pending_inputs
                    input_ids.extend(pending_input_ids)
                    if pending_weight:
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
--- a/src/axolotl/core/trainers/init.py
+++ b/src/axolotl/core/trainers/init.py
@@ -5,7 +5,7 @@

 from .base import AxolotlTrainer
 from .dpo.trainer import AxolotlDPOTrainer
-from .grpo.trainer import AxolotlGRPOContextParallelTrainer, AxolotlGRPOTrainer
+from .grpo.trainer import AxolotlGRPOSequenceParallelTrainer, AxolotlGRPOTrainer
 from .mamba import AxolotlMambaTrainer
 from .relora import ReLoRATrainer
 from .trl import (
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -4,16 +4,15 @@

 from __future__ import annotations

+import logging
 import os
 from collections import defaultdict
-from functools import partial, wraps
-from typing import Any, Callable, Literal, Optional
+from functools import wraps
+from typing import Literal

-from axolotl.utils.ctx_managers.context_parallel.distributed import get_context_parallel_manager
 import datasets
 import torch
 from datasets import Dataset
-from torch import nn
 from torch.utils.data import (
    BatchSampler,
    DataLoader,
@@ -30,18 +29,20 @@ from axolotl.core.trainers.mixins import (
    OptimizerMixin,
    RngLoaderMixin,
    SchedulerMixin,
+    SequenceParallelMixin,
 )
 from axolotl.core.trainers.utils import (
    sanitize_kwargs_for_ds_tagging,
    sanitize_kwargs_for_tagging,
 )
-from axolotl.utils.logging import get_logger
 from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths

-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)


-class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
+class AxolotlTrainer(
+    SchedulerMixin, OptimizerMixin, RngLoaderMixin, SequenceParallelMixin, Trainer
+):
    """Extend the base Trainer for axolotl helpers"""

    args = None  # type: "AxolotlTrainingArguments"  # type: ignore[name-defined]
@@ -67,31 +68,9 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
        if self.args.orpo_alpha:
            self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none")

-        # SPDA device mesh init
-        import torch.distributed as dist
-
-        world_size = dist.get_world_size()
-        mesh_shape = (
-            world_size // 2,
-            2,
-        )
-        self.world_mesh = dist.DeviceMesh(
-            "cuda",
-            torch.tensor(list(range(world_size))).reshape(mesh_shape),
-            mesh_dim_names=("dp", "cp"),
-        )
-
-    def training_step(
-        self, model: nn.Module, inputs: dict[str, torch.Tensor | Any], num_items_in_batch=None
-    ) -> torch.Tensor:
-        ctx_manager = get_context_parallel_manager(
-            world_mesh=self.world_mesh,
-            model=model,
-        )
-        to_shard = {k: v for k, v in inputs.items() if v.ndim > 1}
-        with ctx_manager(list(to_shard.values())):
-            super().training_step(model, inputs, num_items_in_batch)
-        
+        # Initialize sequence parallelism if enabled
+        if self.args.sequence_parallel_degree > 1:
+            self._setup_sequence_parallel()

    def _wrap_model(self, model, training=True, dataloader=None):
        if self.args.torch_compile:
@@ -141,12 +120,10 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
            drop_last=True,
        )

-    def _get_train_sampler(
-        self, train_dataset: Optional[Dataset] = None
-    ) -> Optional[Sampler]:
+    def _get_train_sampler(self) -> Sampler | None:
        """
-        Helper method to get the sampler for training. Handles cases for sample packing
-        and curriculum sampling (sequential).
+        Helper method to get the sampler for training. Handles cases for sequence
+        parallelism, sample packing, and curriculum sampling (sequential).

        Returns:
            If the dataset is non-empty, a sampler is returned, the type of which
@@ -155,7 +132,9 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
        use_sample_packing = self.args.sample_packing and not self.args.pretraining

        # Determine the base sampler first
-        if self.args.curriculum_sampling:
+        if self.args.sequence_parallel_degree > 1:
+            base_sampler = self._sp_get_train_sampler(self.train_dataset)
+        elif self.args.curriculum_sampling:
            base_sampler = SequentialSampler(self.train_dataset)
        elif use_sample_packing:
            base_sampler = RandomSampler(self.train_dataset)
@@ -167,26 +146,31 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
        if use_sample_packing:
            return self._create_multipack_sampler(
                base_sampler=base_sampler,
-                dataset=train_dataset,
+                dataset=self.train_dataset,
            )

        return base_sampler

    def _get_eval_sampler(self, eval_dataset: Dataset | None = None) -> Sampler | None:
        """
-        Helper method to get the sampler for evaluation. Handles sample packing case.
+        Helper method to get the sampler for evaluation. Handles sequence parallelism
+        and sample packing cases.

        Returns:
            If the dataset is non-empty, a sampler is returned, the type of which
                depends on the passed training args.
        """
+        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
+
        # Multipacking enabled if training is enabled and eval is not explicitly disabled
        use_multipack = (
            self.args.sample_packing and self.args.eval_sample_packing is not False
        )

        # Determine the base sampler
-        if use_multipack:
+        if self.args.sequence_parallel_degree > 1:
+            base_sampler = self._sp_get_eval_sampler(eval_dataset)
+        elif use_multipack:
            base_sampler = SequentialSampler(eval_dataset)
        else:
            return super()._get_eval_sampler(eval_dataset)
@@ -200,91 +184,149 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):

        return base_sampler

-    def _get_dataloader(
-        self,
-        dataset: Dataset,
-        description: str,
-        batch_size: int,
-        sampler_fn: Optional[Callable[[Dataset], torch.utils.data.Sampler]] = None,
-        is_training: bool = False,
-        dataloader_key: Optional[str] = None,
-    ) -> DataLoader:
-        """Create a [`~torch.utils.data.DataLoader`] from the given dataset."""
+    def _create_dataloader_params(self, is_eval=False, custom_batch_size=None):
+        """Create common dataloader parameters for train or eval."""
+        batch_size = custom_batch_size or (
+            self.args.eval_batch_size if is_eval else self._train_batch_size
+        )

-        data_collator = self.data_collator if is_training else self.eval_data_collator
-
-        if dataset.column_names and "length" in dataset.column_names:
-            dataset = dataset.remove_columns(["length"])
-
-        if isinstance(dataset, datasets.Dataset):
-            if is_training:
-                if not self.args.sample_packing or self.args.pretraining:
-                    dataset = self._remove_unused_columns(
-                        dataset, description="training"
-                    )
-            elif (
-                not is_training
-                and self.args.sample_packing
-                and self.args.eval_sample_packing is not False
-            ):
-                batch_size = (
-                    batch_size
-                    if self.args.sample_packing
-                    else self.args.per_device_eval_batch_size
-                )
-            else:
-                dataset = self._remove_unused_columns(dataset, description=description)
-        else:
-            data_collator = self._get_collator_with_removed_columns(
-                self.data_collator, description=description
-            )
-
-        dataloader_params = {
+        params = {
            "batch_size": batch_size,
-            "collate_fn": data_collator,
+            "collate_fn": self.data_collator,
            "num_workers": self.args.dataloader_num_workers,
            "pin_memory": self.args.dataloader_pin_memory,
-            "persistent_workers": self.args.dataloader_persistent_workers,
        }

-        if not isinstance(dataset, torch.utils.data.IterableDataset):
-            dataloader_params["drop_last"] = self.args.dataloader_drop_last
-            if sampler_fn is not None:
-                sampler = sampler_fn(dataset)
-                if isinstance(sampler, BatchSampler):
-                    # batch_size and batch_sampler are mutually exclusive
-                    dataloader_params["batch_sampler"] = sampler
-                    del dataloader_params["batch_size"]
-                    del dataloader_params["drop_last"]
-                else:
-                    dataloader_params["sampler"] = sampler
+        # Add persistent workers only for training
+        if not is_eval and hasattr(self.args, "dataloader_persistent_workers"):
+            params["persistent_workers"] = self.args.dataloader_persistent_workers
+
+        # Add prefetch factor if specified
+        if self.args.dataloader_prefetch_factor:
+            params["prefetch_factor"] = self.args.dataloader_prefetch_factor
+
+        return params
+
+    def _prepare_dataloader(
+        self, dataset, sampler, is_eval=False, custom_batch_size=None
+    ):
+        """Prepare a dataloader with the given dataset and sampler."""
+        # Get base parameters
+        dataloader_params = self._create_dataloader_params(is_eval, custom_batch_size)
+
+        # Add sampler configuration
+        if not isinstance(dataset, torch.utils.data.IterableDataset):
+            if isinstance(sampler, BatchSampler):
+                # batch_size and batch_sampler are mutually exclusive
+                dataloader_params["batch_sampler"] = sampler
+                del dataloader_params["batch_size"]
+            else:
+                dataloader_params["sampler"] = sampler
+                dataloader_params["drop_last"] = self.args.dataloader_drop_last
+
+            if not is_eval:
+                dataloader_params["worker_init_fn"] = seed_worker
+
+        # Create the dataloader
+        dataloader = DataLoader(dataset, **dataloader_params)

-            dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor
-            if is_training:
-                dataloader_params["worker_init_fn"] = partial(
-                    seed_worker,
-                    num_workers=self.args.dataloader_num_workers,
-                    rank=self.args.process_index,
-                )
        if self.args.sample_packing and (
-            (is_training and not self.args.pretraining)
-            or (not is_training and self.args.eval_sample_packing is not False)
+            (not is_eval and not self.args.pretraining)
+            or (is_eval and self.args.eval_sample_packing is not False)
        ):
            self.accelerator.even_batches = False

-        dataloader = DataLoader(dataset, **dataloader_params)
+        # Return unprepared dataloader if using sequence parallelism
+        # TODO(djsaunde): We might be able to use `accelerate`'s dataloader preparation
+        # if we use `dispatch_batches` and `slice_fn_for_dispatch` properly (i.e.,
+        # slice each batch along the sequence dimension).
+        if self.args.sequence_parallel_degree > 1:
+            return dataloader

-        # Accelerator.free_memory() will destroy the references, so
-        # we need to store the non-prepared version for eval dataloaders.
-        # fmt: off
-        if dataloader_key is not None and self.args.dataloader_persistent_workers:
-            if hasattr(self, "_eval_dataloaders"):
-                self._eval_dataloaders[dataloader_key] = dataloader  # type: ignore  # pylint: disable=access-member-before-definition
-            else:
-                self._eval_dataloaders = {dataloader_key: dataloader}  # pylint: disable=attribute-defined-outside-init
-        # fmt: on
+        # Otherwise prepare with accelerator
+        return self.accelerator.prepare_data_loader(dataloader)

-        return self.accelerator.prepare(dataloader)
+    def get_train_dataloader(self) -> DataLoader:
+        """Build the training DataLoader; `length` is dropped only if present."""
+        train_dataset = self.train_dataset
+        packing = self.args.sample_packing and not self.args.pretraining
+
+        # Preprocess; guard `length` like get_eval_dataloader (missing col raises)
+        if isinstance(train_dataset, datasets.Dataset):
+            if packing and "length" in train_dataset.column_names:
+                train_dataset = train_dataset.remove_columns(["length"])
+            if not packing:
+                train_dataset = self._remove_unused_columns(
+                    train_dataset, description="training"
+                )
+        else:
+            self.data_collator = self._get_collator_with_removed_columns(  # pylint: disable=attribute-defined-outside-init
+                self.data_collator,  # type: ignore
+                description="training",
+            )
+
+        # Get sampler and create dataloader
+        sampler = self._get_train_sampler()
+        return self._prepare_dataloader(train_dataset, sampler, is_eval=False)
+
+    def get_eval_dataloader(self, eval_dataset: Dataset | None = None) -> DataLoader:
+        """Build the eval DataLoader; `self.data_collator` is put back afterwards."""
+        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
+
+        # Handle special case: sample packing is enabled but eval_sample_packing is False
+        if self.args.sample_packing and self.args.eval_sample_packing is False:
+            self.data_collator = (  # pylint: disable=attribute-defined-outside-init
+                self.eval_data_collator
+            )
+            if "length" in eval_dataset.column_names:
+                eval_dataset = eval_dataset.remove_columns(["length"])
+            dataloader = super().get_eval_dataloader(eval_dataset)
+            self.data_collator = (  # pylint: disable=attribute-defined-outside-init
+                self.train_data_collator
+            )
+
+            return dataloader
+
+        # Handle sample packing or sequence parallelism
+        sp_enabled = self.args.sequence_parallel_degree > 1
+        if sp_enabled or (
+            self.args.sample_packing and self.args.eval_sample_packing is not False
+        ):
+            # Swap is only needed while the loader is built; remember what to restore
+            previous_collator = self.data_collator
+            self.data_collator = (  # pylint: disable=attribute-defined-outside-init
+                self.eval_data_collator
+                if hasattr(self, "eval_data_collator") and self.eval_data_collator
+                else self.data_collator
+            )
+            if "length" in eval_dataset.column_names:
+                eval_dataset = eval_dataset.remove_columns(["length"])
+
+            # Handle dataset preprocessing for SP
+            if sp_enabled:
+                if isinstance(eval_dataset, datasets.Dataset):
+                    eval_dataset = self._remove_unused_columns(
+                        eval_dataset, description="evaluation"
+                    )
+                else:
+                    self.data_collator = self._get_collator_with_removed_columns(  # pylint: disable=attribute-defined-outside-init
+                        self.data_collator, description="evaluation"
+                    )
+
+            # Use eval_batch_size for sample packing, per_device_eval_batch_size otherwise
+            batch_size = (
+                self.args.eval_batch_size
+                if self.args.sample_packing
+                else self.args.per_device_eval_batch_size
+            )
+            sampler = self._get_eval_sampler(eval_dataset)
+            dataloader = self._prepare_dataloader(
+                eval_dataset, sampler, is_eval=True, custom_batch_size=batch_size
+            )
+            self.data_collator = previous_collator  # pylint: disable=attribute-defined-outside-init
+            return dataloader
+
+        return super().get_eval_dataloader(eval_dataset)

    def _get_bench_sampler(
        self, bench_dataset: Dataset
--- a/src/axolotl/core/trainers/dpo/trainer.py
+++ b/src/axolotl/core/trainers/dpo/trainer.py
@@ -1,41 +1,92 @@
-"""DPO trainer for axolotl"""
+"""
+DPO trainer for axolotl
+"""

 import gc
+import random
 from functools import wraps
-from typing import Any, Dict, Union
+from typing import Any, Dict, Optional, Union

+import pandas as pd
 import torch
+import wandb
+from accelerate import PartialState
+from datasets import Dataset, IterableDataset
+from peft.optimizers import create_loraplus_optimizer
 from torch import nn
-from trl import DPOTrainer
+from torch.utils.data import DataLoader
+from transformers import (
+    BaseImageProcessor,
+    FeatureExtractionMixin,
+    PreTrainedTokenizerBase,
+    ProcessorMixin,
+    Trainer,
+)
+from transformers.trainer_utils import EvalLoopOutput
+from transformers.utils import is_sagemaker_mp_enabled
+from trl import DPOConfig, DPOTrainer, maybe_apply_chat_template, maybe_extract_prompt
+from trl.trainer.utils import log_table_to_comet_experiment

 from axolotl.core.trainers.mixins import RngLoaderMixin, SchedulerMixin
-from axolotl.core.trainers.mixins.optimizer import OptimizerInitMixin, OptimizerMixin
 from axolotl.core.trainers.utils import (
    sanitize_kwargs_for_ds_tagging,
    sanitize_kwargs_for_tagging,
 )

+if is_sagemaker_mp_enabled():
+    import smdistributed.modelparallel.torch as smp

-class AxolotlDPOTrainer(
-    RngLoaderMixin, SchedulerMixin, OptimizerMixin, OptimizerInitMixin, DPOTrainer
-):
-    """Extend the base DPOTrainer for axolotl helpers."""
+
+class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):
+    """
+    Extend the base DPOTrainer for axolotl helpers
+    """

    tag_names = ["axolotl", "dpo"]

    def __init__(self, *args, dataset_tags=None, **kwargs):
        super().__init__(*args, **kwargs)
-
        self.dataset_tags = dataset_tags
        self.optimizer = None
        self.model_accepts_loss_kwargs = False

+    def create_optimizer(self):
+        # pylint: disable=duplicate-code
+        if self.args.loraplus_lr_ratio is None:
+            return super().create_optimizer()
+
+        opt_model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model
+        if self.optimizer is None:  # pylint: disable=access-member-before-definition
+            optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(
+                self.args,
+                opt_model,
+            )
+
+            loraplus_lr_ratio = getattr(self.args, "loraplus_lr_ratio", None)
+            if loraplus_lr_ratio:
+                print("Using lora+")
+            loraplus_lr_embedding = getattr(self.args, "loraplus_lr_embedding", None)
+            # pylint: disable=duplicate-code
+            self.optimizer = create_loraplus_optimizer(  # pylint: disable=attribute-defined-outside-init
+                opt_model,
+                optimizer_cls,
+                loraplus_lr_ratio=loraplus_lr_ratio,
+                loraplus_lr_embedding=loraplus_lr_embedding,
+                **optimizer_kwargs,
+            )
+
+        if is_sagemaker_mp_enabled():
+            self.optimizer = smp.DistributedOptimizer(  # pylint: disable=attribute-defined-outside-init
+                self.optimizer
+            )
+
+        return self.optimizer
+
    @wraps(DPOTrainer.push_to_hub)
    def push_to_hub(self, *args, **kwargs) -> str:
        """
-        Overwrite the `push_to_hub` method in order to force-add the tags when pushing
-        the model on the Hub. Please refer to `~transformers.Trainer.push_to_hub`
-        for more details.
+        Overwrite the `push_to_hub` method in order to force-add the tags when pushing the
+        model on the Hub. Please refer to `~transformers.Trainer.push_to_hub` for more details.
        """
        kwargs = sanitize_kwargs_for_ds_tagging(
            dataset_tags=self.dataset_tags, kwargs=kwargs
@@ -44,6 +95,64 @@ class AxolotlDPOTrainer(

        return super().push_to_hub(*args, **kwargs)

+    # TODO: remove this once https://github.com/huggingface/trl/pull/3377 is in a release
+    def _prepare_dataset(
+        self,
+        dataset: Union[Dataset, IterableDataset],
+        processing_class: Union[
+            PreTrainedTokenizerBase,
+            BaseImageProcessor,
+            FeatureExtractionMixin,
+            ProcessorMixin,
+        ],
+        args: DPOConfig,
+        dataset_name: str,
+    ) -> Union[Dataset, IterableDataset]:
+        # Build the kwargs for the `map` function
+        map_kwargs: Dict[str, Any] = {"writer_batch_size": 10}
+        if isinstance(dataset, Dataset):  # IterableDataset does not support num_proc
+            map_kwargs["num_proc"] = args.dataset_num_proc
+
+        with PartialState().main_process_first():
+            # Extract prompt if needed
+            if isinstance(
+                dataset, Dataset
+            ):  # `IterableDataset.map` does not support `desc`
+                map_kwargs["desc"] = f"Extracting prompt in {dataset_name} dataset"
+            dataset = dataset.map(maybe_extract_prompt, **map_kwargs)
+
+            # Apply the chat template if needed
+            if isinstance(
+                dataset, Dataset
+            ):  # `IterableDataset.map` does not support `desc`
+                map_kwargs["desc"] = f"Applying chat template to {dataset_name} dataset"
+            dataset = dataset.map(
+                maybe_apply_chat_template,
+                fn_kwargs={"tokenizer": processing_class, "tools": args.tools},
+                **map_kwargs,
+            )
+
+            # Tokenize the dataset
+            if isinstance(
+                dataset, Dataset
+            ):  # `IterableDataset.map` does not support `desc`
+                map_kwargs["desc"] = f"Tokenizing {dataset_name} dataset"
+
+            dataset = dataset.map(
+                self.tokenize_row if not self.is_vision_model else self.process_row,
+                remove_columns=["chosen", "rejected"],
+                fn_kwargs={
+                    "processing_class": processing_class,
+                    "max_prompt_length": args.max_prompt_length,
+                    "max_completion_length": args.max_completion_length,
+                    # for enc-dec, we add the special tokens ([bos_token] + prompt + [eos_token]; completion + [eos_token])
+                    "add_special_tokens": False,
+                },
+                **map_kwargs,
+            )
+
+        return dataset
+
    @staticmethod
    def tokenize_row(
        features,
@@ -83,3 +192,69 @@ class AxolotlDPOTrainer(
        gc.collect()
        torch.cuda.empty_cache()
        return loss
+
+    # TODO: remove this once https://github.com/huggingface/trl/pull/3377 is in a release
+    def evaluation_loop(
+        self,
+        dataloader: DataLoader,
+        description: str,
+        prediction_loss_only: Optional[bool] = None,
+        ignore_keys: Optional[list[str]] = None,
+        metric_key_prefix: str = "eval",
+    ) -> EvalLoopOutput:
+        """
+        Evaluation loop shared by `Trainer.evaluate()` and `Trainer.predict()`; works
+        with and without labels. If `generate_during_eval` is set, one random batch of
+        policy vs. reference generations is first logged as a "game log" table. All
+        arguments are then forwarded unchanged to the parent class's evaluation loop.
+        """
+
+        # Sample and save to game log if requested (for one batch to save time)
+        if self.generate_during_eval:
+            # Random indices; k is capped because `random.sample` rejects k > population
+            num_samples = len(dataloader.dataset)
+            random_indices = random.sample(
+                range(num_samples), k=min(num_samples, self.args.eval_batch_size)
+            )
+
+            # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader
+            random_batch_dataset = dataloader.dataset.select(random_indices)
+            random_batch = self.data_collator(random_batch_dataset)
+            random_batch = self._prepare_inputs(random_batch)
+
+            policy_output_decoded, ref_output_decoded = (
+                self.generate_from_model_and_ref(self.model, random_batch)
+            )
+
+            table = pd.DataFrame(
+                columns=["Prompt", "Policy", "Ref Model"],
+                data=[
+                    [prompt, pol[len(prompt) :], ref[len(prompt) :]]
+                    for prompt, pol, ref in zip(
+                        random_batch_dataset["prompt"],
+                        policy_output_decoded,
+                        ref_output_decoded,
+                    )
+                ],
+            )
+            if "wandb" in self.args.report_to and self.accelerator.is_main_process:
+                wandb.log({"game_log": wandb.Table(data=table)})
+
+            if "comet_ml" in self.args.report_to:
+                log_table_to_comet_experiment(
+                    name="game_log.csv",
+                    table=table,
+                )
+
+        # Base evaluation: `super(DPOTrainer, self)` skips DPOTrainer's own override
+        initial_output = super(  # pylint: disable=bad-super-call
+            DPOTrainer, self
+        ).evaluation_loop(
+            dataloader,
+            description,
+            prediction_loss_only,
+            ignore_keys,
+            metric_key_prefix,
+        )
+
+        return initial_output
--- a/src/axolotl/core/trainers/grpo/__init__.py
+++ b/src/axolotl/core/trainers/grpo/__init__.py
@@ -2,20 +2,20 @@

 import importlib
 import inspect
+import logging
 from typing import Any

 from trl.trainer.grpo_trainer import RewardFunc

 from axolotl.core.trainers.grpo.args import AxolotlGRPOConfig
 from axolotl.core.trainers.grpo.trainer import (
-    AxolotlGRPOContextParallelTrainer,
+    AxolotlGRPOSequenceParallelTrainer,
    AxolotlGRPOTrainer,
 )
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.logging import get_logger
 from axolotl.utils.schemas.trl import TRLConfig

-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)


 class GRPOStrategy:
@@ -23,10 +23,10 @@ class GRPOStrategy:

    @classmethod
    def get_trainer_class(
-        cls, context_parallel: bool
-    ) -> type[AxolotlGRPOTrainer] | type[AxolotlGRPOContextParallelTrainer]:
-        if context_parallel:
-            return AxolotlGRPOContextParallelTrainer
+        cls, sequence_parallel: bool
+    ) -> type[AxolotlGRPOTrainer] | type[AxolotlGRPOSequenceParallelTrainer]:
+        if sequence_parallel:
+            return AxolotlGRPOSequenceParallelTrainer
        return AxolotlGRPOTrainer

    @classmethod
@@ -69,9 +69,6 @@ class GRPOStrategy:
        grpo_args_kwargs["log_completions"] = trl.log_completions
        grpo_args_kwargs["num_completions_to_print"] = trl.num_completions_to_print

-        if cfg.context_parallel_degree > 1:
-            grpo_args_kwargs["context_parallel_degree"] = cfg.context_parallel_degree
-
        if trl.reward_weights:
            grpo_args_kwargs["reward_weights"] = trl.reward_weights

@@ -109,9 +106,7 @@ class GRPOStrategy:
        return grpo_args_kwargs

    @classmethod
-    def set_trainer_args(
-        cls, cfg: DictDefault
-    ) -> list[Any]:  # pylint: disable=unused-argument
+    def set_trainer_args(cls, cfg: DictDefault) -> list[Any]:
        trainer_args = []
        if cfg.trl and cfg.trl.reward_funcs:
            reward_funcs = []
@@ -128,7 +123,6 @@ class GRPOStrategy:
            trainer_kwargs["reward_processing_classes"] = (
                cfg.trl.reward_processing_classes
            )
-
        return trainer_kwargs

    @classmethod
@@ -138,7 +132,7 @@ class GRPOStrategy:

    @classmethod
    def get_blocklist_args_kwargs(cls) -> list[str]:
-        return ["dataset_num_proc", "max_length"]
+        return ["dataset_num_proc"]

    @classmethod
    def get_reward_func(cls, reward_func_fqn: str) -> RewardFunc:
@@ -173,4 +167,4 @@ class GRPOStrategy:
            LOG.info(
                f"Reward function {reward_func_fqn} is a pre-trained model path - if this is unexpected, please check the reward function path."
            )
-            return reward_func_fqn
+            return reward_func
--- a/src/axolotl/core/trainers/grpo/args.py
+++ b/src/axolotl/core/trainers/grpo/args.py
@@ -12,5 +12,3 @@ from axolotl.core.training_args import AxolotlTrainingMixins
@dataclass
 class AxolotlGRPOConfig(AxolotlTrainingMixins, GRPOConfig):
    """Axolotl GRPO Config for GRPO training"""
-
-    context_parallel_degree: int | None = None
--- a/src/axolotl/core/trainers/grpo/sampler.py
+++ b/src/axolotl/core/trainers/grpo/sampler.py
@@ -1,7 +1,7 @@
 """Repeat random sampler (similar to the one implemented in
 https://github.com/huggingface/trl/blob/main/trl/trainer/grpo_trainer.py) that adds
-context parallelism functionality; i.e., duplicating data across ranks in the same
-context parallel group.
+sequence parallelism functionality; i.e., duplicating data across ranks in the same
+sequence parallel group.
 """

 from typing import Iterator, Sized
@@ -10,26 +10,26 @@ import torch
 from torch.utils.data import Sampler


-class ContextParallelRepeatRandomSampler(Sampler):
-    """Sampler for GRPO training with context parallelism.
+class SequenceParallelRepeatRandomSampler(Sampler):
+    """Sampler for GRPO training with sequence parallelism.

    This sampler ensures:
-    - Ranks in the same context parallel (SP) group receive identical data.
+    - Ranks in the same sequence parallel (SP) group receive identical data.
    - Each index is repeated multiple times for sampling different completions.
    - Entire batches are repeated for reuse in multiple updates.
-    - Data is properly distributed across CP groups.
+    - Data is properly distributed across SP groups.

-    In the table below, the values represent dataset indices. Each CP group has
-    `context_parallel_degree = 2` GPUs working together on the same data. There are 2
-    CP groups (SP0 and SP1), with `world_size = 4` total GPUs.
+    In the table below, the values represent dataset indices. Each SP group has
+    `sequence_parallel_degree = 2` GPUs working together on the same data. There are 2
+    SP groups (SP0 and SP1), with `world_size = 4` total GPUs.

-                                               Context Parallel Groups
+                                               Sequence Parallel Groups
                                        |       SP0        |       SP1        |
                                        |  GPU 0  |  GPU 1 |  GPU 2  |  GPU 3 |
                    global_step  step    <---> mini_repeat_count=3
-                                            <----------> batch_size=2 per CP group
-    grad_accum=2   ▲  ▲  0       0         [0 0 0  1 1 1]     [2 2 2  3 3 3]   <- CP groups get different data
-                   ▼  |  0       1         [0 0 0  1 1 1]     [2 2 2  3 3 3]   <- Same data for each CP group GPU
+                                            <----------> batch_size=2 per SP group
+    grad_accum=2   ▲  ▲  0       0         [0 0 0  1 1 1]     [2 2 2  3 3 3]   <- SP groups get different data
+                   ▼  |  0       1         [0 0 0  1 1 1]     [2 2 2  3 3 3]   <- Same data for each SP group GPU
                      |
                      |  1       2         [0 0 0  1 1 1]     [2 2 2  3 3 3]   <- Repeat same indices for iterations
    num_iterations=2  ▼  1       3         [0 0 0  1 1 1]     [2 2 2  3 3 3]   <- When using gradient accumulation
@@ -45,7 +45,7 @@ class ContextParallelRepeatRandomSampler(Sampler):
        rank: Rank of current process.
        batch_size: Number of samples per batch.
        repeat_count: How many times to repeat the full sampling process.
-        context_parallel_degree: Number of ranks in a context parallel group.
+        sequence_parallel_degree: Number of ranks in a sequence parallel group.
        shuffle: Whether to shuffle the dataset.
        seed: Random seed for shuffling.
        drop_last: Whether to drop the last incomplete batch.
@@ -59,7 +59,7 @@ class ContextParallelRepeatRandomSampler(Sampler):
        rank: int,
        batch_size: int = 1,
        repeat_count: int = 1,
-        context_parallel_degree: int = 1,
+        sequence_parallel_degree: int = 1,
        shuffle: bool = True,
        seed: int = 0,
        drop_last: bool = False,
@@ -76,16 +76,16 @@ class ContextParallelRepeatRandomSampler(Sampler):
        self.world_size = world_size
        self.rank = rank

-        # Context parallelism parameters
-        self.context_parallel_degree = context_parallel_degree
-        self.num_sp_groups = world_size // context_parallel_degree
-        self.sp_group_id = rank // context_parallel_degree
+        # Sequence parallelism parameters
+        self.sequence_parallel_degree = sequence_parallel_degree
+        self.num_sp_groups = world_size // sequence_parallel_degree
+        self.sp_group_id = rank // sequence_parallel_degree

        # Adjust dataset size for distributed sampling
        self.num_samples = len(self.dataset)
        self.total_size = self.num_samples

-        # Calculate effective number of samples per CP group
+        # Calculate effective number of samples per SP group
        if (
            self.drop_last
            and self.total_size % (self.num_sp_groups * self.batch_size) != 0
@@ -125,8 +125,8 @@ class ContextParallelRepeatRandomSampler(Sampler):
            padding = indices[: self.batch_size - len(indices) % self.batch_size]
            indices += padding

-        # Subsample based on CP group ID
-        # Each CP group gets distinct batches of data
+        # Subsample based on SP group ID
+        # Each SP group gets distinct batches of data
        batch_indices = []
        for i in range(0, len(indices), self.batch_size * self.num_sp_groups):
            start_idx = i + self.sp_group_id * self.batch_size
--- a/src/axolotl/core/trainers/grpo/trainer.py
+++ b/src/axolotl/core/trainers/grpo/trainer.py
@@ -1,4 +1,4 @@
-"""Axolotl GRPO trainers (with and without context parallelism handling)"""
+"""Axolotl GRPO trainers (with and without sequence parallelism handling)"""

 # pylint: disable=too-many-lines,duplicate-code,protected-access,no-member

@@ -41,26 +41,23 @@ from trl.trainer.grpo_config import GRPOConfig
 from trl.trainer.grpo_trainer import RewardFunc, nanstd
 from trl.trainer.utils import pad

-from axolotl.core.trainers.grpo.sampler import ContextParallelRepeatRandomSampler
+from axolotl.core.trainers.grpo.sampler import SequenceParallelRepeatRandomSampler
 from axolotl.core.trainers.mixins import RngLoaderMixin, SchedulerMixin
-from axolotl.core.trainers.mixins.optimizer import OptimizerInitMixin, OptimizerMixin
-from axolotl.monkeypatch.ring_attn import get_ring_attn_group
+from axolotl.monkeypatch.attention.ring_attn.patch import get_ring_attn_group

 if is_peft_available():
    # pylint: disable=unused-import
    from peft import PeftConfig


-class AxolotlGRPOTrainer(
-    RngLoaderMixin, SchedulerMixin, OptimizerMixin, OptimizerInitMixin, GRPOTrainer
-):
+class AxolotlGRPOTrainer(RngLoaderMixin, SchedulerMixin, GRPOTrainer):
    """Extend the base GRPOTrainer for axolotl helpers"""

    _tag_names = ["trl", "grpo", "axolotl"]


-class AxolotlGRPOContextParallelTrainer(AxolotlGRPOTrainer):
-    """Extend the base GRPOTrainer for context parallelism handling"""
+class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
+    """Extend the base GRPOTrainer for sequence parallelism handling"""

    def __init__(
        self,
@@ -80,7 +77,6 @@ class AxolotlGRPOContextParallelTrainer(AxolotlGRPOTrainer):
            torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None
        ] = (None, None),
        peft_config: "PeftConfig | None" = None,
-        optimizer_cls_and_kwargs: tuple[type, dict] | None = None,
    ):
        # First call the superclass constructor with all arguments
        super().__init__(
@@ -94,14 +90,13 @@ class AxolotlGRPOContextParallelTrainer(AxolotlGRPOTrainer):
            callbacks=callbacks,
            optimizers=optimizers,
            peft_config=peft_config,
-            optimizer_cls_and_kwargs=optimizer_cls_and_kwargs,
        )

-        # Get number of CP groups (number of processes divided by CP degree)
+        # Get number of SP groups (number of processes divided by SP degree)
        num_processes = self.accelerator.num_processes
-        num_sp_groups = num_processes // self.args.context_parallel_degree
+        num_sp_groups = num_processes // self.args.sequence_parallel_degree

-        # Calculate batch size per CP group (not per process)
+        # Calculate batch size per SP group (not per process)
        sp_group_batch_size = self.args.per_device_train_batch_size * num_sp_groups
        possible_values = [
            n_gen
@@ -111,7 +106,7 @@ class AxolotlGRPOContextParallelTrainer(AxolotlGRPOTrainer):

        if self.num_generations not in possible_values:
            raise ValueError(
-                f"The batch size per CP group ({num_sp_groups} x "
+                f"The batch size per SP group ({num_sp_groups} x "
                f"{self.args.per_device_train_batch_size}) must be evenly divisible by "
                f"the number of generations per prompt ({self.num_generations}). Given "
                "the current configuration, the valid values for the number of "
@@ -119,7 +114,7 @@ class AxolotlGRPOContextParallelTrainer(AxolotlGRPOTrainer):
            )

        if self.args.eval_strategy != "no":
-            # If context parallelism is enabled, calculate batch size per CP group
+            # If sequence parallelism is enabled, calculate batch size per SP group
            sp_group_eval_batch_size = args.per_device_eval_batch_size * num_sp_groups  # type: ignore[union-attr]
            possible_values = [
                n_gen
@@ -129,29 +124,20 @@ class AxolotlGRPOContextParallelTrainer(AxolotlGRPOTrainer):

            if self.num_generations not in possible_values:
                raise ValueError(
-                    f"With context parallelism (degree {self.args.context_parallel_degree}), "
-                    f"the eval batch size per CP group ({num_sp_groups} x {self.args.per_device_eval_batch_size}) "
+                    f"With sequence parallelism (degree {self.args.sequence_parallel_degree}), "
+                    f"the eval batch size per SP group ({num_sp_groups} x {self.args.per_device_eval_batch_size}) "
                    f"must be evenly divisible by the number of generations per prompt "
                    f"({self.num_generations}). Given the current eval batch size, "
                    f"the valid values for the number of generations are: {possible_values}."
                )

-        self.sp_group = None
-        self.rank = dist.get_rank()
-        self.world_size = dist.get_world_size()
-        self.local_rank = 0
-        self.local_world_size = 1
-
-    def train(self, *args, **kwargs):
-        # Initialize the CP group
+        # Initialize the SP group
        self.sp_group = get_ring_attn_group()
        self.rank = dist.get_rank()
        self.world_size = dist.get_world_size()
        self.local_rank = dist.get_rank(group=self.sp_group)
        self.local_world_size = dist.get_world_size(group=self.sp_group)

-        return super().train(*args, **kwargs)
-
    def _get_train_sampler(self) -> Sampler:
        effective_batch_size = (
            self.args.per_device_train_batch_size
@@ -159,16 +145,16 @@ class AxolotlGRPOContextParallelTrainer(AxolotlGRPOTrainer):
            * self.args.gradient_accumulation_steps
        )

-        return ContextParallelRepeatRandomSampler(
+        return SequenceParallelRepeatRandomSampler(
            dataset=self.train_dataset,
            mini_repeat_count=self.num_generations,
            world_size=self.world_size,
            rank=self.rank,
            batch_size=effective_batch_size
            // self.num_generations
-            // self.args.context_parallel_degree,
+            // self.args.sequence_parallel_degree,
            repeat_count=self.num_iterations * self.args.gradient_accumulation_steps,
-            context_parallel_degree=self.args.context_parallel_degree,
+            sequence_parallel_degree=self.args.sequence_parallel_degree,
            shuffle=True,
            seed=self.args.seed,
            drop_last=True,
@@ -226,11 +212,11 @@ class AxolotlGRPOContextParallelTrainer(AxolotlGRPOTrainer):
        ):
            self.accelerator.even_batches = False

-        # Return unprepared dataloader if using context parallelism
+        # Return unprepared dataloader if using sequence parallelism
        # TODO(djsaunde): We might be able to use `accelerate`'s dataloader preparation
        # if we use `dispatch_batches` and `slice_fn_for_dispatch` properly (i.e.,
        # slice each batch along the sequence dimension).
-        if self.args.context_parallel_degree > 1:
+        if self.args.sequence_parallel_degree > 1:
            return dataloader

        # Otherwise prepare with accelerator
@@ -303,21 +289,21 @@ class AxolotlGRPOContextParallelTrainer(AxolotlGRPOTrainer):
            # Generate completions using vLLM: gather all prompts and use them in a single call in the main process
            all_prompts_text = gather_object(prompts_text)
            if self.accelerator.is_main_process:
-                if self.args.context_parallel_degree > 1:
-                    # Calculate context parallel group information
+                if self.args.sequence_parallel_degree > 1:
+                    # Calculate sequence parallel group information
                    world_size = self.accelerator.num_processes
-                    context_parallel_degree = self.args.context_parallel_degree
-                    num_sp_groups = world_size // context_parallel_degree
+                    sequence_parallel_degree = self.args.sequence_parallel_degree
+                    num_sp_groups = world_size // sequence_parallel_degree

-                    # Since processes in the same CP group have the same prompts, we need to ensure
-                    # we only take one copy of each prompt from each CP group
+                    # Since processes in the same SP group have the same prompts, we need to ensure
+                    # we only take one copy of each prompt from each SP group
                    ordered_set_of_prompts = []
                    for sp_group_id in range(num_sp_groups):
-                        # Get the first process from each CP group (typically the group leader)
-                        group_leader_rank = sp_group_id * context_parallel_degree
+                        # Get the first process from each SP group (typically the group leader)
+                        group_leader_rank = sp_group_id * sequence_parallel_degree

-                        # Extract prompts from this CP group, accounting for num_generations duplicates
-                        # We only need prompts from one rank in each CP group
+                        # Extract prompts from this SP group, accounting for num_generations duplicates
+                        # We only need prompts from one rank in each SP group
                        group_prompts = all_prompts_text[
                            group_leader_rank
                            * len(prompts_text) : (group_leader_rank + 1)
@@ -330,7 +316,7 @@ class AxolotlGRPOContextParallelTrainer(AxolotlGRPOTrainer):
                    # num_generations outputs for each one. This is faster than generating outputs for each duplicate
                    # prompt individually.
                    ordered_set_of_prompts = all_prompts_text[
-                        :: self.num_generations * self.args.context_parallel_degree
+                        :: self.num_generations * self.args.sequence_parallel_degree
                    ]

                with profiling_context(self, "vLLM.generate"):
@@ -347,28 +333,28 @@ class AxolotlGRPOContextParallelTrainer(AxolotlGRPOTrainer):
                    )
            else:
                completion_ids = [None] * (
-                    len(all_prompts_text) // self.args.context_parallel_degree
+                    len(all_prompts_text) // self.args.sequence_parallel_degree
                )

            # Broadcast the completions from the main process to all processes
            completion_ids = broadcast_object_list(completion_ids, from_process=0)

-            # Determine the appropriate slice based on context parallelism
-            if self.args.context_parallel_degree > 1:
-                # Calculate CP group ID (which group of ranks this rank belongs to)
+            # Determine the appropriate slice based on sequence parallelism
+            if self.args.sequence_parallel_degree > 1:
+                # Calculate SP group ID (which group of ranks this rank belongs to)
                sp_group_id = self.accelerator.process_index // self.local_world_size

-                # Calculate the start index for this CP group
+                # Calculate the start index for this SP group
                sp_group_start = sp_group_id * len(prompts) * self.local_world_size

-                # All ranks in the same CP group get the same data slice
+                # All ranks in the same SP group get the same data slice
                process_slice = slice(
                    sp_group_start,
                    sp_group_start + len(prompts),
                )
                completion_ids = completion_ids[process_slice]
            else:
-                # Original behavior for non-context parallel case
+                # Original behavior for non-sequence parallel case
                process_slice = slice(
                    self.accelerator.process_index * len(prompts),
                    (self.accelerator.process_index + 1) * len(prompts),
@@ -578,20 +564,20 @@ class AxolotlGRPOContextParallelTrainer(AxolotlGRPOTrainer):
            advantages = advantages / (std_grouped_rewards + 1e-4)

        # Slice to keep only the local part of the data
-        if self.args.context_parallel_degree > 1:
-            # Calculate CP group ID (which group of ranks this rank belongs to)
+        if self.args.sequence_parallel_degree > 1:
+            # Calculate SP group ID (which group of ranks this rank belongs to)
            sp_group_id = self.accelerator.process_index // self.local_world_size

-            # Calculate the start index for this CP group
+            # Calculate the start index for this SP group
            sp_group_start = sp_group_id * len(prompts) * self.local_world_size

-            # All ranks in the same CP group get the same data slice
+            # All ranks in the same SP group get the same data slice
            process_slice = slice(
                sp_group_start,
                sp_group_start + len(prompts),
            )
        else:
-            # Original behavior for non-context parallel case
+            # Original behavior for non-sequence parallel case
            process_slice = slice(
                self.accelerator.process_index * len(prompts),
                (self.accelerator.process_index + 1) * len(prompts),
--- a/src/axolotl/core/trainers/mixins/__init__.py
+++ b/src/axolotl/core/trainers/mixins/__init__.py
@@ -6,3 +6,4 @@
 from .optimizer import OptimizerMixin
 from .rng_state_loader import RngLoaderMixin
 from .scheduler import SchedulerMixin
+from .sequence_parallel import SequenceParallelMixin
--- a/src/axolotl/core/trainers/mixins/optimizer.py
+++ b/src/axolotl/core/trainers/mixins/optimizer.py
@@ -1,17 +1,18 @@
 """Module for Axolotl trainer optimizer mixin"""

+import logging
+
 from peft.optimizers import create_loraplus_optimizer
 from torch import nn
 from transformers.trainer import Trainer
 from transformers.utils import is_sagemaker_mp_enabled

 from axolotl.integrations.base import BaseOptimizerFactory
-from axolotl.utils.logging import get_logger

 if is_sagemaker_mp_enabled():
    import smdistributed.modelparallel.torch as smp

-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)


 class OptimizerMixin(Trainer):
@@ -198,20 +199,3 @@ class OptimizerMixin(Trainer):
            )

        return self.optimizer
-
-
-class OptimizerInitMixin:
-    """
-    Mixin to handle common optimizer initialization logic for Trainers (mostly TRL) that do not
-    accept optimizer_cls_and_kwargs as kwarg in constructor.
-    """
-
-    def __init__(self, *args, **kwargs):
-        optimizer_cls_and_kwargs = kwargs.pop("optimizer_cls_and_kwargs", None)
-        super().__init__(*args, **kwargs)
-        if (
-            optimizer_cls_and_kwargs
-            and self.optimizer_cls_and_kwargs is None
-            and self.optimizer is None
-        ):
-            self.optimizer_cls_and_kwargs = optimizer_cls_and_kwargs
--- a/src/axolotl/core/trainers/mixins/rng_state_loader.py
+++ b/src/axolotl/core/trainers/mixins/rng_state_loader.py
@@ -6,6 +6,7 @@ See https://github.com/huggingface/transformers/pull/37162
 TODO: Remove when upstream added PR to release
 """

+import logging
 import os
 import random

@@ -16,9 +17,7 @@ from transformers.trainer import safe_globals
 from transformers.trainer_pt_utils import set_rng_state_for_device
 from transformers.training_args import ParallelMode

-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)


 class RngLoaderMixin(Trainer):
--- a/src/axolotl/core/trainers/mixins/scheduler.py
+++ b/src/axolotl/core/trainers/mixins/scheduler.py
@@ -1,11 +1,12 @@
 """Module for Axolotl trainer scheduler mixin"""

+import logging
+
 import torch
 from torch.optim.lr_scheduler import LRScheduler, OneCycleLR
 from transformers.trainer import Trainer

 from axolotl.integrations.base import PluginManager
-from axolotl.utils.logging import get_logger
 from axolotl.utils.schedulers import (
    RexLR,
    get_cosine_schedule_with_min_lr,
@@ -13,7 +14,7 @@ from axolotl.utils.schedulers import (
    get_cosine_schedule_with_warmup_decay_constant,
 )

-LOG = get_logger(__name__)
+LOG = logging.getLogger(__name__)


 class SchedulerMixin(Trainer):
@@ -79,15 +80,13 @@ class SchedulerMixin(Trainer):
                self.lr_scheduler = RexLR(
                    optimizer=optimizer,
                    max_lr=self.args.learning_rate,
-                    min_lr=0 if not use_cosine_min_lr else (
-                        self.args.learning_rate * self.args.cosine_min_lr_ratio),
+                    min_lr=0 if not use_cosine_min_lr else (self.args.learning_rate * self.args.cosine_min_lr_ratio),
                    total_steps=num_training_steps,
                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                )
            elif use_cosine_quadratic:
                if use_cosine_min_lr:
-                    LOG.warning(
-                        "Both cosine quadratic warmup and min lr detected. Using quadratic warmup.")
+                    LOG.warning("Both cosine quadratic warmup and min lr detected. Using quadratic warmup.")

                self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup(  # pylint: disable=attribute-defined-outside-init
                    optimizer,
@@ -116,11 +115,9 @@ class SchedulerMixin(Trainer):
                return super().create_scheduler(num_training_steps, optimizer=optimizer)
        else:
            if use_cosine_quadratic:
-                LOG.warning(
-                    "axolotl's cosine scheduler with quadratic warmup not used (e.g., because of deepspeed).")
+                LOG.warning("axolotl's cosine scheduler with quadratic warmup not used (e.g., because of deepspeed).")

            if use_cosine_min_lr:
-                LOG.warning(
-                    "axolotl's cosine scheduler with min lr not used (e.g., because of deepspeed).")
+                LOG.warning("axolotl's cosine scheduler with min lr not used (e.g., because of deepspeed).")

        return self.lr_scheduler  # type: ignore
--- a/src/axolotl/core/trainers/mixins/sequence_parallel.py
+++ b/src/axolotl/core/trainers/mixins/sequence_parallel.py
@@ -0,0 +1,87 @@
+"""Module for Axolotl trainer sequence parallelism mixin"""
+
+import torch.distributed as dist
+from datasets import Dataset
+from torch.utils.data import DistributedSampler, Sampler
+
+from axolotl.monkeypatch.attention.ring_attn import (
+    get_ring_attn_group,
+)
+
+
+class SequenceParallelMixin:
+    """
+    Trainer mixin that adds sequence parallelism (SP) helpers.
+
+    It stores the ring attention process group and builds the distributed
+    samplers that hand every rank of an SP group the same data.
+    """
+
+    args = None  # type: "AxolotlTrainingArguments"  # type: ignore[name-defined]
+
+    def _setup_sequence_parallel(self):
+        """Fetch the ring attention process group and keep it on the trainer."""
+        self.ring_attn_group = get_ring_attn_group()
+
+    def _create_sequence_parallel_sampler(
+        self,
+        dataset: Dataset,
+        shuffle: bool = True,
+        is_eval: bool = False,
+    ) -> DistributedSampler:
+        """
+        Build the `DistributedSampler` used under sequence parallelism (SP).
+
+        The number of replicas is set to the number of SP groups and the rank to this
+        process's SP group ID, so all ranks of one SP group are given the same
+        sampler configuration. This bends `DistributedSampler` a little beyond its
+        intended use, but it works.
+
+        Args:
+            dataset: Dataset to sample from.
+            shuffle: Whether to shuffle; the configured seed is only passed when True.
+            is_eval: True for an evaluation sampler (`drop_last=False`), False for a
+                training sampler (`drop_last=True`).
+
+        Returns:
+            Distributed sampler.
+        """
+        world_size = self.args.world_size
+        sp_degree = self.args.sequence_parallel_degree
+        sampler_kwargs = {
+            "num_replicas": world_size // sp_degree,
+            "rank": dist.get_rank() // sp_degree,
+            "seed": self.args.seed if shuffle else None,
+            "shuffle": shuffle,
+            "drop_last": not is_eval,
+        }
+        return DistributedSampler(dataset, **sampler_kwargs)
+
+    def _sp_get_train_sampler(self, dataset) -> Sampler | None:
+        """
+        Build the sequence parallel sampler for the training dataset.
+
+        Args:
+            dataset: The training dataset.
+
+        Returns:
+            Sequence parallel sampler; it shuffles unless curriculum sampling is on.
+        """
+        should_shuffle = not self.args.curriculum_sampling
+        return self._create_sequence_parallel_sampler(dataset, shuffle=should_shuffle)
+
+    def _sp_get_eval_sampler(self, eval_dataset) -> Sampler | None:
+        """
+        Build the sequence parallel sampler for the evaluation dataset.
+
+        Args:
+            eval_dataset: The evaluation dataset.
+
+        Returns:
+            Non-shuffling sequence parallel sampler created with `is_eval=True`.
+        """
+        return self._create_sequence_parallel_sampler(
+            eval_dataset,
+            shuffle=False,
+            is_eval=True,
+        )
--- a/src/axolotl/core/trainers/trl.py
+++ b/src/axolotl/core/trainers/trl.py
@@ -1,5 +1,7 @@
 """Module for TRL PPO trainer"""

+from typing import Literal, Union
+
 import torch
 from tqdm import tqdm
 from trl import (
@@ -12,7 +14,6 @@ from trl import (
 )

 from axolotl.core.trainers.mixins import RngLoaderMixin
-from axolotl.core.trainers.mixins.optimizer import OptimizerInitMixin, OptimizerMixin
 from axolotl.core.trainers.mixins.scheduler import SchedulerMixin


@@ -74,19 +75,87 @@ class TRLPPOTrainer(PPOTrainer):
            )


-class AxolotlORPOTrainer(
-    RngLoaderMixin, SchedulerMixin, OptimizerMixin, OptimizerInitMixin, ORPOTrainer
-):
+class AxolotlORPOTrainer(RngLoaderMixin, SchedulerMixin, ORPOTrainer):
    """
    Extend the base ORPOTrainer for axolotl helpers
    """

    tag_names = ["axolotl", "orpo"]

+    def get_batch_loss_metrics(
+        self,
+        model,
+        batch: dict[str, Union[list, torch.LongTensor]],
+        train_eval: Literal["train", "eval"] = "train",
+    ):
+        """Compute the ORPO loss and other metrics for the given batch of inputs for train or test."""

-class AxolotlKTOTrainer(
-    RngLoaderMixin, SchedulerMixin, OptimizerMixin, OptimizerInitMixin, KTOTrainer
-):
+        # TODO remove once https://github.com/huggingface/trl/pull/3069 is included in a trl release
+
+        metrics = {}
+
+        forward_output = self.concatenated_forward(model, batch)
+        (
+            policy_chosen_logps,
+            policy_rejected_logps,
+            policy_chosen_logits,
+            policy_rejected_logits,
+            policy_nll_loss,
+        ) = forward_output[:5]
+        if self.aux_loss_enabled:
+            aux_loss = forward_output[5]
+
+        losses, chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen = (
+            self.odds_ratio_loss(policy_chosen_logps, policy_rejected_logps)
+        )
+        # full ORPO loss
+        loss = policy_nll_loss - losses.mean()
+
+        reward_accuracies = (chosen_rewards > rejected_rewards).float()
+
+        prefix = "eval_" if train_eval == "eval" else ""
+        metrics[f"{prefix}rewards/chosen"] = self.accelerator.gather_for_metrics(
+            chosen_rewards
+        ).mean()
+        metrics[f"{prefix}rewards/rejected"] = self.accelerator.gather_for_metrics(
+            rejected_rewards
+        ).mean()
+        metrics[f"{prefix}rewards/accuracies"] = self.accelerator.gather_for_metrics(
+            reward_accuracies
+        ).mean()
+        metrics[f"{prefix}rewards/margins"] = self.accelerator.gather_for_metrics(
+            chosen_rewards - rejected_rewards
+        ).mean()
+        metrics[f"{prefix}logps/rejected"] = (
+            self.accelerator.gather_for_metrics(policy_rejected_logps).detach().mean()
+        )
+        metrics[f"{prefix}logps/chosen"] = (
+            self.accelerator.gather_for_metrics(policy_chosen_logps).detach().mean()
+        )
+        metrics[f"{prefix}logits/rejected"] = self.accelerator.gather_for_metrics(
+            policy_rejected_logits.detach().mean()
+        ).mean()
+        metrics[f"{prefix}logits/chosen"] = self.accelerator.gather_for_metrics(
+            policy_chosen_logits.detach().mean()
+        ).mean()
+        metrics[f"{prefix}nll_loss"] = (
+            self.accelerator.gather_for_metrics(policy_nll_loss).detach().mean()
+        )
+        metrics[f"{prefix}log_odds_ratio"] = (
+            self.accelerator.gather_for_metrics(log_odds_ratio).detach().mean()
+        )
+        metrics[f"{prefix}log_odds_chosen"] = (
+            self.accelerator.gather_for_metrics(log_odds_chosen).detach().mean()
+        )
+        for k, v in metrics.items():
+            metrics[k] = v.item()
+        if self.aux_loss_enabled:
+            loss += self.aux_loss_coef * aux_loss
+
+        return loss, metrics
+
+
+class AxolotlKTOTrainer(RngLoaderMixin, SchedulerMixin, KTOTrainer):
    """
    Extend the base KTOTrainer for axolotl helpers
    """
@@ -94,19 +163,89 @@ class AxolotlKTOTrainer(
    tag_names = ["axolotl", "kto"]


-class AxolotlCPOTrainer(
-    RngLoaderMixin, SchedulerMixin, OptimizerMixin, OptimizerInitMixin, CPOTrainer
-):
+class AxolotlCPOTrainer(RngLoaderMixin, SchedulerMixin, CPOTrainer):
    """
    Extend the base CPOTrainer for axolotl helpers
    """

    tag_names = ["axolotl", "cpo"]

+    def get_batch_loss_metrics(
+        self,
+        model,
+        batch: dict[str, Union[list, torch.LongTensor]],
+        train_eval: Literal["train", "eval"] = "train",
+    ):
+        """Compute the CPO loss and other metrics for the given batch of inputs for train or test."""
+        metrics = {}

-class AxolotlRewardTrainer(
-    RngLoaderMixin, SchedulerMixin, OptimizerMixin, OptimizerInitMixin, RewardTrainer
-):
+        forward_output = self.concatenated_forward(model, batch)
+        (
+            policy_chosen_logps,
+            policy_rejected_logps,
+            policy_chosen_logits,
+            policy_rejected_logits,
+            policy_nll_loss,
+        ) = forward_output[:5]
+        if self.aux_loss_enabled:
+            aux_loss = forward_output[5]
+
+        losses, chosen_rewards, rejected_rewards = self.cpo_loss(
+            policy_chosen_logps,
+            policy_rejected_logps,
+        )
+
+        loss = losses.mean() + self.cpo_alpha * policy_nll_loss
+        reward_accuracies = (chosen_rewards > rejected_rewards).float()
+
+        prefix = "eval_" if train_eval == "eval" else ""
+        metrics[f"{prefix}rewards/chosen"] = (
+            self.accelerator.gather_for_metrics(chosen_rewards).mean().item()
+        )
+        metrics[f"{prefix}rewards/rejected"] = (
+            self.accelerator.gather_for_metrics(rejected_rewards).mean().item()
+        )
+        metrics[f"{prefix}rewards/accuracies"] = (
+            self.accelerator.gather_for_metrics(reward_accuracies).mean().item()
+        )
+        metrics[f"{prefix}rewards/margins"] = (
+            self.accelerator.gather_for_metrics(chosen_rewards - rejected_rewards)
+            .mean()
+            .item()
+        )
+        metrics[f"{prefix}logps/rejected"] = (
+            self.accelerator.gather_for_metrics(policy_rejected_logps)
+            .detach()
+            .mean()
+            .item()
+        )
+        metrics[f"{prefix}logps/chosen"] = (
+            self.accelerator.gather_for_metrics(policy_chosen_logps)
+            .detach()
+            .mean()
+            .item()
+        )
+        metrics[f"{prefix}logits/rejected"] = (
+            self.accelerator.gather_for_metrics(policy_rejected_logits.detach().mean())
+            .mean()
+            .item()
+        )
+        metrics[f"{prefix}logits/chosen"] = (
+            self.accelerator.gather_for_metrics(policy_chosen_logits.detach().mean())
+            .mean()
+            .item()
+        )
+        metrics[f"{prefix}nll_loss"] = (
+            self.accelerator.gather_for_metrics(policy_nll_loss).detach().mean().item()
+        )
+
+        if self.aux_loss_enabled:
+            loss += self.aux_loss_coef * aux_loss
+
+        return loss, metrics
+
+
+class AxolotlRewardTrainer(RngLoaderMixin, SchedulerMixin, RewardTrainer):
    """
    Extend the base RewardTrainer for axolotl helpers
    """
@@ -114,9 +253,7 @@ class AxolotlRewardTrainer(
    tag_names = ["axolotl", "reward"]


-class AxolotlPRMTrainer(
-    RngLoaderMixin, SchedulerMixin, OptimizerMixin, OptimizerInitMixin, PRMTrainer
-):
+class AxolotlPRMTrainer(RngLoaderMixin, SchedulerMixin, PRMTrainer):
    """
    Extend the base trl.PRMTrainer for axolotl helpers
    """
--- a/src/axolotl/core/training_args.py
+++ b/src/axolotl/core/training_args.py
@@ -9,6 +9,8 @@ from PIL.Image import Resampling
 from transformers import TrainingArguments
 from trl import CPOConfig, KTOConfig, ORPOConfig, PRMConfig, RewardConfig

+from axolotl.utils.schemas.enums import RingAttnFunc
+

 @dataclass
 class AxolotlTrainingMixins:
@@ -164,6 +166,12 @@ class AxolotlTrainingMixins:
        default=None,
        metadata={"help": "whether to use sequential sampling for curriculum learning"},
    )
+    alternate_optimizer: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "workaround to pass an alternate optimizer to the HF trainer"
+        },
+    )
    alternate_lr_scheduler_type: Optional[str] = field(
        default=None,
        metadata={
@@ -208,6 +216,17 @@ class AxolotlTrainingMixins:
        },
    )

+    sequence_parallel_degree: Optional[int] = field(
+        default=1,
+        metadata={"help": "The number of workers to use in sequence parallelism"},
+    )
+    ring_attn_func: Optional[RingAttnFunc] = field(
+        default=None,
+        metadata={
+            "help": "The ring-flash-attn function to use in sequence parallelism"
+        },
+    )
+
    adam_beta3: Optional[float] = field(
        default=None,
        metadata={
--- a/src/axolotl/datasets.py
+++ b/src/axolotl/datasets.py
@@ -1,13 +1,12 @@
 """Module containing Dataset functionality"""

+import logging
 import os
 from typing import List, Optional, Union

 import torch
 from datasets import Dataset, IterableDataset

-from axolotl.utils.logging import get_logger
-
 from .prompt_tokenizers import PromptTokenizingStrategy

 # We want this to be a wrapper for an existing dataset that we have loaded
@@ -16,7 +15,7 @@ from .prompt_tokenizers import PromptTokenizingStrategy
 # let's check to ensure we don't truncate an item in the middle, we'll use
 # the collators later on to pad the datasets

-LOG = get_logger(__name__)
+LOG = logging.getLogger("axolotl")


 class TokenizedPromptDataset(Dataset):
--- a/src/axolotl/integrations/base.py
+++ b/src/axolotl/integrations/base.py
@@ -10,83 +10,71 @@
 # License for the specific language governing permissions and limitations under
 # the License.

-"""Base class for all plugins.
+"""
+Base class for all plugins.

 A plugin is a reusable, modular, and self-contained piece of code that extends the functionality of Axolotl.
 Plugins can be used to integrate third-party models, modify the training process, or add new features.

 To create a new plugin, you need to inherit from the BasePlugin class and implement the required methods.
 """
-
-from __future__ import annotations
-
 import collections
 import importlib
-from typing import TYPE_CHECKING, Callable, OrderedDict, Union
+import logging
+from typing import OrderedDict

-from peft import PeftModel
-from torch.optim import Optimizer
+import torch
 from torch.optim.lr_scheduler import LRScheduler
-from transformers import PreTrainedModel, Trainer

 from axolotl.utils.dict import DictDefault
-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__, use_environ=True)
-
-if TYPE_CHECKING:
-    from axolotl.common.datasets import TrainDatasetMeta


 class BasePlugin:
-    """Base class for all plugins. Defines the interface for plugin methods.
+    """
+    Base class for all plugins. Defines the interface for plugin methods.

-    A plugin is a reusable, modular, and self-contained piece of code that extends
-    the functionality of Axolotl. Plugins can be used to integrate third-party models,
-    modify the training process, or add new features.
+    Attributes:
+    None

-    To create a new plugin, you need to inherit from the BasePlugin class and
-    implement the required methods.
-
-    Note:
-        Plugin methods include:
-        - register(cfg): Registers the plugin with the given configuration.
-        - load_datasets(cfg): Loads and preprocesses the dataset for training.
-        - pre_model_load(cfg): Performs actions before the model is loaded.
-        - post_model_build(cfg, model): Performs actions after the model is loaded, but
-            before LoRA adapters are applied.
-        - pre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.
-        - post_lora_load(cfg, model): Performs actions after LoRA weights are loaded.
-        - post_model_load(cfg, model): Performs actions after the model is loaded,
-            inclusive of any adapters.
-        - post_trainer_create(cfg, trainer): Performs actions after the trainer is
-            created.
-        - create_optimizer(cfg, trainer): Creates and returns an optimizer for training.
-        - create_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and
-            returns a learning rate scheduler.
-        - add_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before
-            training.
-        - add_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after
-            training.
+    Methods:
+    register(cfg): Registers the plugin with the given configuration.
+    load_datasets(cfg): Loads and preprocesses the dataset for training.
+    pre_model_load(cfg): Performs actions before the model is loaded.
+    post_model_build(cfg, model): Performs actions after the model is loaded, but before LoRA adapters are applied.
+    pre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.
+    post_lora_load(cfg, model): Performs actions after LoRA weights are loaded.
+    post_model_load(cfg, model): Performs actions after the model is loaded, inclusive of any adapters.
+    post_trainer_create(cfg, trainer): Performs actions after the trainer is created.
+    create_optimizer(cfg, trainer): Creates and returns an optimizer for training.
+    create_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and returns a learning rate scheduler.
+    add_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before training.
+    add_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after training.
    """

    def __init__(self):
-        """Initializes the BasePlugin."""
+        """
+        Initializes the BasePlugin.
+        """

-    def register(self, cfg: DictDefault):  # pylint: disable=unused-argument
-        """Registers the plugin with the given configuration.
+    def register(self, cfg):  # pylint: disable=unused-argument
+        """
+        Registers the plugin with the given configuration.

-        Args:
-            cfg: The configuration for the plugin.
+        Parameters:
+        cfg (dict): The configuration for the plugin.
+
+        Returns:
+        None
        """

    def get_input_args(self) -> str | None:
-        """Returns a pydantic model for the plugin's input arguments."""
+        """
+        Returns a pydantic model for the plugin's input arguments.
+        """

-    def load_datasets(
-        self, cfg: DictDefault, preprocess: bool = False
-    ) -> Union["TrainDatasetMeta", None]:
-        """Loads and preprocesses the dataset for training.
+    def load_datasets(self, cfg: DictDefault, preprocess: bool = False):
+        """
+        Loads and preprocesses the dataset for training.

        Args:
            cfg: The configuration for the plugin.
@@ -96,164 +84,181 @@ class BasePlugin:
            dataset_meta: The metadata for the training dataset.
        """

-    def pre_model_load(self, cfg: DictDefault):  # pylint: disable=unused-argument
-        """Performs actions before the model is loaded.
-
-        Args:
-            cfg: The configuration for the plugin.
+    def pre_model_load(self, cfg):  # pylint: disable=unused-argument
        """
-
-    # pylint: disable=unused-argument
-    def post_model_build(self, cfg: DictDefault, model: PreTrainedModel):
-        """Performs actions after the model is built/loaded, but before any adapters are applied.
+        Performs actions before the model is loaded.

        Args:
-            cfg: The configuration for the plugin.
-        """
-
-    # pylint: disable=unused-argument
-    def pre_lora_load(self, cfg: DictDefault, model: PreTrainedModel):
-        """Performs actions before LoRA weights are loaded.
-
-        Args:
-            cfg: The configuration for the plugin.
-            model: The loaded model.
-        """
-
-    # pylint: disable=unused-argument
-    def post_lora_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
-        """Performs actions after LoRA weights are loaded.
-
-        Args:
-            cfg: The configuration for the plugin.
-            model: The loaded model.
-        """
-
-    # pylint: disable=unused-argument
-    def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
-        """Performs actions after the model is loaded.
-
-        Args:
-            cfg: The configuration for the plugin.
-            model: The loaded model.
-        """
-
-    # pylint: disable=unused-argument
-    def get_trainer_cls(self, cfg: DictDefault) -> Trainer | None:
-        """Returns a custom class for the trainer.
-
-        Args:
-            cfg: The global axolotl configuration.
+            cfg (dict): The configuration for the plugin.

        Returns:
-            The first non-`None` trainer class returned by a plugin.
+            None
        """

-    # pylint: disable=unused-argument
-    def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
-        """Performs actions after the trainer is created.
+    def post_model_build(self, cfg, model):  # pylint: disable=unused-argument
+        """
+        Performs actions after the model is built/loaded, but before any adapters are applied.

        Args:
-            cfg: The configuration for the plugin.
-            trainer: The trainer object for training.
+            cfg (dict): The configuration for the plugin.
        """

-    # pylint: disable=unused-argument
-    def create_optimizer(self, cfg: DictDefault, trainer: Trainer) -> Optimizer | None:
-        """Creates and returns an optimizer for training.
+    def post_model_load(self, cfg, model):  # pylint: disable=unused-argument
+        """
+        Performs actions after the model is loaded.

        Args:
-            cfg: The configuration for the plugin.
-            trainer: The trainer object for training.
+            cfg (dict): The configuration for the plugin.
+            model (object): The loaded model.

        Returns:
-            The created optimizer.
+            None
+        """
+
+    def pre_lora_load(self, cfg, model):  # pylint: disable=unused-argument
+        """
+        Performs actions before LoRA weights are loaded.
+
+        Args:
+            cfg (dict): The configuration for the plugin.
+            model (object): The loaded model.
+
+        Returns:
+            None
+        """
+
+    def post_lora_load(self, cfg, model):  # pylint: disable=unused-argument
+        """
+        Performs actions after LoRA weights are loaded.
+
+        Args:
+            cfg (dict): The configuration for the plugin.
+            model (object): The loaded model.
+
+        Returns:
+            None
+        """
+
+    def get_trainer_cls(self, cfg):  # pylint: disable=unused-argument
+        """
+        Returns a custom class for the trainer.
+
+        Args:
+            cfg (dict): The global axolotl configuration.
+
+        Returns:
+            class: The class for the trainer.
+        """
+
+    def post_trainer_create(self, cfg, trainer):  # pylint: disable=unused-argument
+        """
+        Performs actions after the trainer is created.
+
+        Args:
+            cfg (dict): The configuration for the plugin.
+            trainer (object): The trainer object for training.
+
+        Returns:
+            None
+        """
+
+    def create_optimizer(self, cfg, trainer):  # pylint: disable=unused-argument
+        """
+        Creates and returns an optimizer for training.
+
+        Args:
+            cfg (dict): The configuration for the plugin.
+            trainer (object): The trainer object for training.
+
+        Returns:
+            object: The created optimizer.
        """

-    # pylint: disable=unused-argument
    def create_lr_scheduler(
-        self,
-        cfg: DictDefault,
-        trainer: Trainer,
-        optimizer: Optimizer,
-        num_training_steps: int,
-    ) -> LRScheduler | None:
-        """Creates and returns a learning rate scheduler.
+        self, cfg, trainer, optimizer, num_training_steps
+    ) -> LRScheduler | None:  # pylint: disable=unused-argument
+        """
+        Creates and returns a learning rate scheduler.

        Args:
-            cfg: The configuration for the plugin.
-            trainer: The trainer object for training.
-            optimizer: The optimizer for training.
-            num_training_steps: Total number of training steps
+            cfg (dict): The configuration for the plugin.
+            trainer (object): The trainer object for training.
+            optimizer (object): The optimizer for training.
+            num_training_steps (int): Total number of training steps

        Returns:
-            The created learning rate scheduler.
+            object (LRScheduler): The created learning rate scheduler.
        """

-    # pylint: disable=unused-argument
-    def add_callbacks_pre_trainer(
-        self, cfg: DictDefault, model: PreTrainedModel
-    ) -> list[Callable]:
-        """Set up callbacks before creating the trainer.
+    def add_callbacks_pre_trainer(self, cfg, model):  # pylint: disable=unused-argument
+        """
+        Set up callbacks before creating the trainer.

        Args:
-            cfg: The configuration for the plugin.
-            model: The loaded model.
+            cfg (dict): The configuration for the plugin.
+            model (object): The loaded model.

        Returns:
-            A list of callback functions to be added to the `TrainingArgs`.
+            List[callable]: A list of callback functions to be added to the TrainingArgs
        """
        return []

-    # pylint: disable=unused-argument
    def add_callbacks_post_trainer(
-        self, cfg: DictDefault, trainer: Trainer
-    ) -> list[Callable]:
-        """Adds callbacks to the trainer after creating the trainer. This is useful for
-        callbacks that require access to the model or trainer.
+        self, cfg, trainer
+    ):  # pylint: disable=unused-argument
+        """
+        Adds callbacks to the trainer after creating the trainer.
+        This is useful for callbacks that require access to the model or trainer.

        Args:
-            cfg: The configuration for the plugin.
-            trainer: The trainer object for training.
+            cfg (dict): The configuration for the plugin.
+            trainer (object): The trainer object for training.

        Returns:
-            A list of callback functions to be added
+            List[callable]: A list of callback functions to be added
        """
        return []

-    # pylint: disable=unused-argument
-    def post_train(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
-        """Performs actions after training is complete.
+    def post_train(self, cfg, model):  # pylint: disable=unused-argument
+        """
+        Performs actions after training is complete.

        Args:
-            cfg: The axolotl configuration.
-            model: The loaded model.
+            cfg (dict): The axolotl configuration
+            model (object): The loaded model.
+
+        Returns:
+            None
        """

-    def post_train_unload(self, cfg: DictDefault):  # pylint: disable=unused-argument
-        """Performs actions after training is complete and the model is unloaded.
+    def post_train_unload(self, cfg):  # pylint: disable=unused-argument
+        """
+        Performs actions after training is complete and the model is unloaded.

        Args:
-            cfg: The configuration for the plugin.
+            cfg (dict): The configuration for the plugin.
+
+        Returns:
+            None
        """


 def load_plugin(plugin_name: str) -> BasePlugin:
-    """Loads a plugin based on the given plugin name.
+    """
+    Loads a plugin based on the given plugin name.

-    The plugin name should be in the format "module_name.class_name". This function
-    splits the plugin name into module and class, imports the module, retrieves the
-    class from the module, and creates an instance of the class.
+    The plugin name should be in the format "module_name.class_name".
+    This function splits the plugin name into module and class, imports the module,
+    retrieves the class from the module, and creates an instance of the class.

-    Args:
-        plugin_name: The name of the plugin to be loaded. The name should be in the
-            format "module_name.class_name".
+    Parameters:
+    plugin_name (str): The name of the plugin to be loaded. The name should be in the format "module_name.class_name".

    Returns:
-        An instance of the loaded plugin.
+    BasePlugin: An instance of the loaded plugin.

    Raises:
-        ImportError: If the plugin module cannot be imported.
+    ImportError: If the plugin module cannot be imported.
    """
    # split the plugin name into module and class
    module_name, class_name = plugin_name.rsplit(".", 1)
@@ -279,26 +284,28 @@ def load_plugin(plugin_name: str) -> BasePlugin:


 class PluginManager:
-    """The `PluginManager` class is responsible for loading and managing plugins. It
-    should be a singleton so it can be accessed from anywhere in the codebase.
+    """
+    The PluginManager class is responsible for loading and managing plugins.
+    It should be a singleton so it can be accessed from anywhere in the codebase.

    Attributes:
-        plugins: A list of loaded plugins.
+    plugins (List[BasePlugin]): A list of loaded plugins.

-    Note:
-        Key methods include:
-        - get_instance(): Static method to get the singleton instance of `PluginManager`.
-        - register(plugin_name: str): Registers a new plugin by its name.
-        - pre_model_load(cfg): Calls the pre_model_load method of all registered plugins.
+    Methods:
+    get_instance(): Static method to get the singleton instance of PluginManager.
+    register(plugin_name: str): Registers a new plugin by its name.
+    pre_model_load(cfg): Calls the pre_model_load method of all registered plugins.
    """

    plugins: OrderedDict[str, BasePlugin] = collections.OrderedDict()

-    _instance: PluginManager | None = None
-    _cfg: DictDefault | None = None
+    _instance = None
+    _cfg = None

    def __new__(cls):
-        """Creates a new instance of PluginManager if it doesn't exist yet."""
+        """
+        Creates a new instance of PluginManager if it doesn't exist yet.
+        """
        if cls._instance is None:
            cls._instance = super(PluginManager, cls).__new__(cls)
            cls._instance.plugins: OrderedDict[str, BasePlugin] = (
@@ -308,8 +315,9 @@ class PluginManager:

    @staticmethod
    def get_instance() -> "PluginManager":
-        """Returns the singleton instance of PluginManager. If the instance doesn't
-        exist, it creates a new one.
+        """
+        Returns the singleton instance of PluginManager.
+        If the instance doesn't exist, it creates a new one.
        """
        if PluginManager._instance is None:
            PluginManager()
@@ -324,27 +332,32 @@ class PluginManager:
        self._cfg = cfg

    def register(self, plugin_name: str):
-        """Registers a new plugin by its name.
-
-        Args:
-            plugin_name: The name of the plugin to be registered.
-
-        Raises:
-            ImportError: If the plugin module cannot be imported.
        """
-        try:
-            LOG.info(f"Attempting to load plugin: {plugin_name}")
-            plugin = load_plugin(plugin_name)
-            self.plugins[plugin_name] = plugin
-            LOG.info(f"Plugin loaded successfully: {plugin_name}")
-        except ImportError:
-            LOG.error(f"Failed to load plugin: {plugin_name}")
+        Registers a new plugin by its name.

-    def get_input_args(self) -> list[str]:
-        """Returns a list of Pydantic classes for all registered plugins' input arguments.'
+        Parameters:
+        plugin_name (str): The name of the plugin to be registered.

        Returns:
-            A list of Pydantic classes for all registered plugins' input arguments.'
+        None
+
+        Raises:
+        ImportError: If the plugin module cannot be imported.
+        """
+        try:
+            logging.info(f"Attempting to load plugin: {plugin_name}")
+            plugin = load_plugin(plugin_name)
+            self.plugins[plugin_name] = plugin
+            logging.info(f"Plugin loaded successfully: {plugin_name}")
+        except ImportError:
+            logging.error(f"Failed to load plugin: {plugin_name}")
+
+    def get_input_args(self):
+        """
+        Returns a list of Pydantic classes for all registered plugins' input arguments.
+
+        Returns:
+        list[str]: A list of Pydantic classes for all registered plugins' input arguments.
        """
        input_args = []
        for plugin in self.plugins.values():
@@ -353,17 +366,16 @@ class PluginManager:
                input_args.append(input_args_from_plugin)
        return input_args

-    def load_datasets(
-        self, cfg: DictDefault, preprocess: bool = False
-    ) -> Union["TrainDatasetMeta", None]:
-        """Calls the load_datasets method of each registered plugin.
+    def load_datasets(self, cfg, preprocess: bool = False):
+        """
+        Calls the load_datasets method of each registered plugin.

        Args:
            cfg: The configuration for the plugins.
-            preprocess: Whether this is preprocess step of the datasets.
+            preprocess: Whether this is the preprocess step of the datasets.

        Returns:
-            The dataset metadata loaded from all registered plugins.
+            dataset_meta: The dataset metadata loaded from all registered plugins.
        """
        return_ds_meta = None
        for plugin in self.plugins.values():
@@ -375,66 +387,83 @@ class PluginManager:
                    raise RuntimeError("Multiple plugins loaded datasets")
        return return_ds_meta

-    def pre_model_load(self, cfg: DictDefault):
-        """Calls the pre_model_load method of all registered plugins.
+    def pre_model_load(self, cfg):
+        """
+        Calls the pre_model_load method of all registered plugins.

-        Args:
-            cfg: The configuration for the plugins.
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+
+        Returns:
+        None
        """
        for plugin in self.plugins.values():
            plugin.pre_model_load(cfg)

-    def post_model_build(self, cfg: DictDefault, model: PreTrainedModel):
-        """Calls the `post_model_build` method of all registered plugins after the
-        model has been built / loaded, but before any adapters have been applied.
+    def post_model_build(self, cfg, model):
+        """
+        Calls the post_model_build method of all registered plugins after the model has been built/loaded,
+        but before any adapters have been applied.

        Args:
-            cfg: The configuration for the plugins.
-            model: The loaded model.
+            cfg (dict): The configuration for the plugins.
+            model (object): The loaded model.
        """
        for plugin in self.plugins.values():
            plugin.post_model_build(cfg, model)

-    def pre_lora_load(self, cfg: DictDefault, model: PreTrainedModel):
-        """Calls the `pre_lora_load` method of all registered plugins.
-
-        Args:
-            cfg: The configuration for the plugins.
-            model: The loaded model.
+    def post_model_load(self, cfg, model):
        """
-        for plugin in self.plugins.values():
-            plugin.pre_lora_load(cfg, model)
+        Calls the post_model_load method of all registered plugins after the model has been loaded
+        inclusive of any adapters.

-    def post_lora_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
-        """Calls the `post_lora_load` method of all registered plugins.
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+        model (object): The loaded model.

-        Args:
-            cfg: The configuration for the plugins.
-            model: The loaded model.
-        """
-        for plugin in self.plugins.values():
-            plugin.post_lora_load(cfg, model)
-
-    def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
-        """Calls the `post_model_load` method of all registered plugins after the model
-        has been loaded inclusive of any adapters.
-
-        Args:
-            cfg: The configuration for the plugins.
-            model: The loaded model.
+        Returns:
+        None
        """
        for plugin in self.plugins.values():
            plugin.post_model_load(cfg, model)

-    def get_trainer_cls(self, cfg: DictDefault) -> Trainer | None:
-        """Calls the `get_trainer_cls` method of all registered plugins and returns the
-        first non-`None` trainer class.
+    def pre_lora_load(self, cfg, model):
+        """
+        Calls the pre_lora_load method of all registered plugins.

-        Args:
-            cfg: The configuration for the plugins.
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+        model (object): The loaded model.

        Returns:
-            The first non-`None` trainer class returned by a plugin.
+        None
+        """
+        for plugin in self.plugins.values():
+            plugin.pre_lora_load(cfg, model)
+
+    def post_lora_load(self, cfg, model):
+        """
+        Calls the post_lora_load method of all registered plugins.
+
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+        model (object): The loaded model.
+
+        Returns:
+        None
+        """
+        for plugin in self.plugins.values():
+            plugin.post_lora_load(cfg, model)
+
+    def get_trainer_cls(self, cfg):
+        """
+        Calls the get_trainer_cls method of all registered plugins and returns the first non-None trainer class.
+
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+
+        Returns:
+        object: The trainer class, or None if none was found.
        """
        for plugin in self.plugins.values():
            trainer_cls = plugin.get_trainer_cls(cfg)
@@ -442,25 +471,29 @@ class PluginManager:
                return trainer_cls
        return None

-    def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
-        """Calls the `post_trainer_create` method of all registered plugins.
+    def post_trainer_create(self, cfg, trainer):
+        """
+        Calls the post_trainer_create method of all registered plugins.

-        Args:
-            cfg: The configuration for the plugins.
-            trainer: The trainer object for training.
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+        trainer (object): The trainer object for training.
+
+        Returns:
+        None
        """
        for plugin in self.plugins.values():
            plugin.post_trainer_create(cfg, trainer)

-    def create_optimizer(self, trainer: Trainer) -> Optimizer | None:
-        """Calls the `create_optimizer` method of all registered plugins and returns
-        the first non-`None` optimizer.
+    def create_optimizer(self, trainer):
+        """
+        Calls the create_optimizer method of all registered plugins and returns the first non-None optimizer.

-        Args:
-            trainer: The trainer object for training.
+        Parameters:
+        trainer (object): The trainer object for training.

        Returns:
-            The created optimizer, or `None` if none was found.
+        object: The created optimizer, or None if none was found.
        """
        for plugin in self.plugins.values():
            optimizer = plugin.create_optimizer(self.cfg, trainer)
@@ -469,17 +502,17 @@ class PluginManager:
        return None

    def create_lr_scheduler(
-        self, trainer: Trainer, optimizer: Optimizer, num_training_steps: int
+        self, trainer, optimizer, num_training_steps
    ) -> LRScheduler | None:
-        """Calls the `create_lr_scheduler` method of all registered plugins and returns
-        the first non-`None` scheduler.
+        """
+        Calls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.

-        Args:
-            trainer: The trainer object for training.
-            optimizer: The optimizer for training.
+        Parameters:
+        trainer (object): The trainer object for training.
+        optimizer (object): The optimizer for training.

        Returns:
-            The created learning rate scheduler, or `None` if not found.
+        object: The created learning rate scheduler, or None if none was found.
        """
        for plugin in self.plugins.values():
            scheduler: LRScheduler | None = plugin.create_lr_scheduler(
@@ -492,17 +525,16 @@ class PluginManager:
                return scheduler
        return None

-    def add_callbacks_pre_trainer(
-        self, cfg: DictDefault, model: PreTrainedModel
-    ) -> list[Callable]:
-        """Calls the add_callbacks_pre_trainer method of all registered plugins.
+    def add_callbacks_pre_trainer(self, cfg, model):
+        """
+        Calls the add_callbacks_pre_trainer method of all registered plugins.

-        Args:
-            cfg: The configuration for the plugins.
-            model: The loaded model.
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+        model (object): The loaded model.

        Returns:
-            A list of callback functions to be added to the `TrainingArgs`.
+        List[callable]: A list of callback functions to be added to the TrainingArgs.
        """
        callbacks = []
        for plugin in self.plugins.values():
@@ -511,17 +543,16 @@ class PluginManager:
                callbacks.extend(plugin_callbacks)
        return callbacks

-    def add_callbacks_post_trainer(
-        self, cfg: DictDefault, trainer: Trainer
-    ) -> list[Callable]:
-        """Calls the `add_callbacks_post_trainer` method of all registered plugins.
+    def add_callbacks_post_trainer(self, cfg, trainer):
+        """
+        Calls the add_callbacks_post_trainer method of all registered plugins.

-        Args:
-            cfg: The configuration for the plugins.
-            trainer: The trainer object for training.
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+        trainer (object): The trainer object for training.

        Returns:
-            A list of callback functions to be added to the `TrainingArgs`.
+        List[callable]: A list of callback functions to be added to the TrainingArgs.
        """
        callbacks = []
        for plugin in self.plugins.values():
@@ -530,30 +561,41 @@ class PluginManager:
                callbacks.extend(plugin_callbacks)
        return callbacks

-    def post_train(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
-        """Calls the post_train method of all registered plugins.
+    def post_train(self, cfg, model):
+        """
+        Calls the post_train method of all registered plugins.

-        Args:
-            cfg: The configuration for the plugins.
-            model: The loaded model.
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+        model (object): The loaded model.
+
+        Returns:
+        None
        """
        for plugin in self.plugins.values():
            plugin.post_train(cfg, model)

-    def post_train_unload(self, cfg: DictDefault):
-        """Calls the post_train_unload method of all registered plugins.
+    def post_train_unload(self, cfg):
+        """
+        Calls the post_train_unload method of all registered plugins.

-        Args:
-            cfg: The configuration for the plugins.
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+        model (object): The loaded model.
+
+        Returns:
+        None
        """
        for plugin in self.plugins.values():
            plugin.post_train_unload(cfg)


 class BaseOptimizerFactory:
-    """Base class for factories to create custom optimizers"""
+    """
+    Base class for factories to create custom optimizers
+    """

    def __call__(
        self, opt_model, training_args, **optimizer_kwargs
-    ) -> Optimizer | None:
+    ) -> "torch.optim.Optimizer":
        pass
--- a/src/axolotl/integrations/cut_cross_entropy/init.py
+++ b/src/axolotl/integrations/cut_cross_entropy/init.py
@@ -19,16 +19,17 @@ Cut Cross Entropy is an optimized implementation of cross entropy loss
 from Apple's ML team.
 """
 import importlib
+import logging

 import torch

 from axolotl.integrations.base import BasePlugin
 from axolotl.utils import get_pytorch_version
-from axolotl.utils.logging import get_logger
+from axolotl.utils.distributed import is_main_process

 from .args import CutCrossEntropyArgs  # pylint: disable=unused-import. # noqa: F401

-LOG = get_logger(__name__, use_environ=True)
+LOG = logging.getLogger("axolotl.integrations.cut_cross_entropy")

 _CCE_INSTALL_MESSAGE = (
    "Please install cut_cross_entropy with transformers support using "
@@ -75,9 +76,10 @@ class CutCrossEntropyPlugin(BasePlugin):
                cce_patch,
            )

-            LOG.info(
-                f"Applying Cut Cross Entropy to model type: {cfg.model_config_type}"
-            )
+            if is_main_process(use_environ=True):
+                LOG.info(
+                    f"Applying Cut Cross Entropy to model type: {cfg.model_config_type}"
+                )

            # The patch checks model_type internally
            cce_patch(cfg.model_config_type)
--- a/src/axolotl/integrations/cut_cross_entropy/args.py
+++ b/src/axolotl/integrations/cut_cross_entropy/args.py
@@ -15,13 +15,12 @@
 """
 Module for handling Cut Cross Entropy input arguments.
 """
+import logging
 from typing import Optional

 from pydantic import BaseModel, model_validator

-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
+LOG = logging.getLogger("axolotl.integrations.cut_cross_entropy.args")


 class CutCrossEntropyArgs(BaseModel):
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/mllama.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/mllama.py
@@ -15,14 +15,23 @@ from cut_cross_entropy.transformers.utils import (
 from transformers.cache_utils import Cache
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.models.mllama.modeling_mllama import (
+    MLLAMA_INPUTS_DOCSTRING,
    _prepare_cross_attention_mask,
 )
+from transformers.utils import (
+    add_start_docstrings_to_model_forward,
+    replace_return_docstrings,
+)
 from transformers.utils.deprecation import deprecate_kwarg

 _PATCH_OPTS: PatchOptions | None = None


@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
+@add_start_docstrings_to_model_forward(MLLAMA_INPUTS_DOCSTRING)
+@replace_return_docstrings(
+    output_type=CausalLMOutputWithPast, config_class="MllamaTextConfig"
+)
 def cce_forward(
    self,
    input_ids: torch.LongTensor | None = None,
@@ -155,6 +164,10 @@ def cce_forward(


@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
+@add_start_docstrings_to_model_forward(MLLAMA_INPUTS_DOCSTRING)
+@replace_return_docstrings(
+    output_type=CausalLMOutputWithPast, config_class="MllamaConfig"
+)
 def cce_forward_multimodal(
    self,
    input_ids: Optional[torch.LongTensor] = None,
--- a/src/axolotl/integrations/grokfast/init.py
+++ b/src/axolotl/integrations/grokfast/init.py
@@ -2,15 +2,15 @@
 Grokfast plugin for Axolotl
 """

-from transformers.trainer_callback import TrainerCallback
+import logging

-from axolotl.utils.logging import get_logger
+from transformers.trainer_callback import TrainerCallback

 from ..base import BasePlugin
 from .args import GrokfastArgs  # pylint: disable=unused-import. # noqa: F401
 from .optimizer import gradfilter_ema

-LOG = get_logger(__name__)
+LOG = logging.getLogger("axolotl.integrations.grokfast")


 class GrokfastCallbackHandler(TrainerCallback):
--- a/src/axolotl/integrations/liger/init.py
+++ b/src/axolotl/integrations/liger/init.py
@@ -19,15 +19,16 @@ Liger Kernel is the collection of Triton-native kernels for LLM Training.
 It is designed to be performant, correct, and light-weight.
 """
 import inspect
+import logging
 import sys

 from axolotl.integrations.base import BasePlugin
-from axolotl.utils.logging import get_logger
+from axolotl.utils.distributed import is_main_process

 from .args import LigerArgs  # pylint: disable=unused-import. # noqa: F401
 from .utils import patch_with_compile_disable

-LOG = get_logger(__name__, use_environ=True)
+LOG = logging.getLogger("axolotl.integrations.liger")


 class LigerPlugin(BasePlugin):
@@ -84,7 +85,10 @@ class LigerPlugin(BasePlugin):
                kwargs["geglu"] = cfg.liger_glu_activation
            elif "swiglu" in liger_fn_sig.parameters:
                kwargs["swiglu"] = cfg.liger_glu_activation
-            LOG.info(f"Applying LIGER to {cfg.model_config_type} with kwargs: {kwargs}")
+            if is_main_process(use_environ=True):
+                LOG.info(
+                    f"Applying LIGER to {cfg.model_config_type} with kwargs: {kwargs}"
+                )
            apply_liger_fn(**kwargs)
        elif cfg.model_config_type == "jamba":
            from transformers.models.jamba import modeling_jamba
@@ -120,9 +124,9 @@ class LigerPlugin(BasePlugin):
            if cfg.liger_rope:
                # The DeepseekV2 version of RoPE is different than upstream LLaMA.
                # See https://github.com/linkedin/Liger-Kernel/issues/129#issuecomment-2313763528
-                LOG.warning("Fused liger_rope is not supported for DeepseekV2.")
+                logging.warning("Fused liger_rope is not supported for DeepseekV2.")
            if cfg.liger_glu_activation:
-                LOG.warning("liger_glu_activation is not supported for DeepseekV2.")
+                logging.warning("liger_glu_activation is not supported for DeepseekV2.")
            if cfg.liger_rms_norm:
                modeling_mod.DeepseekV2RMSNorm = LigerRMSNorm
            if cfg.liger_glu_activation:
@@ -171,17 +175,7 @@ class LigerPlugin(BasePlugin):
                rms_norm=cfg.liger_rms_norm,
                layer_norm=cfg.liger_layer_norm,
            )
-        elif cfg.model_config_type == "granitemoe":
-            from liger_kernel.transformers import apply_liger_kernel_to_granite
-
-            apply_liger_kernel_to_granite(
-                rope=cfg.liger_rope,
-                cross_entropy=cfg.liger_cross_entropy,
-                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
-                rms_norm=cfg.liger_rms_norm,
-                swiglu=cfg.liger_glu_activation,
-            )
        else:
-            LOG.warning(
+            logging.warning(
                f"Unsupported model config type: {cfg.model_config_type}. Liger not applied."
            )
--- a/src/axolotl/integrations/liger/args.py
+++ b/src/axolotl/integrations/liger/args.py
@@ -15,13 +15,12 @@
 """
 Module for handling LIGER input arguments.
 """
+import logging
 from typing import Optional

 from pydantic import BaseModel, model_validator

-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
+LOG = logging.getLogger("axolotl.integrations.liger.args")


 class LigerArgs(BaseModel):
--- a/src/axolotl/integrations/llm_compressor/plugin.py
+++ b/src/axolotl/integrations/llm_compressor/plugin.py
@@ -3,6 +3,7 @@ Sparse Finetuning plugin for Axolotl — enables handling of sparse neural netwo
 by maintaining masks for zero weights during training.
 """

+import logging
 from functools import wraps
 from typing import Any, Callable, Concatenate, ParamSpec, TypeVar

@@ -15,12 +16,11 @@ from transformers.trainer_callback import TrainerCallback, TrainerControl, Train
 from transformers.training_args import TrainingArguments

 from axolotl.integrations.base import BasePlugin
-from axolotl.utils.logging import get_logger

 P = ParamSpec("P")  # Params for generic function signatures
 R = TypeVar("R")  # Return type for generic function signatures

-LOG = get_logger(__name__)
+LOG = logging.getLogger("axolotl.integrations.llm_compressor")


 class LLMCompressorCallbackHandler(TrainerCallback):
--- a/src/axolotl/integrations/spectrum/init.py
+++ b/src/axolotl/integrations/spectrum/init.py
@@ -17,16 +17,14 @@ Spectrum Plugin to automatically generate unfrozen parameters based on SNR data.
 """

 import json
+import logging

 import requests

 from axolotl.integrations.base import BasePlugin
-from axolotl.utils.logging import get_logger

 from .args import SpectrumArgs  # pylint: disable=unused-import. # noqa: F401

-LOG = get_logger(__name__)
-

 def _generate_unfrozen_params_yaml(snr_data, top_fraction=0.5):
    unfrozen_parameters = {}
@@ -85,17 +83,17 @@ class SpectrumPlugin(BasePlugin):
        except FileNotFoundError:
            pass
        except Exception as exc:  # pylint: disable=broad-exception-caught
-            LOG.warning(f"Failed to read SNR data from {snr_path}: {exc}")
+            logging.warning(f"Failed to read SNR data from {snr_path}: {exc}")

        if not snr_data:
            try:
                snr_data = requests.get(snr_url, timeout=60).json()
            except requests.exceptions.RequestException as exc:
-                LOG.warning(f"Failed to fetch SNR data from {snr_url}: {exc}")
+                logging.warning(f"Failed to fetch SNR data from {snr_url}: {exc}")
                return
            # also catch json parsing errors
            except json.JSONDecodeError as exc:
-                LOG.warning(f"Failed to parse SNR data from {snr_url}: {exc}")
+                logging.warning(f"Failed to parse SNR data from {snr_url}: {exc}")
                return

        unfrozen_parameters = _generate_unfrozen_params_yaml(
--- a/src/axolotl/kernels/geglu.py
+++ b/src/axolotl/kernels/geglu.py
@@ -1,4 +1,5 @@
-"""Module for definition of GEGLU Triton kernels.
+"""
+Module for definition of GEGLU Triton kernels.

 See "GLU Variants Improve Transformer" (https://arxiv.org/abs/2002.05202).

@@ -11,6 +12,8 @@ import torch
 import triton
 import triton.language as tl

+SQRT_2_PI: tl.constexpr = 0.7978845608028654  # sqrt(2/π)
+

@triton.jit
 def _geglu_fwd_kernel(
--- a/src/axolotl/kernels/lora.py
+++ b/src/axolotl/kernels/lora.py
@@ -280,19 +280,19 @@ class LoRA_MLP(torch.autograd.Function):
        # Initialize and compute LoRA gradients
        d_down_A = d_down_B = d_up_A = d_up_B = d_gate_A = d_gate_B = None

-        if down_A is not None and down_B is not None:
+        if down_A is not None:
            d_down_A = h.t() @ (grad_output @ down_B.t())
            d_down_B = (down_A.t() @ h.t()) @ grad_output
            d_down_A *= down_scale
            d_down_B *= down_scale

-        if up_A is not None and up_B is not None:
+        if up_A is not None:
            d_up_A = X.t() @ (grad_up @ up_B.t())
            d_up_B = (up_A.t() @ X.t()) @ grad_up
            d_up_A *= up_scale
            d_up_B *= up_scale

-        if gate_A is not None and gate_B is not None:
+        if gate_A is not None:
            d_gate_A = X.t() @ (grad_gate @ gate_B.t())
            d_gate_B = (gate_A.t() @ X.t()) @ grad_gate
            d_gate_A *= gate_scale
@@ -311,7 +311,7 @@ class LoRA_MLP(torch.autograd.Function):
            del up_weight

            # Note the .to(dtype) only where mixing LoRA with base weights
-            if up_A is not None and up_B is not None:
+            if up_A is not None:
                dX += grad_up @ up_B.to(dtype).t() @ (up_scale * up_A.to(dtype).t())

            # Gate projection gradients
@@ -319,7 +319,7 @@ class LoRA_MLP(torch.autograd.Function):
            dX += grad_gate @ gate_weight.t()
            del gate_weight

-            if gate_A is not None and gate_B is not None:
+            if gate_A is not None:
                dX += (
                    grad_gate
                    @ gate_B.to(dtype).t()
--- a/src/axolotl/loaders/init.py
+++ b/src/axolotl/loaders/init.py
@@ -1,10 +0,0 @@
-"""Init for axolotl.loaders module"""
-
-# pylint: disable=unused-import
-# flake8: noqa
-
-from .adapter import load_adapter, load_lora
-from .constants import MULTIMODAL_AUTO_MODEL_MAPPING
-from .model import ModelLoader
-from .processor import load_processor
-from .tokenizer import load_tokenizer
--- a/src/axolotl/loaders/adapter.py
+++ b/src/axolotl/loaders/adapter.py
@@ -1,206 +0,0 @@
-"""Adapter loading functionality, including LoRA / QLoRA and associated utils"""
-
-import os
-import types
-from typing import Any
-
-import bitsandbytes as bnb
-import torch
-from bitsandbytes.nn import Params4bit
-from peft import (
-    AdaptionPromptConfig,
-    LoftQConfig,
-    LoraConfig,
-    PeftConfig,
-    PeftMixedModel,
-    PeftModel,
-    get_peft_model,
-)
-from transformers import PreTrainedModel
-
-from axolotl.loaders.utils import get_linear_embedding_layers
-from axolotl.utils.dict import DictDefault
-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
-
-
-def setup_quantized_meta_for_peft(model: torch.nn.Module):
-    """Replaces `quant_state.to` with a dummy function to prevent PEFT from moving `quant_state` to meta device"""
-
-    def temp_to_method(self, *args, **kwargs):  # pylint: disable=unused-argument
-        return self
-
-    for param in model.parameters():
-        if isinstance(param, Params4bit):
-            param.quant_state._orig_to = (  # pylint: disable=protected-access
-                param.quant_state.to
-            )
-            param.quant_state.to = types.MethodType(temp_to_method, param.quant_state)
-
-
-def setup_quantized_peft_meta_for_training(model: torch.nn.Module):
-    """Replaces dummy `quant_state.to` method with the original function to allow training to continue"""
-    for param in model.parameters():
-        if isinstance(param, Params4bit) and hasattr(param.quant_state, "_orig_to"):
-            param.quant_state.to = (
-                param.quant_state._orig_to  # pylint: disable=protected-access
-            )
-            param.quant_state._orig_to = None  # pylint: disable=protected-access
-
-
-def find_all_linear_names(model):
-    cls = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt, torch.nn.Linear)
-    lora_module_names = set()
-    for name, module in model.named_modules():
-        if (
-            isinstance(module, cls)
-            or "Linear" in module.__class__.__name__
-            and module.__class__.__name__ not in ("LlamaLinearScalingRotaryEmbedding",)
-        ):
-            names = name.split(".")
-            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
-
-    embedding_modules = get_linear_embedding_layers(model.config.model_type)
-    output_embedding = embedding_modules[1]
-    if output_embedding in lora_module_names:  # needed for 16-bit
-        lora_module_names.remove(output_embedding)
-
-    return list(lora_module_names)
-
-
-def load_lora(
-    model: PreTrainedModel,
-    cfg: DictDefault,
-    inference: bool = False,
-    config_only: bool = False,
-) -> tuple[PreTrainedModel | PeftModel | PeftMixedModel | None, PeftConfig | None]:
-    lora_target_modules = cfg.lora_target_modules or []
-
-    if cfg.lora_target_linear:
-        linear_names = find_all_linear_names(model)
-        LOG.info(f"found linear modules: {repr(sorted(linear_names))}")
-        lora_target_modules_as_list = (
-            lora_target_modules
-            if isinstance(lora_target_modules, list)
-            else [lora_target_modules]
-        )
-        lora_target_modules = list(set(lora_target_modules_as_list + linear_names))
-
-    lora_config_kwargs = {}
-    loftq_bits = cfg.peft and cfg.peft.loftq_config and cfg.peft.loftq_config.loftq_bits
-    if loftq_bits:
-        lora_config_kwargs["loftq_config"] = LoftQConfig(loftq_bits=loftq_bits)
-        lora_config_kwargs["init_lora_weights"] = "loftq"
-    if cfg.peft_init_lora_weights:
-        lora_config_kwargs["init_lora_weights"] = cfg.peft_init_lora_weights
-    if cfg.peft_use_dora:
-        lora_config_kwargs["use_dora"] = cfg.peft_use_dora
-        LOG.info("Initializing LoRA weights using dora. This might take longer.")
-    if cfg.peft_use_rslora:
-        lora_config_kwargs["use_rslora"] = cfg.peft_use_rslora
-    if cfg.peft_layer_replication:
-        lora_config_kwargs["layer_replication"] = cfg.peft_layer_replication
-
-    lora_config = LoraConfig(
-        r=cfg.lora_r,
-        lora_alpha=cfg.lora_alpha,
-        target_modules=lora_target_modules,
-        layers_to_transform=cfg.peft_layers_to_transform,
-        layers_pattern=cfg.peft_layers_pattern,
-        lora_dropout=cfg.lora_dropout,
-        fan_in_fan_out=cfg.lora_fan_in_fan_out,
-        modules_to_save=cfg.lora_modules_to_save if cfg.lora_modules_to_save else None,
-        bias="none",
-        task_type="CAUSAL_LM",
-        **lora_config_kwargs,
-    )
-
-    if config_only:
-        return None, lora_config
-
-    rank = int(os.environ.get("LOCAL_RANK", 0))
-
-    if (
-        cfg.fsdp
-        and cfg.adapter
-        and cfg.fsdp_config.fsdp_cpu_ram_efficient_loading
-        and rank != 0
-    ):
-        setup_quantized_meta_for_peft(model)
-
-    if cfg.lora_model_dir:
-        LOG.debug("Loading pretrained PEFT - LoRA")
-        model_kwargs: Any = {}
-        if cfg.lora_on_cpu:
-            model_kwargs["max_memory"] = {"cpu": "256GiB"}
-            model_kwargs["device_map"] = {"": "cpu"}
-        model = PeftModel.from_pretrained(
-            model,
-            cfg.lora_model_dir,
-            is_trainable=(not inference),
-            **model_kwargs,
-        )
-    else:
-        model = get_peft_model(model, lora_config)
-
-    if rank == 0:
-        try:
-            model.print_trainable_parameters()
-        except AttributeError as exc:
-            LOG.warning(
-                "Exception caught during model.print_trainable_parameters(): %s", exc
-            )
-    elif (
-        cfg.fsdp
-        and cfg.adapter
-        and cfg.fsdp_config.fsdp_cpu_ram_efficient_loading
-        and rank != 0
-    ):
-        setup_quantized_peft_meta_for_training(model)
-
-    return model, lora_config
-
-
-def load_adapter(
-    model: PreTrainedModel,
-    cfg: DictDefault,
-    adapter: str | None,
-    inference: bool = False,
-) -> tuple[PreTrainedModel | PeftModel | PeftMixedModel, PeftConfig | None]:
-    if adapter is None:
-        return model, None
-    if hasattr(model, "enable_input_require_grads"):
-        model.enable_input_require_grads()
-    if adapter in ["lora", "qlora"]:
-        peft_model, lora_config = load_lora(model, cfg, inference=inference)
-        return peft_model, lora_config
-    if adapter == "llama-adapter":
-        peft_model, lora_config = load_llama_adapter(model, cfg)
-        return peft_model, lora_config
-
-    raise NotImplementedError(f"{adapter} PEFT adapter not available")
-
-
-def load_llama_adapter(
-    model: PreTrainedModel, cfg: DictDefault
-) -> tuple[PeftModel | PeftMixedModel, PeftConfig]:
-    peft_config = AdaptionPromptConfig(
-        adapter_layers=cfg.peft_adapter.layers,  # layers (L)
-        adapter_len=cfg.peft_adapter.len,  # prompt length (K)
-        task_type="CAUSAL_LM",
-    )
-
-    if cfg.lora_model_dir:
-        LOG.debug("Loading pretrained PEFT - llama_adapter")
-        peft_model = PeftModel.from_pretrained(
-            model,
-            cfg.lora_model_dir,
-            torch_dtype=torch.float16,
-        )
-    else:
-        peft_model = get_peft_model(model, peft_config)
-
-    peft_model.print_trainable_parameters()
-
-    return peft_model, peft_config
--- a/src/axolotl/loaders/constants.py
+++ b/src/axolotl/loaders/constants.py
@@ -1,21 +0,0 @@
-"""Shared constants for axolotl.loaders module"""
-
-from transformers import (
-    Gemma3ForConditionalGeneration,
-    Llama4ForConditionalGeneration,
-    LlavaForConditionalGeneration,
-    Mistral3ForConditionalGeneration,
-    MllamaForConditionalGeneration,
-    Qwen2_5_VLForConditionalGeneration,
-    Qwen2VLForConditionalGeneration,
-)
-
-MULTIMODAL_AUTO_MODEL_MAPPING = {
-    "mllama": MllamaForConditionalGeneration,
-    "llama4": Llama4ForConditionalGeneration,
-    "llava": LlavaForConditionalGeneration,
-    "qwen2_vl": Qwen2VLForConditionalGeneration,
-    "qwen2_5_vl": Qwen2_5_VLForConditionalGeneration,
-    "mistral3": Mistral3ForConditionalGeneration,
-    "gemma3": Gemma3ForConditionalGeneration,
-}
--- a/src/axolotl/loaders/model.py
+++ b/src/axolotl/loaders/model.py
@@ -1,783 +0,0 @@
-"""Model loader class implementation for loading, configuring, and patching various
-models.
-"""
-
-import gc
-import math
-import os
-from functools import cached_property
-from importlib.util import find_spec
-from typing import Any
-
-import peft
-import torch
-import transformers
-import transformers.modeling_utils
-from accelerate import init_empty_weights
-from peft import (
-    PeftConfig,
-    PeftMixedModel,
-    PeftModel,
-    PeftModelForCausalLM,
-    prepare_model_for_kbit_training,
-)
-from transformers import (
-    AutoModelForCausalLM,
-    AutoModelForVision2Seq,
-    AwqConfig,
-    BitsAndBytesConfig,
-    GPTQConfig,
-    PreTrainedModel,
-    PreTrainedTokenizerBase,
-)
-from transformers.integrations.deepspeed import (
-    HfTrainerDeepSpeedConfig,
-    is_deepspeed_zero3_enabled,
-)
-
-from axolotl.common.architectures import MOE_ARCH_BLOCK
-from axolotl.integrations.base import PluginManager
-from axolotl.loaders.adapter import load_adapter, load_lora
-from axolotl.loaders.constants import MULTIMODAL_AUTO_MODEL_MAPPING
-from axolotl.loaders.patch_manager import PatchManager
-from axolotl.loaders.utils import (
-    get_linear_embedding_layers,
-    get_module_class_from_name,
-    load_model_config,
-)
-from axolotl.models.mamba import fix_mamba_attn_for_loss
-from axolotl.utils.bench import log_gpu_memory_usage
-from axolotl.utils.dict import DictDefault
-from axolotl.utils.distributed import (
-    get_device_count,
-    get_device_type,
-)
-from axolotl.utils.logging import get_logger
-from axolotl.utils.model_shard_quant import load_sharded_model_quant
-from axolotl.utils.schemas.enums import RLType
-
-LOG = get_logger(__name__)
-PLUGIN_MANAGER = PluginManager.get_instance()
-
-
-class ModelLoader:
-    """Manages model configuration, initialization and application of patches during
-    model loading.
-
-    This class orchestrates the entire process of loading a model from configuration to
-    final preparation. It handles device mapping, quantization, attention mechanisms,
-    adapter integration, and various optimizations.
-
-    The loading process includes:
-        - Loading and validating model configuration
-        - Applying monkey patches for optimizations / fixes
-        - Setting up device mapping (including multi-GPU configurations)
-        - Configuring quantization
-        - Setting attention mechanisms (Flash Attention, SDPA, etc.)
-        - Loading and initializing the model
-        - Applying adapters (LoRA, QLoRA, etc.)
-
-    Attributes:
-        model: The loaded model instance (available after load() is called).
-        model_kwargs: Dictionary of keyword arguments passed to model initialization.
-        base_model: Name or path of the base model to load.
-        model_type: Type of model to load (e.g., `AutoModelForCausalLM`).
-        model_config: Configuration object for the model.
-        auto_model_loader: class used for loading the model (default:
-            `AutoModelForCausalLM`).
-    """
-
-    def __init__(
-        self,
-        cfg: DictDefault,
-        tokenizer: PreTrainedTokenizerBase,
-        *,
-        inference: bool = False,
-        reference_model: bool = False,
-        **kwargs,  # pylint: disable=unused-argument
-    ):
-        """Initializes the ModelLoader.
-
-        Args:
-            cfg: Configuration dictionary with model and training settings.
-            tokenizer: Tokenizer instance associated with the model.
-            processor: Optional processor for multimodal models. Defaults to None.
-            inference: Whether the model is being loaded for inference mode. Defaults
-                to False.
-            reference_model: Whether this is a reference model (used in setups like DPO
-                training). Defaults to False.
-            **kwargs: Additional keyword arguments (ignored).
-        """
-        self.cfg = cfg
-        self.tokenizer = tokenizer
-        self.inference: bool = inference
-        self.reference_model: bool = reference_model
-
-        # Init model kwargs
-        self.model_kwargs: dict[str, Any] = {}
-        if cfg.overrides_of_model_kwargs:
-            for key, val in cfg.overrides_of_model_kwargs.items():
-                self.model_kwargs[key] = val
-
-        # Init model
-        self.model: PreTrainedModel | PeftModel | PeftMixedModel
-        self.base_model = cfg.base_model
-        self.model_type = cfg.type_of_model
-
-        # Init model config
-        self.model_config = load_model_config(cfg)
-        self.auto_model_loader = AutoModelForCausalLM  # pylint: disable=invalid-name
-
-        # Initialize the patch manager
-        self.patch_manager = PatchManager(
-            cfg=cfg,
-            model_config=self.model_config,
-            inference=inference,
-        )
-
-    @cached_property
-    def has_flash_attn(self) -> bool:
-        """Check if flash attention is installed."""
-        return find_spec("flash_attn") is not None
-
-    @cached_property
-    def qlora_fsdp(self):
-        """Property that determines if FSDP with QLoRA is enabled."""
-        return self.cfg.fsdp and self.cfg.adapter == "qlora"
-
-    def load(self) -> tuple[PreTrainedModel | PeftModelForCausalLM, PeftConfig | None]:
-        """Load and prepare the model with all configurations and patches.
-
-        Returns:
-            A tuple with the loaded model and its LoRA configuration (if applicable).
-        """
-        # Initial setup and patches
-        self.patch_manager.apply_pre_model_load_patches()
-        self._apply_pre_model_load_setup()
-
-        # Build the model
-        PLUGIN_MANAGER.pre_model_load(self.cfg)
-        skip_move_to_device = self._build_model()
-        PLUGIN_MANAGER.post_model_build(self.cfg, self.model)
-
-        # Post-build model configuration
-        self._apply_post_model_load_setup()
-
-        # Load adapters (LoRA, etc.)
-        PLUGIN_MANAGER.pre_lora_load(self.cfg, self.model)
-        lora_config = self._load_adapters()
-        PLUGIN_MANAGER.post_lora_load(self.cfg, self.model)
-
-        # Apply remaining patches and finalize
-        self._apply_post_lora_load_setup(skip_move_to_device)
-        self.patch_manager.apply_post_model_load_patches(self.model)
-        PLUGIN_MANAGER.post_model_load(self.cfg, self.model)
-
-        return self.model, lora_config
-
-    def _apply_pre_model_load_setup(self):
-        """Apply patches and setup configurations before model loading."""
-        self._set_auto_model_loader()
-        self._set_device_map_config()
-        if self.cfg.revision_of_model:
-            self.model_kwargs["revision"] = self.cfg.revision_of_model
-        self._set_quantization_config()
-        self._set_attention_config()
-
-    def _apply_post_model_load_setup(self):
-        """Configure the model after it has been loaded."""
-        # Handle PeftModel if needed
-        if (
-            isinstance(self.model, (peft.PeftModel, peft.PeftModelForCausalLM))
-            and not self.qlora_fsdp
-        ):
-            self.model = self.model.merge_and_unload()
-
-        self._resize_token_embeddings()
-        self._adjust_model_config()
-        self._log_memory_usage()
-        self._configure_embedding_dtypes()
-        self._configure_qat()
-
-    def _resize_token_embeddings(self):
-        """Resize token embeddings if needed."""
-        embeddings_len = (
-            math.ceil(len(self.tokenizer) / 32) * 32
-            if self.cfg.resize_token_embeddings_to_32x
-            else len(self.tokenizer)
-        )
-        if hasattr(self.model, "get_input_embeddings") and (
-            self.model.get_input_embeddings().num_embeddings < embeddings_len
-            or (
-                self.model.get_input_embeddings().num_embeddings > embeddings_len
-                and self.cfg.shrink_embeddings
-            )
-        ):
-            resize_kwargs = {}
-            if self.cfg.mean_resizing_embeddings is not None and (
-                self.model_config.model_type != "llava"
-            ):
-                resize_kwargs["mean_resizing"] = self.cfg.mean_resizing_embeddings
-            self.model.resize_token_embeddings(embeddings_len, **resize_kwargs)
-        else:
-            self.model.tie_weights()
-
-    def _adjust_model_config(self):
-        if (
-            hasattr(self.model, "config")
-            and hasattr(self.model.config, "max_position_embeddings")
-            and self.model.config.max_position_embeddings
-            and self.cfg.sequence_len > self.model.config.max_position_embeddings
-        ):
-            LOG.warning(
-                "increasing model.config.max_position_embeddings from "
-                f"{self.model.config.max_position_embeddings} to {self.cfg.sequence_len}"
-            )
-            self.model.config.max_position_embeddings = self.cfg.sequence_len
-
-        if (
-            hasattr(self.model, "config")
-            and hasattr(self.model.config, "bos_token_id")
-            and self.model.config.bos_token_id
-            and self.model.config.bos_token_id != self.tokenizer.bos_token_id
-        ):
-            self.model.config.bos_token_id = self.tokenizer.bos_token_id
-
-        if (
-            hasattr(self.model, "config")
-            and hasattr(self.model.config, "eos_token_id")
-            and self.model.config.eos_token_id
-            and self.model.config.eos_token_id != self.tokenizer.eos_token_id
-        ):
-            self.model.config.eos_token_id = self.tokenizer.eos_token_id
-
-    def _log_memory_usage(self):
-        """Log device memory usage after model load."""
-        if hasattr(self.model, "device") and self.model.device.type in (
-            "cuda",
-            "mps",
-            "npu",
-        ):
-            log_gpu_memory_usage(LOG, "after model load", self.model.device)
-
-    def _configure_embedding_dtypes(self):
-        """Configure embedding module dtypes."""
-        # Get embedding modules
-        embedding_modules = get_linear_embedding_layers(self.cfg.model_config_type)
-
-        # Initial dtype conversion
-        if not self.cfg.fsdp:
-            # We don't run this during FSDP because this will leave mixed and bfloat16
-            # dtypes in the model which FSDP doesn't like
-            if self.cfg.load_in_4bit and self.cfg.embeddings_skip_upcast:
-                embedding_modules = []
-            self._convert_embedding_modules_dtype(
-                embedding_modules,
-                dist_dtype=torch.float32,
-                before_kbit_train_or_finetune=True,
-            )
-
-        # Handle DeepSpeed Zero3
-        if is_deepspeed_zero3_enabled():
-            self._set_z3_leaf_modules()
-
-        # Apply gradient checkpointing if needed
-        needs_fa2_dtype = self.cfg.adapter or self.cfg.fsdp
-        if self.cfg.adapter in ["lora", "qlora"]:
-            needs_fa2_dtype = True
-            if self.cfg.gradient_checkpointing:
-                self.model.gradient_checkpointing_enable(
-                    gradient_checkpointing_kwargs=self.cfg.gradient_checkpointing_kwargs
-                )
-
-        self._prepare_model_for_quantization()
-
-        # Convert dtypes if needed
-        should_convert = (
-            # LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so
-            # we need to convert them back to fp16/bf16 for flash-attn compatibility.
-            (
-                (needs_fa2_dtype or self.cfg.flash_attention or self.cfg.flex_attention)
-                and not self.qlora_fsdp
-            )
-            # CCE requires embedding layers to be in fp16/bf16 for backward pass
-            or self.cfg.cut_cross_entropy
-        )
-
-        if should_convert:
-            LOG.info("Converting modules to %s", self.cfg.torch_dtype)
-            self._convert_embedding_modules_dtype(
-                embedding_modules=embedding_modules,
-                dist_dtype=self.cfg.torch_dtype,
-                before_kbit_train_or_finetune=False,
-            )
-
-    def _configure_qat(self):
-        """Configure QAT."""
-        if self.cfg.qat:
-            from axolotl.utils.quantization import prepare_model_for_qat
-
-            prepare_model_for_qat(
-                self.model,
-                self.cfg.qat.weight_dtype,
-                self.cfg.qat.group_size,
-                self.cfg.qat.activation_dtype,
-                self.cfg.qat.quantize_embedding,
-            )
-
-    def _load_adapters(self) -> PeftConfig | None:
-        """Load LoRA or other adapters."""
-        # Load LoRA or adapter
-        lora_config = None
-        if not self.reference_model or self.cfg.lora_model_dir:
-            # If we're not loading the reference model, then we're loading the model
-            # for training. Then, the DPO trainer doesn't want the PEFT model loaded
-            # over it, it just wants the LoRA / PEFT config.
-            if (
-                self.cfg.adapter
-                and self.cfg.rl in [RLType.DPO, RLType.IPO, RLType.KTO]
-                and not self.cfg.merge_lora
-            ):
-                _, lora_config = load_lora(
-                    self.model, self.cfg, inference=False, config_only=True
-                )
-            else:
-                self.model, lora_config = load_adapter(
-                    self.model, self.cfg, self.cfg.adapter
-                )
-
-        return lora_config
-
-    def _apply_post_lora_load_setup(self, skip_move_to_device: bool):
-        """Apply final optimizations and patches."""
-        # Place model on accelerator
-        if (
-            self.cfg.ddp
-            and not self.cfg.load_in_8bit
-            and not (self.cfg.rl and self.cfg.load_in_4bit)
-            and not skip_move_to_device
-        ):
-            # TODO: validate this conditional
-            self.model.to(f"{str(get_device_type())}:{self.cfg.local_rank}")
-
-        if get_device_count() > 1 and int(os.getenv("WORLD_SIZE", "1")) == 1:
-            self.model.is_parallelizable = True
-            self.model.model_parallel = True
-
-        if not any(
-            param.requires_grad
-            for _, param in self.model.named_parameters(recurse=True)
-        ):
-            LOG.warning("There are no parameters that require gradient updates")
-
-        if self.cfg.flash_optimum:
-            from optimum.bettertransformer import BetterTransformer
-
-            self.model = BetterTransformer.transform(self.model)
-
-        if self.cfg.adapter is not None:
-            log_gpu_memory_usage(LOG, "after adapters", self.model.device)
-
-        for _ in range(3):
-            gc.collect()
-            torch.cuda.empty_cache()
-
-    def _set_auto_model_loader(self):
-        """Set `self.auto_model_loader`. Defaults to `transformers.AutoModelForCausalLM`
-        (set at `__init__`). When using a multimodal model, `self.auto_model_loader`
-        should be set according to the type of the model.
-        """
-        if self.cfg.is_multimodal:
-            self.auto_model_loader = MULTIMODAL_AUTO_MODEL_MAPPING.get(
-                self.model_config.model_type, AutoModelForVision2Seq
-            )
-
-    def _set_device_map_config(self):
-        """Setup `device_map` according to config"""
-        device_map = self.cfg.device_map
-        max_memory = self.cfg.max_memory
-
-        if self.cfg.gpu_memory_limit:
-            gpu_memory_limit = (
-                str(self.cfg.gpu_memory_limit) + "GiB"
-                if isinstance(self.cfg.gpu_memory_limit, int)
-                else self.cfg.gpu_memory_limit
-            )
-
-            max_memory = {}
-            num_device = get_device_count()
-            for i in range(num_device):
-                max_memory[i] = gpu_memory_limit
-            max_memory["cpu"] = "256GiB"  # something sufficiently large to fit anything
-
-        if max_memory is not None:
-            # Based on https://github.com/togethercomputer/OpenChatKit/blob/main/inference/bot.py
-            from accelerate import infer_auto_device_map
-
-            with init_empty_weights():
-                model_canvas = self.auto_model_loader.from_config(
-                    self.model_config,
-                    trust_remote_code=self.cfg.trust_remote_code or False,
-                )
-            model_canvas.tie_weights()
-            device_map = infer_auto_device_map(
-                model_canvas,
-                max_memory=max_memory,
-                dtype=self.cfg.torch_dtype,
-            )
-            # We can discard max_memory now as we have a device map set up
-            max_memory = None
-
-        self.model_kwargs["torch_dtype"] = self.cfg.torch_dtype
-
-        if not is_deepspeed_zero3_enabled():
-            self.model_kwargs["device_map"] = device_map
-
-            cur_device = get_device_type()
-            if "mps" in str(cur_device):
-                self.model_kwargs["device_map"] = "mps:0"
-            elif "npu" in str(cur_device):
-                self.model_kwargs["device_map"] = "npu:0"
-
-        # TODO: can we put the reference model on it's own gpu? I think we have to move
-        # logits around to calculate loss
-        # if cfg.rl:
-        #     if torch.cuda.device_count() > 1:
-        #         if reference_model:
-        #             model_kwargs["device_map"] = "cuda:" + str(
-        #                 torch.cuda.current_device() + 1
-        #             )
-        #         else:
-        #             model_kwargs["device_map"] = "cuda:" + str(torch.cuda.current_device())
-
-    def _set_quantization_config(self):
-        """Set up quantization config (bitsandbytes, awq, gptq, etc.)"""
-        self.model_kwargs["load_in_8bit"] = self.cfg.load_in_8bit
-        self.model_kwargs["load_in_4bit"] = self.cfg.load_in_4bit
-
-        if self.cfg.gptq:
-            if not hasattr(self.model_config, "quantization_config"):
-                LOG.warning(
-                    "model config does not contain quantization_config information"
-                )
-            else:
-                if self.cfg.gptq_disable_exllama is not None:
-                    self.model_config.quantization_config["disable_exllama"] = (
-                        self.cfg.gptq_disable_exllama
-                    )
-                self.model_kwargs["quantization_config"] = GPTQConfig(
-                    **self.model_config.quantization_config
-                )
-        if (
-            self.cfg.adapter in ["qlora", "lora"]
-            and hasattr(self.model_config, "quantization_config")
-            and self.model_config.quantization_config["quant_method"]
-            in ["gptq", "awq", "bitsandbytes"]
-        ):
-            if self.model_config.quantization_config["quant_method"] == "gptq":
-                self.model_kwargs["quantization_config"] = GPTQConfig(
-                    **self.model_config.quantization_config
-                )
-            elif self.model_config.quantization_config["quant_method"] == "awq":
-                self.model_kwargs["quantization_config"] = AwqConfig(
-                    **self.model_config.quantization_config
-                )
-            elif (
-                self.model_config.quantization_config["quant_method"] == "bitsandbytes"
-            ):
-                self.model_kwargs["quantization_config"] = BitsAndBytesConfig(
-                    **self.model_config.quantization_config
-                )
-        elif self.cfg.adapter == "qlora" and self.model_kwargs["load_in_4bit"]:
-            bnb_config = {
-                "load_in_4bit": True,
-                "llm_int8_threshold": 6.0,
-                "llm_int8_has_fp16_weight": False,
-                "bnb_4bit_compute_dtype": self.cfg.torch_dtype,
-                "bnb_4bit_use_double_quant": True,
-                "bnb_4bit_quant_type": "nf4",
-                "bnb_4bit_quant_storage": torch.bfloat16,
-            }
-            if self.cfg.model_config_type in ["jamba", "qwen2_moe"] and not (
-                self.cfg.deepspeed or self.cfg.fsdp
-            ):
-                # for some reason, this causes the loss to be off by an order of magnitude
-                # but deepspeed needs this still in bfloat16
-                bnb_config["bnb_4bit_quant_storage"] = torch.float32
-
-            if self.cfg.bnb_config_kwargs:
-                bnb_config.update(self.cfg.bnb_config_kwargs)
-
-            self.model_kwargs["quantization_config"] = BitsAndBytesConfig(
-                **bnb_config,
-            )
-        elif self.cfg.adapter == "lora" and self.model_kwargs["load_in_8bit"]:
-            bnb_config = {
-                "load_in_8bit": True,
-            }
-            # Exclude mamba blocks from int8 quantization for jamba
-            if self.cfg.model_config_type == "jamba":
-                bnb_config["llm_int8_skip_modules"] = ["mamba"]
-            self.model_kwargs["quantization_config"] = BitsAndBytesConfig(
-                **bnb_config,
-            )
-
-        # no longer needed per https://github.com/huggingface/transformers/pull/26610
-        if "quantization_config" in self.model_kwargs or self.cfg.gptq:
-            self.model_kwargs.pop("load_in_8bit", None)
-            self.model_kwargs.pop("load_in_4bit", None)
-
-    def _set_attention_config(self):
-        """Sample packing uses custom FA2 patch"""
-        if self.cfg.flex_attention:
-            self.model_kwargs["attn_implementation"] = "flex_attention"
-            self.model_config._attn_implementation = (  # pylint: disable=protected-access
-                "flex_attention"
-            )
-
-        elif self.cfg.flash_attention:
-            if not self.cfg.sample_packing and self.cfg.s2_attention:
-                pass
-            self.model_kwargs["attn_implementation"] = "flash_attention_2"
-            self.model_config._attn_implementation = (  # pylint: disable=protected-access
-                "flash_attention_2"
-            )
-        elif self.cfg.sdp_attention:
-            self.model_kwargs["attn_implementation"] = "sdpa"
-            self.model_config._attn_implementation = (  # pylint: disable=protected-access
-                "sdpa"
-            )
-        elif self.cfg.eager_attention:
-            self.model_kwargs["attn_implementation"] = "eager"
-            self.model_config._attn_implementation = (  # pylint: disable=protected-access
-                "eager"
-            )
-
-        if self.cfg.low_cpu_mem_usage:
-            self.model_kwargs["low_cpu_mem_usage"] = True
-
-    def _configure_zero3_memory_efficient_loading(
-        self,
-    ) -> HfTrainerDeepSpeedConfig | None:
-        """
-        Set the deepspeed config to load the model into RAM first before moving to VRAM.
-
-        IMPORTANT
-        ==========
-
-        We need to return `hf_ds_cfg` as it needs to exist before model loading for zero3.
-        HfTrainerDeepSpeedConfig is a class that is used to configure the DeepSpeed training.
-        It is not passed anywhere in the model loading function, just need to exist.
-        """
-        hf_ds_cfg = None
-
-        if os.getenv("ACCELERATE_DEEPSPEED_ZERO_STAGE") == "3":
-            hf_ds_cfg = HfTrainerDeepSpeedConfig(self.cfg.deepspeed)
-            hf_ds_cfg.fill_match(
-                "train_micro_batch_size_per_gpu", self.cfg.micro_batch_size
-            )
-            hf_ds_cfg.fill_match(
-                "gradient_accumulation_steps", self.cfg.gradient_accumulation_steps
-            )
-            hf_ds_cfg.fill_match(
-                "train_batch_size",
-                int(os.getenv("WORLD_SIZE", "1"))
-                * self.cfg.micro_batch_size
-                * self.cfg.gradient_accumulation_steps,
-            )
-            if "device_map" in self.model_kwargs:
-                del self.model_kwargs["device_map"]
-
-            transformers.modeling_utils.is_deepspeed_zero3_enabled = lambda: True
-            transformers.integrations.deepspeed.is_deepspeed_zero3_enabled = (
-                lambda: True
-            )
-
-        return hf_ds_cfg
-
-    def _build_model(self) -> bool:
-        """Load model, with load strategy depending on config."""
-        skip_move_to_device = False
-        if (
-            self.qlora_fsdp
-            and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading
-            and (
-                self.cfg.model_config_type == "dbrx"
-                or self.cfg.qlora_sharded_model_loading
-            )
-        ):
-            quant_storage = self.cfg.torch_dtype
-            quantization_config = getattr(
-                self.model_config, "quantization_config", None
-            )
-            quantization_config = (
-                quantization_config or self.model_kwargs["quantization_config"]
-            )
-            self.model = load_sharded_model_quant(
-                self.base_model,
-                self.model_config,
-                self.cfg,
-                quant_storage=quant_storage,
-                quantization_config=quantization_config,
-            )
-            skip_move_to_device = True
-        elif (
-            self.model_config.model_type in ["llama", "llama4"]
-            and not self.cfg.trust_remote_code
-            and not self.cfg.gptq
-        ):
-            # TODO: Do we need to open this up for all models?
-            if self.cfg.fsdp and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading:
-                skip_move_to_device = True
-                if "device_map" in self.model_kwargs:
-                    del self.model_kwargs["device_map"]
-
-            # Please don't remove underscore binding without reading the fn docstring.
-            _ = self._configure_zero3_memory_efficient_loading()
-
-            # Load model with random initialization if specified
-            if self.cfg.random_init_weights:
-                # AutoModel classes support the from_config method
-                if self.auto_model_loader in [
-                    AutoModelForCausalLM,
-                    AutoModelForVision2Seq,
-                ]:
-                    self.model = self.auto_model_loader.from_config(
-                        config=self.model_config,
-                    )
-                else:
-                    self.model = self.auto_model_loader(config=self.model_config)
-            else:
-                self.model = self.auto_model_loader.from_pretrained(
-                    self.base_model,
-                    config=self.model_config,
-                    **self.model_kwargs,
-                )
-        elif self.model_type == "MambaLMHeadModel":
-            # FIXME this is janky at best and hacked together to make it work
-            MambaLMHeadModel = fix_mamba_attn_for_loss()  # pylint: disable=invalid-name
-
-            self.model_kwargs["dtype"] = self.model_kwargs["torch_dtype"]
-            self.model_kwargs["device"] = torch.cuda.current_device()
-            self.model_kwargs.pop("torch_dtype", None)
-            self.model_kwargs.pop("device_map", None)
-
-            self.model = MambaLMHeadModel.from_pretrained(
-                self.base_model,
-                **self.model_kwargs,
-            )
-        elif (
-            self.model_type
-            and self.model_type != "AutoModelForCausalLM"
-            and not self.cfg.trust_remote_code
-        ):
-            if self.cfg.gptq:
-                self.model = self.auto_model_loader.from_pretrained(
-                    self.base_model,
-                    config=self.model_config,
-                    trust_remote_code=self.cfg.trust_remote_code or False,
-                    **self.model_kwargs,
-                )
-            else:
-                self.model = getattr(transformers, self.model_type).from_pretrained(
-                    self.base_model,
-                    config=self.model_config,
-                    trust_remote_code=self.cfg.trust_remote_code or False,
-                    **self.model_kwargs,
-                )
-        else:
-            if self.cfg.gptq:
-                self.model = self.auto_model_loader.from_pretrained(
-                    self.base_model,
-                    config=self.model_config,
-                    trust_remote_code=self.cfg.trust_remote_code or False,
-                    **self.model_kwargs,
-                )
-            else:
-                if (
-                    self.cfg.fsdp
-                    and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading
-                ):
-                    # disabling either of these two still leads to VRAM spike before setting back down
-                    skip_move_to_device = True
-                    if "device_map" in self.model_kwargs:
-                        del self.model_kwargs["device_map"]
-
-                # Please don't remove underscore binding without reading the fn docstring.
-                _ = self._configure_zero3_memory_efficient_loading()
-
-                self.model = self.auto_model_loader.from_pretrained(
-                    self.base_model,
-                    config=self.model_config,
-                    trust_remote_code=self.cfg.trust_remote_code or False,
-                    **self.model_kwargs,
-                )
-        if is_deepspeed_zero3_enabled():
-            skip_move_to_device = True
-
-        return skip_move_to_device
-
-    def _set_z3_leaf_modules(self):
-        from deepspeed.utils import set_z3_leaf_modules
-
-        if self.cfg.model_config_type in MOE_ARCH_BLOCK:
-            moe_blocks = MOE_ARCH_BLOCK[self.cfg.model_config_type]
-            moe_blocks = [moe_blocks] if isinstance(moe_blocks, str) else moe_blocks
-            set_z3_leaf_modules(
-                self.model,
-                [
-                    get_module_class_from_name(self.model, module_name)
-                    for module_name in moe_blocks
-                ],
-            )
-
-    def _prepare_model_for_quantization(self):
-        """Prepare loaded model for quantization."""
-        skip_prepare_model_for_kbit_training = False
-        if self.cfg.model_config_type == "qwen" and self.cfg.adapter == "lora":
-            # Qwen doesn't play nicely with LoRA if this is enabled
-            skip_prepare_model_for_kbit_training = True
-
-        loftq_bits = (
-            self.cfg.peft
-            and self.cfg.peft.loftq_config
-            and self.cfg.peft.loftq_config.loftq_bits
-        )
-        if self.cfg.adapter == "lora" and loftq_bits:
-            skip_prepare_model_for_kbit_training = True
-
-        if (
-            self.qlora_fsdp
-            or (self.cfg.fsdp and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading)
-            or is_deepspeed_zero3_enabled()
-        ):
-            # Make sure everything is in the same dtype
-            skip_prepare_model_for_kbit_training = True
-
-        if (
-            not skip_prepare_model_for_kbit_training
-            and self.cfg.adapter in ["lora", "qlora"]
-            and (self.cfg.load_in_8bit or self.cfg.load_in_4bit)
-        ):
-            LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
-            self.model = prepare_model_for_kbit_training(
-                self.model, use_gradient_checkpointing=self.cfg.gradient_checkpointing
-            )
-
-    def _convert_embedding_modules_dtype(
-        self,
-        embedding_modules: list[str],
-        dist_dtype: torch.dtype,
-        before_kbit_train_or_finetune: bool,
-    ):
-        for name, module in self.model.named_modules():
-            if "norm" in name:
-                module.to(dist_dtype)
-            if before_kbit_train_or_finetune:
-                if name.endswith(".gate"):
-                    module.to(dist_dtype)
-                if self.model_config.model_type == "btlm":
-                    # don't upcast lm_head for btlm
-                    continue
-            if any(m in name for m in embedding_modules) and hasattr(module, "weight"):
-                module.to(dist_dtype)
--- a/src/axolotl/loaders/patch_manager.py
+++ b/src/axolotl/loaders/patch_manager.py
@@ -1,367 +0,0 @@
-"""Patch manager class implementation to complement `axolotl.loaders.ModelLoader`.
-
-Applies pre- and post-model load patches for various fixes and optimizations.
-"""
-
-import importlib.util
-from functools import cached_property
-
-import addict
-import transformers
-from transformers import PretrainedConfig, PreTrainedModel
-
-from axolotl.integrations.base import PluginManager
-from axolotl.monkeypatch.multipack import (
-    SUPPORTED_MULTIPACK_MODEL_TYPES,
-    patch_for_multipack,
-)
-from axolotl.utils.dict import DictDefault
-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
-PLUGIN_MANAGER = PluginManager.get_instance()
-
-
-class PatchManager:
-    """Manages the application of patches during the model loading process."""
-
-    def __init__(
-        self,
-        cfg: DictDefault,
-        model_config: PretrainedConfig | addict.Dict,
-        inference: bool = False,
-    ):
-        """Initialize the `PatchManager`.
-
-        Args:
-            cfg: Configuration dictionary with model and training settings.
-            model_config: Configuration object for the model.
-            inference: Whether the model is being loaded for inference mode.
-        """
-        self.cfg = cfg
-        self.model_config = model_config
-        self.inference = inference
-
-    @cached_property
-    def has_flash_attn(self) -> bool:
-        """Check if flash attention is installed."""
-        return importlib.util.find_spec("flash_attn") is not None
-
-    def apply_pre_model_load_patches(self):
-        """Apply pre-model load patches based on config."""
-        self._apply_flash_attention_patches()
-        self._apply_fsdp_patches()
-        self._apply_adapter_patches()
-        self._apply_flex_attention_patches()
-        self._apply_model_specific_patches()
-        self._apply_fp8_patches()
-        self._apply_flash_attention_peft_patches()
-        self._apply_gradient_checkpointing_patches()
-        self._patch_attention()
-        self._apply_multipack_patches()
-        self._patch_loss_llama()
-        self._patch_llama_derived_model()
-        self._apply_mistral_cross_entropy_patch()
-        self._apply_self_attention_lora_patch()
-
-    def apply_post_model_load_patches(self, model: PreTrainedModel):
-        """Apply patches that require the model instance."""
-        self._apply_llama_flash_attn_patches(model)
-        self._apply_unsloth_patches(model)
-        self._apply_lora_kernel_patch(model)
-
-    def _apply_flash_attention_patches(self):
-        """Apply patches related to Flash Attention."""
-        if self.cfg.xformers_attention and self.cfg.sample_packing:
-            from axolotl.monkeypatch.attention import patch_xformers_attn_over_fa2
-
-            patch_xformers_attn_over_fa2()
-            self.cfg.flash_attention = True
-
-    def _apply_fsdp_patches(self):
-        """Apply patches for FSDP configurations."""
-        if self.cfg.fsdp_config and str(self.cfg.fsdp_config.fsdp_version) == "2":
-            from axolotl.monkeypatch.accelerate.fsdp2 import patch_accelerate_fsdp2
-
-            patch_accelerate_fsdp2()
-
-    def _apply_adapter_patches(self):
-        """Apply patches for adapter configurations."""
-        if self.cfg.adapter and self.cfg.embeddings_skip_upcast:
-            from axolotl.monkeypatch.peft.utils import patch_peft_prep_code
-
-            patch_peft_prep_code()
-
-    def _apply_flex_attention_patches(self):
-        """Apply patches for flexible attention."""
-        if self.cfg.flex_attention:
-            from axolotl.monkeypatch.attention.flex_attn import (
-                patch_flex_make_mask,
-                patch_flex_wrapper,
-            )
-
-            flex_attn_compile_kwargs = self.cfg.flex_attn_compile_kwargs or {}
-            patch_flex_wrapper(**flex_attn_compile_kwargs)
-            patch_flex_make_mask()
-
-    def _apply_model_specific_patches(self):
-        """Apply patches specific to model architectures."""
-        if (
-            self.cfg.model_config_type == "llama4"
-            and self.cfg.llama4_linearized_experts
-        ):
-            from axolotl.monkeypatch.models.llama4.modeling import (
-                patch_llama4_linearized_modeling,
-            )
-
-            patch_llama4_linearized_modeling()
-
-    def _apply_fp8_patches(self):
-        """Apply patches for FP8 support."""
-        if self.cfg.fp8:
-            from axolotl.monkeypatch.trainer_accelerator_args import (
-                patch_create_accelerate_code_for_fp8,
-            )
-
-            patch_create_accelerate_code_for_fp8()
-
-    def _apply_flash_attention_peft_patches(self):
-        """Apply patches for Flash Attention with PEFT."""
-        if self.cfg.adapter:
-            from axolotl.monkeypatch.transformers_fa_utils import (
-                patch_fa_peft_integration,
-            )
-
-            patch_fa_peft_integration()
-
-    def _apply_gradient_checkpointing_patches(self):
-        """Apply patches for gradient checkpointing."""
-        if self.cfg.gradient_checkpointing in ["unsloth", "offload"]:
-            from axolotl.monkeypatch.gradient_checkpointing import (
-                hf_grad_checkpoint_offload_wrapper,
-            )
-
-            transformers.modeling_utils.checkpoint = hf_grad_checkpoint_offload_wrapper
-        if self.cfg.gradient_checkpointing == "offload_disk":
-            from axolotl.monkeypatch.gradient_checkpointing import (
-                hf_grad_checkpoint_disk_offload_wrapper,
-            )
-
-            transformers.modeling_utils.checkpoint = (
-                hf_grad_checkpoint_disk_offload_wrapper
-            )
-
-    def _apply_mistral_cross_entropy_patch(self):
-        """Apply Mistral cross entropy patch if configured."""
-        if (
-            self.cfg.model_config_type == "mistral"
-            and self.cfg.flash_attn_cross_entropy_loss
-        ):
-            from axolotl.monkeypatch.mistral_attn_hijack_flash import (
-                patch_mistral_cross_entropy,
-            )
-
-            patch_mistral_cross_entropy()
-
-    def _apply_self_attention_lora_patch(self):
-        """Apply self-attention LoRA patches if configured."""
-        if self.cfg.lora_qkv_kernel or self.cfg.lora_o_kernel:
-            from axolotl.monkeypatch.lora_kernels import patch_self_attn_lora
-
-            patch_self_attn_lora(self.cfg)
-
-    def _apply_multipack_patches(self):
-        """Apply multipack patches if necessary."""
-        if (
-            self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES
-            and (self.cfg.flash_attention or self.cfg.flex_attention)
-            and self.cfg.sample_packing
-        ):
-            # Get automap config if it exists
-            auto_map_config = None
-            if isinstance(self.model_config, dict) and "auto_map" in self.model_config:
-                auto_map_config = self.model_config["auto_map"]
-            elif hasattr(self.model_config, "auto_map"):
-                auto_map_config = self.model_config.auto_map
-
-            # Determine if the model has remote code
-            if auto_map_config is not None:
-                has_remote_code = "AutoModelForCausalLM" in auto_map_config
-            else:
-                has_remote_code = False
-
-            if has_remote_code and self.cfg.trust_remote_code is False:
-                # If explicitly set in YAML, prefer that
-                has_remote_code = self.cfg.trust_remote_code
-
-            patch_for_multipack(
-                self.cfg.model_config_type,
-                model_name=self.cfg.base_model,
-                has_remote_code=has_remote_code,
-            )
-
-    def _patch_attention(self):
-        """Apply attention-specific patches based on model type."""
-        if not (self.cfg.flash_attention and hasattr(self.model_config, "model_type")):
-            return
-
-        if self.model_config.model_type == "btlm":
-            from axolotl.monkeypatch.btlm_attn_hijack_flash import (
-                replace_btlm_attn_with_flash_attn,
-            )
-
-            replace_btlm_attn_with_flash_attn(self.cfg.base_model)
-
-        if self.model_config.model_type == "stablelm_epoch" and self.cfg.sample_packing:
-            from axolotl.monkeypatch.stablelm_attn_hijack_flash import (
-                replace_stablelm_attn_with_flash_attn,
-            )
-
-            replace_stablelm_attn_with_flash_attn(self.cfg.base_model)
-
-    def _patch_loss_llama(self):
-        """Patch loss functions and other optimizations for LLaMA models."""
-        if not self.cfg.is_llama_derived_model:
-            return
-
-        if self.cfg.flash_attn_cross_entropy and self.has_flash_attn:
-            from axolotl.monkeypatch.llama_attn_hijack_flash import (
-                patch_fa_llama_cross_entropy,
-            )
-
-            patch_fa_llama_cross_entropy()
-        elif self.cfg.unsloth_cross_entropy_loss:
-            from axolotl.monkeypatch.unsloth_ import integrate_cross_entropy_loss_patch
-
-            integrate_cross_entropy_loss_patch(model_type="llama")
-
-        if self.cfg.flash_attn_rms_norm and self.has_flash_attn:
-            from axolotl.monkeypatch.llama_attn_hijack_flash import patch_llama_rms_norm
-
-            patch_llama_rms_norm()
-        elif self.cfg.unsloth_rms_norm:
-            from axolotl.monkeypatch.unsloth_ import patch_unsloth_layernorm
-
-            patch_unsloth_layernorm()
-
-        if self.cfg.unsloth_lora_qkv or self.cfg.unsloth_lora_o:
-            from axolotl.monkeypatch.unsloth_ import patch_self_attn_lora
-
-            patch_self_attn_lora()
-
-    def _patch_llama_flash_attention(self, packed=False):
-        """Apply Flash Attention patches for LLaMA models."""
-        from axolotl.monkeypatch.llama_attn_hijack_flash import (
-            replace_llama_attn_with_flash_attn,
-        )
-
-        if packed:
-            if self.cfg.device not in ["mps", "cpu"] and not self.inference:
-                LOG.info("patching with flash attention for sample packing")
-                replace_llama_attn_with_flash_attn(
-                    packed=True,
-                    cross_entropy=self.cfg.flash_attn_cross_entropy,
-                    rms_norm=self.cfg.flash_attn_rms_norm,
-                )
-        elif self.cfg.s2_attention:
-            LOG.info("patching w/ flash-enabled, shifted-sparse attention")
-            replace_llama_attn_with_flash_attn(
-                packed=False,
-                cross_entropy=self.cfg.flash_attn_cross_entropy,
-                rms_norm=self.cfg.flash_attn_rms_norm,
-                use_shifted_sparse_attn=True,
-            )
-        elif self.cfg.flash_attn_cross_entropy or self.cfg.flash_attn_rms_norm:
-            replace_llama_attn_with_flash_attn(
-                packed=False,
-                cross_entropy=self.cfg.flash_attn_cross_entropy,
-                rms_norm=self.cfg.flash_attn_rms_norm,
-            )
-
-    def _patch_llama_xformers_attention(self):
-        """Apply xformers attention patches for LLaMA models."""
-        from axolotl.monkeypatch.llama_attn_hijack_xformers import (
-            hijack_llama_attention,
-        )
-
-        LOG.info("Patching with xformers attention...")
-        hijack_llama_attention()
-
-    def _patch_llama_sample_packing(self):
-        """Apply sample packing patches for LLaMA models."""
-        from axolotl.monkeypatch.llama_patch_multipack import (
-            hijack_llama_prepare_4d_mask,
-        )
-
-        LOG.info("Patching llama _prepare_4d_causal_attention_mask*...")
-        hijack_llama_prepare_4d_mask()
-
-    def _patch_llama_derived_model(self):
-        """Modify all llama derived models in one block."""
-        if self.cfg.is_llama_derived_model and not (
-            self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES
-            and (self.cfg.flash_attention or self.cfg.flex_attention)
-            and self.cfg.sample_packing
-        ):
-            if self.cfg.flash_attention:
-                self._patch_llama_flash_attention(packed=self.cfg.sample_packing)
-            elif self.cfg.xformers_attention:
-                self._patch_llama_xformers_attention()
-            elif self.cfg.sample_packing:
-                self._patch_llama_sample_packing()
-            elif self.cfg.s2_attention:
-                raise NotImplementedError(
-                    "Shifted-sparse attention not currently implemented without flash attention."
-                )
-
-    def _apply_llama_flash_attn_patches(self, model):
-        """Apply LLaMA-specific flash attention patches."""
-        if (
-            self.model_config.model_type in ["llama", "llama4"]
-            and not self.cfg.trust_remote_code
-            and not self.cfg.gptq
-            and self.cfg.flash_attention
-            and not self.inference
-        ):
-            # TODO(MengqingCao): split these patches seperately
-            from axolotl.monkeypatch.llama_attn_hijack_flash import (
-                is_xformers_swiglu_available,
-                replace_llama_mlp_with_swiglu,
-                replace_llama_qkv_with_fused,
-            )
-
-            if self.cfg.flash_attn_fuse_mlp and is_xformers_swiglu_available():
-                LOG.info("Patching with SwiGLU...")
-                replace_llama_mlp_with_swiglu(model)
-
-            if self.cfg.flash_attn_fuse_qkv:
-                LOG.info("Patching with fused QKV...")
-                replace_llama_qkv_with_fused(model)
-
-    def _apply_unsloth_patches(self, model):
-        """Apply unsloth optimization patches."""
-        if self.cfg.unsloth_lora_mlp:
-            from axolotl.monkeypatch.unsloth_ import integrate_lora_mlp_patch
-
-            integrate_lora_mlp_patch(peft_model=model)
-
-        if self.cfg.unsloth_lora_qkv or self.cfg.unsloth_lora_o:
-            from axolotl.monkeypatch.unsloth_ import integrate_lora_patch
-
-            integrate_lora_patch(peft_model=model, cfg=self.cfg)
-
-        if self.cfg.unsloth_rope:
-            from axolotl.monkeypatch.unsloth_ import integrate_rope_embeddings
-
-            integrate_rope_embeddings()
-
-    def _apply_lora_kernel_patch(self, model):
-        """Apply LoRA kernel patches."""
-        if (
-            self.cfg.lora_mlp_kernel
-            or self.cfg.lora_qkv_kernel
-            or self.cfg.lora_o_kernel
-        ):
-            from axolotl.monkeypatch.lora_kernels import apply_lora_kernel_patches
-
-            apply_lora_kernel_patches(model=model, cfg=self.cfg)
--- a/src/axolotl/loaders/processor.py
+++ b/src/axolotl/loaders/processor.py
@@ -1,56 +0,0 @@
-"""Processor loading functionality for multi-modal models"""
-
-from typing import Any
-
-import transformers
-from transformers import (
-    AutoProcessor,
-    PreTrainedTokenizerBase,
-)
-
-from axolotl.utils.dict import DictDefault
-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
-
-
-def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase):
-    processor_kwargs: dict[str, Any] = {}  # Do we actually need this?
-
-    processor_cls = AutoProcessor
-    if cfg.processor_type:
-        processor_cls = getattr(transformers, cfg.processor_type)
-
-    processor = processor_cls.from_pretrained(
-        cfg.processor_config,
-        trust_remote_code=cfg.trust_remote_code or False,
-        tokenizer=tokenizer,
-        **processor_kwargs,
-    )
-
-    # Attempt to load image size from processor if available
-    if (
-        cfg.image_size is None
-        and hasattr(processor, "size")
-        and any(dim in processor.size for dim in ["width", "height"])
-    ):
-        im_width = None
-        im_height = None
-        if "width" in processor.size:
-            im_width = processor.size["width"]
-        if "height" in processor.size:
-            im_height = processor.size["height"]
-
-        # If both width and height are set, use a tuple
-        if im_width is not None and im_height is not None:
-            cfg.image_size = (im_width, im_height)
-        # If only width is set, use as integer
-        elif im_width is not None:
-            cfg.image_size = im_width
-        # If only height is set, use as integer
-        elif im_height is not None:
-            cfg.image_size = im_height
-
-        LOG.debug(f"Loaded image size: {cfg.image_size} from processor")
-
-    return processor
--- a/src/axolotl/loaders/tokenizer.py
+++ b/src/axolotl/loaders/tokenizer.py
@@ -1,281 +0,0 @@
-"""Tokenizer loading functionality and associated utils"""
-
-import json
-import os
-
-import transformers
-from transformers import (
-    AddedToken,
-    AutoTokenizer,
-)
-
-from axolotl.integrations.base import PluginManager
-from axolotl.loaders.utils import get_linear_embedding_layers, load_model_config
-from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
-from axolotl.utils.chat_templates import get_chat_template_from_config
-from axolotl.utils.distributed import (
-    barrier,
-    is_local_main_process,
-    is_main_process,
-)
-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
-PLUGIN_MANAGER = PluginManager.get_instance()
-
-
-def modify_tokenizer_files(
-    tokenizer_path: str, token_mappings: dict[int, str], output_dir: str
-) -> str:
-    """
-    Modify tokenizer files to replace added_tokens strings, save to output directory,
-    and return the path to the modified tokenizer.
-
-    This only works with reserved tokens that were added to the tokenizer, not tokens
-    already part of the vocab.
-
-    Args:
-        tokenizer_path: Path or name of the original tokenizer
-        token_mappings: Dict mapping {token_id (int): new_token_string}
-        output_dir: Directory to save the modified tokenizer
-
-    Returns:
-        Path to the modified tokenizer directory
-
-    Ref: https://github.com/huggingface/transformers/issues/27974#issuecomment-1854188941
-    """
-    # Create the tokenizer directory in output_dir if it doesn't exist
-    tokenizer_dir = os.path.join(output_dir, "tokenizer")
-    os.makedirs(tokenizer_dir, exist_ok=True)
-
-    if is_local_main_process():  # pylint: disable=too-many-nested-blocks
-        # Load the tokenizer
-        temp_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
-
-        # Save the tokenizer to the output directory
-        temp_tokenizer.save_pretrained(tokenizer_dir)
-
-        # Get the token IDs and map them to their new values
-        token_id_mappings = {
-            int(token_id): new_value for token_id, new_value in token_mappings.items()
-        }
-
-        # 1. Update tokenizer_config.json - added_tokens_decoder
-        config_path = os.path.join(tokenizer_dir, "tokenizer_config.json")
-        if os.path.exists(config_path):
-            with open(config_path, "r", encoding="utf-8") as f:
-                config_data = json.load(f)
-
-            # Update added_tokens_decoder
-            if "added_tokens_decoder" in config_data:
-                for token_id, new_value in token_id_mappings.items():
-                    token_id_str = str(token_id)
-                    if token_id_str in config_data["added_tokens_decoder"]:
-                        config_data["added_tokens_decoder"][token_id_str][
-                            "content"
-                        ] = new_value
-                    else:
-                        raise ValueError(
-                            f"Token ID {token_id_str} not found in added_tokens_decoder"
-                        )
-
-            # Write the updated config back
-            with open(config_path, "w", encoding="utf-8") as f:
-                json.dump(config_data, f, indent=2)
-
-        # 2. Update tokenizer.json - added_tokens
-        tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json")
-        if os.path.exists(tokenizer_path):
-            with open(tokenizer_path, "r", encoding="utf-8") as f:
-                tokenizer_data = json.load(f)
-
-            # Update added_tokens
-            if "added_tokens" in tokenizer_data:
-                for token_id, new_value in token_id_mappings.items():
-                    for i, token_entry in enumerate(tokenizer_data["added_tokens"]):
-                        if token_entry["id"] == token_id:
-                            tokenizer_data["added_tokens"][i]["content"] = new_value
-                            break
-                    else:
-                        # Reaching this section means the token_id was not found in tokenizer.json added_tokens
-                        raise ValueError(
-                            f"Token ID {token_id} not found in added_tokens"
-                        )
-            if "model" in tokenizer_data and "vocab" in tokenizer_data["model"]:
-                for token_id, new_value in token_id_mappings.items():
-                    for entry_val, entry_id in tokenizer_data["model"]["vocab"].items():
-                        if entry_id == token_id:
-                            del tokenizer_data["model"]["vocab"][entry_val]
-                            tokenizer_data["model"]["vocab"][new_value] = token_id
-                            break
-
-            # Write the updated tokenizer data back
-            with open(tokenizer_path, "w", encoding="utf-8") as f:
-                json.dump(tokenizer_data, f, indent=2)
-
-    barrier()
-    return tokenizer_dir
-
-
-def load_tokenizer(cfg):
-    """Load and configure the tokenizer based on the provided config."""
-    model_config = load_model_config(cfg)
-    tokenizer_kwargs = {}
-    use_fast = True  # this is the default
-
-    if cfg.tokenizer_use_fast is not None:
-        use_fast = cfg.tokenizer_use_fast
-    if cfg.tokenizer_legacy is not None:
-        # True is the default w/ https://github.com/huggingface/transformers/pull/25224
-        tokenizer_kwargs["legacy"] = cfg.tokenizer_legacy
-
-    tokenizer_cls = AutoTokenizer
-    if cfg.tokenizer_type:
-        tokenizer_cls = getattr(transformers, cfg.tokenizer_type)
-
-    # Set base tokenizer path
-    tokenizer_path = cfg.tokenizer_config
-
-    # Apply token string overrides if specified
-    if cfg.added_tokens_overrides:
-        # Modify tokenizer files and get path to modified tokenizer
-        tokenizer_path = modify_tokenizer_files(
-            tokenizer_path, cfg.added_tokens_overrides, output_dir=cfg.output_dir
-        )
-
-    tokenizer = tokenizer_cls.from_pretrained(
-        tokenizer_path,
-        trust_remote_code=cfg.trust_remote_code or False,
-        use_fast=use_fast,
-        **tokenizer_kwargs,
-    )
-
-    if (
-        tokenizer.__class__.__name__
-        in [
-            "LlamaTokenizer",
-            "LlamaTokenizerFast",
-            "CodeLlamaTokenizer",
-            "CodeLlamaTokenizerFast",
-        ]
-        and hasattr(tokenizer, "pad_token")
-        and not tokenizer.pad_token
-    ):
-        # set a pad_token, but use eos_token so we don't add a new token
-        tokenizer.pad_token = LLAMA_DEFAULT_EOS_TOKEN
-
-    if tokenizer.__class__.__name__ == "GPTNeoXTokenizerFast":
-        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
-        os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
-    # Mistral's official FA implementation requires left padding
-    if cfg.is_mistral_derived_model and cfg.flash_attention and not cfg.sample_packing:
-        tokenizer.padding_side = "left"
-
-    # Qwen base only has single token, so we need to set the special tokens
-    if cfg.is_qwen_derived_model:
-        token_ids = ["bos_token_id", "eos_token_id", "pad_token_id", "unk_token_id"]
-        for attr_name in token_ids:
-            if getattr(tokenizer, attr_name) is None:
-                setattr(tokenizer, attr_name, tokenizer.eod_id)
-
-        token_names = ["bos_token", "eos_token", "pad_token", "unk_token"]
-        for attr_name in token_names:
-            if getattr(tokenizer, attr_name) is None:
-                setattr(tokenizer, attr_name, "<|endoftext|>")
-
-    additional_special_tokens = None
-    if cfg.special_tokens:
-        special_tokens = cfg.special_tokens.to_dict()
-        additional_special_tokens = special_tokens.pop(
-            "additional_special_tokens", None
-        )
-        lora_modules_to_save = get_linear_embedding_layers(model_config.model_type)
-        for k, val in special_tokens.items():
-            # check if new special token is not already in tokenizer and
-            # is adapter training to make sure lora_modules_to_save is set
-            # pylint: disable=too-many-boolean-expressions
-            if (
-                (getattr(tokenizer, k) is None or getattr(tokenizer, k) != val)
-                and (len(tokenizer.encode(val, add_special_tokens=False)) > 2)
-                and cfg.adapter
-                and (
-                    not cfg.lora_modules_to_save
-                    or not all(
-                        x in cfg.lora_modules_to_save for x in lora_modules_to_save
-                    )
-                )
-                and k != "pad_token"
-            ):
-                lora_modules_to_save = ", ".join(
-                    [f"`{x}`" for x in lora_modules_to_save]
-                )
-                raise ValueError(
-                    f"Please set lora_modules_to_save to [{lora_modules_to_save}] when using an adapter and changing the special tokens."
-                )
-
-            tokenizer.add_special_tokens(
-                {k: AddedToken(val, rstrip=False, lstrip=False, normalized=False)}
-            )
-
-        # If we add bos_token and eos_token, we need to update the post processor to
-        # handle them correctly.
-        # https://github.com/huggingface/transformers/pull/24132
-        bos_or_eos_in_special_tokens = (
-            "bos_token" in cfg.special_tokens and "eos_token" in cfg.special_tokens
-        )
-        if (
-            tokenizer.__class__.__name__
-            in (
-                "LlamaTokenizerFast",
-                "CodeLlamaTokenizerFast",
-            )
-            and bos_or_eos_in_special_tokens
-        ):
-            tokenizer.update_post_processor()
-
-    if cfg.tokens:
-        tokenizer.add_tokens(
-            [
-                AddedToken(token, rstrip=False, lstrip=False, normalized=False)
-                for token in cfg.tokens
-            ]
-        )
-
-    # Additional special tokens are a List, and need to be treated differently than regular special
-    # tokens. We add them after we have called `add_tokens` in case these additional special tokens
-    # are new tokens.
-    #
-    # Usage:
-    #
-    # ```py
-    # special_tokens:
-    #   additional_special_tokens: ["<|im_start|>", "<|im_end|>"]
-    # ```
-    if additional_special_tokens is not None:
-        tokenizer.add_special_tokens(
-            {"additional_special_tokens": additional_special_tokens}
-        )
-
-    if is_main_process(use_environ=True):
-        LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
-        LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
-        LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
-        LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
-
-    if cfg.chat_template:
-        chat_template_string = get_chat_template_from_config(
-            cfg=cfg,
-            tokenizer=tokenizer,
-        )
-        if cfg.default_system_message and cfg.chat_template == "chatml":
-            chat_template_string = chat_template_string.replace(
-                "You are a helpful assistant.", cfg.default_system_message
-            )
-
-        tokenizer.chat_template = chat_template_string
-    else:
-        LOG.info(
-            "No Chat template selected. Consider adding a chat template for easier inference."
-        )
-    return tokenizer
--- a/src/axolotl/loaders/utils.py
+++ b/src/axolotl/loaders/utils.py
@@ -1,211 +0,0 @@
-"""Utilities for axolotl.loaders module"""
-
-import contextlib
-from typing import Type
-
-import addict
-import torch
-from transformers import AutoConfig, PretrainedConfig, PreTrainedModel
-
-from axolotl.utils.dict import DictDefault
-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
-
-
-def get_module_class_from_name(
-    module: torch.nn.Module, name: str
-) -> Type[torch.nn.Module] | None:
-    """Gets a class from a module by its name. Copied from `accelerate.utils.dataclasses`
-    (https://github.com/huggingface/accelerate/blob/main/src/accelerate/utils/dataclasses.py#L2805).
-
-    Args:
-        module: The module to get the class from.
-        name: The name of the class.
-
-    Returns:
-        The class type of the matching module, or `None` if no match is found.
-    """
-    modules_children = list(module.children())
-    if module.__class__.__name__ == name:
-        return module.__class__
-
-    if len(modules_children) == 0:
-        return None
-
-    for child_module in modules_children:
-        module_class = get_module_class_from_name(child_module, name)
-        if module_class is not None:
-            return module_class
-
-    return None
-
-
-def check_model_config(cfg: DictDefault, model_config: PretrainedConfig):
-    """Validates and adjusts model config based on `axolotl` config.
-
-    This function performs several important checks and adjustments:
-        - Disables model caching for better memory efficiency
-        - Handles multimodal model-specific configurations
-        - Validates quantization settings
-        - Ensures proper LoRA configuration when using adapters with new tokens
-
-    Args:
-        cfg: Dictionary mapping `axolotl` config keys to values.
-        model_config: The model's configuration object from `transformers`.
-
-    Raises:
-        ValueError: If a multimodal model lacks text configuration, if GPTQ settings
-            are inconsistent, or if LoRA `modules_to_save` is improperly configured
-            with new tokens.
-    """
-    if hasattr(model_config, "use_cache"):
-        model_config.use_cache = False
-
-    if cfg.is_multimodal:
-        # For multimodal configs, use_cache is set in the text_config
-        if hasattr(model_config, "get_text_config"):
-            text_config = model_config.get_text_config()
-            if hasattr(text_config, "use_cache"):
-                text_config.use_cache = False
-        else:
-            raise ValueError(
-                "No text config found for multimodal model. Please raise an Issue with model details."
-            )
-
-        # Check if image_size is not set and load image size from model config if available
-        if (
-            cfg.image_size is None
-            and hasattr(model_config, "vision_config")
-            and hasattr(model_config.vision_config, "image_size")
-        ):
-            cfg.image_size = model_config.vision_config.image_size
-            LOG.debug(f"Loaded image size: {cfg.image_size} from model config")
-
-    quant_config_exists = (
-        hasattr(model_config, "quantization_config")
-        and model_config.quantization_config
-    )
-
-    # Detect compressed-tensors config
-    is_compressed_tensors_config = (
-        quant_config_exists
-        and model_config.quantization_config.get("quant_method") == "compressed-tensors"
-    )
-
-    if is_compressed_tensors_config:
-        if model_config.quantization_config.get("config_groups"):
-            LOG.warning(
-                "Found `config_groups` in a compressed-tensors config. "
-                "QAT integration with llmcompressor is not tested."
-            )
-        # Skip further quant checks for compressed-tensors
-        return
-
-    quant_config_method_is_gptq = (
-        quant_config_exists
-        and "quant_method" in model_config.quantization_config
-        and model_config.quantization_config["quant_method"] == "gptq"
-    )
-
-    if cfg.gptq and not quant_config_method_is_gptq:
-        raise ValueError(
-            "model_config.quantization_config is not set or quant_method is not set to gptq. "
-            "Please make sure to point to a GPTQ model."
-        )
-
-    lora_modules_to_save = get_linear_embedding_layers(model_config.model_type)
-    if (
-        cfg.adapter
-        and cfg.tokens
-        and (
-            not cfg.lora_modules_to_save
-            or not all(x in cfg.lora_modules_to_save for x in lora_modules_to_save)
-        )
-    ):
-        lora_modules_to_save_joined = ", ".join(
-            map(lambda x: f"`{x}`", lora_modules_to_save)
-        )
-        raise ValueError(
-            "`lora_modules_to_save` not properly set when adding new tokens. "
-            f"Please include [{lora_modules_to_save_joined}] in `lora_modules_to_save`."
-        )
-
-
-def load_model_config(cfg: DictDefault) -> PretrainedConfig | addict.Dict:
-    """Loads and configures a model configuration from HuggingFace or local sources.
-
-    This function determines the appropriate model config source, loads it, applies any
-    necessary overrides, and validates it for compatibility with the `axolotl` config.
-
-    Args:
-        cfg: Dictionary mapping `axolotl` config keys to values.
-
-    Returns:
-        A configured model configuration object (`AutoConfig` instance), or a simple
-            dictionary configuration for special cases like Mamba models.
-
-    Raises:
-        ValueError: If configuration loading fails for reasons other than special cases
-            that are handled (e.g., Mamba models).
-    """
-    model_config_name = cfg.base_model_config or cfg.base_model
-    if not model_config_name and cfg.tokenizer_config:
-        model_config_name = cfg.tokenizer_config
-    trust_remote_code = cfg.trust_remote_code is True
-    config_kwargs = {}
-    if cfg.revision_of_model:
-        config_kwargs["revision"] = cfg.revision_of_model
-    if cfg.num_labels:
-        # num_labels is used to initialize classifier models
-        config_kwargs["num_labels"] = cfg.num_labels
-    try:
-        model_config = AutoConfig.from_pretrained(
-            model_config_name,
-            trust_remote_code=trust_remote_code,
-            **config_kwargs,
-        )
-    except ValueError as error:
-        if "mamba" in model_config_name:
-            return addict.Dict(
-                {
-                    "model_type": "mamba",
-                }
-            )
-        raise error
-
-    if cfg.overrides_of_model_config:
-        for key, val in cfg.overrides_of_model_config.items():
-            setattr(model_config, key, val)
-
-    check_model_config(cfg, model_config)
-
-    return model_config
-
-
-def ensure_dtype(model: PreTrainedModel, dtype: torch.dtype = torch.bfloat16):
-    """Ensures all modules in the model are converted to the specified data type."""
-    for name, module in model.named_modules():
-        weight_mismatch = False
-        with contextlib.suppress(AttributeError):
-            weight_mismatch = module.weight.dtype != dtype
-
-        bias_mismatch = False
-        with contextlib.suppress(AttributeError):
-            bias_mismatch = module.bias.dtype != dtype
-
-        if weight_mismatch:
-            print(f"Converting module {name}.weight: {module.weight.dtype} -> {dtype}")
-        if bias_mismatch:
-            print(f"Converting module {name}.bias: {module.bias.dtype} -> {dtype}")
-        if weight_mismatch or bias_mismatch:
-            module.to(dtype)
-
-
-def get_linear_embedding_layers(model_type: str) -> list[str]:
-    """Returns layer names of linear embeddings needed for LoRA based on model type."""
-    if model_type == "gpt_neox":
-        return ["embed_in", "embed_out"]
-    if model_type == "falcon":
-        return ["word_embeddings", "lm_head"]
-    return ["embed_tokens", "lm_head"]
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Wing Lian	9bdf4b1c23	improve handling and error if fa3 requested but not installeD	2025-05-19 10:11:14 -07:00
Wing Lian	d6f64a3684	handle args to drop dropout	2025-05-18 15:17:40 -07:00
Wing Lian	0735454782	move fa3 tests to multigpu since we only run those on hopper	2025-05-18 15:17:39 -07:00
Wing Lian	bb6464c4c6	use get_device_capability since CI setting in cfg is unreliable	2025-05-18 15:17:39 -07:00
Wing Lian	323a9cb153	handle return sig change for fa3	2025-05-18 15:17:39 -07:00
Wing Lian	b22150751f	check for fa first	2025-05-18 15:17:39 -07:00
Wing Lian	8c4bc59bfc	fa3 doesn't support dropout_p, fix unpatching	2025-05-18 15:17:39 -07:00
Wing Lian	a064f1c9b4	ci for fa3	2025-05-18 15:17:39 -07:00
Wing Lian	fb5ef6d445	use updated package name for fa3	2025-05-18 15:17:38 -07:00
Wing Lian	34b68ddaae	curl with apt instead of pip	2025-05-18 15:17:38 -07:00
Wing Lian	9a3d0c919b	make sure curl is installed	2025-05-18 15:17:38 -07:00
Wing Lian	bd34d0b861	install for hopper from pre-built wheel	2025-05-18 15:17:38 -07:00
Wing Lian	37220ab90a	install pybind11 for fa3 build	2025-05-18 15:17:38 -07:00
Wing Lian	e1b74d710b	update docker args to minimums used and use MAX_JOBS already set as arg	2025-05-18 15:17:38 -07:00
Wing Lian	79daf5b934	reduce max jobs for build of fa3	2025-05-18 15:17:38 -07:00
Wing Lian	ddd7c55576	build hopper w fa3 on torch 2.6	2025-05-18 15:17:37 -07:00
Wing Lian	65c6c98a76	whitespace fix in dockerfile	2025-05-18 15:17:37 -07:00
Wing Lian	4ef2e8293f	fix the bash in docker base	2025-05-18 15:17:37 -07:00
Wing Lian	c126d5cd04	fix suffix for tag	2025-05-18 15:17:37 -07:00
Wing Lian	9b0be4f15c	fix 12.8 image and add flash-attn v3 hopper base image	2025-05-18 15:17:37 -07:00