Compare commits

1 Commit

Author: Wing Lian
SHA1: 20d0427ac9
Message: update llama3 example base models to use nous
Date: 2024-07-15 17:19:00 -04:00
62 changed files with 576 additions and 3194 deletions

View File

@@ -12,24 +12,36 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: "121"
cuda_version: 12.1.1
cudnn_version: 8
- cuda: "118"
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.3.1
pytorch: 2.1.2
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "121"
cuda_version: 12.1.1
cudnn_version: 8
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.2
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "121"
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.1.2
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "121"
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.2
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "121"
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.3.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "121"
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.3.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "124"
cuda_version: 12.4.1
cudnn_version: ""
python_version: "3.11"
pytorch: 2.4.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -55,7 +67,6 @@ jobs:
labels: ${{ steps.metadata.outputs.labels }}
build-args: |
CUDA_VERSION=${{ matrix.cuda_version }}
CUDNN_VERSION=${{ matrix.cudnn_version }}
CUDA=${{ matrix.cuda }}
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch }}

View File

@@ -13,22 +13,28 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: 121
cuda_version: 12.1.1
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.3.1
axolotl_extras: mamba-ssm
- cuda: 121
cuda_version: 12.1.1
python_version: "3.11"
pytorch: 2.3.1
axolotl_extras: mamba-ssm
is_latest: true
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.0
pytorch: 2.1.2
axolotl_extras:
axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
- cuda: 121
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.3.1
axolotl_extras:
is_latest: true
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
@@ -59,7 +65,6 @@ jobs:
push: ${{ github.event_name != 'pull_request' }}
tags: |
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
labels: ${{ steps.metadata.outputs.labels }}
@@ -70,22 +75,27 @@ jobs:
strategy:
matrix:
include:
- cuda: 121
cuda_version: 12.1.1
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.3.1
pytorch: 2.1.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.1
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.3.1
axolotl_extras:
is_latest: true
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
@@ -124,7 +134,7 @@ jobs:
matrix:
include:
- cuda: 121
cuda_version: 12.1.1
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.3.1
axolotl_extras:

View File

@@ -1,44 +0,0 @@
name: docker-multigpu-tests-biweekly
on:
workflow_dispatch:
schedule:
- cron: '0 0 * * 1,4' # Runs at 00:00 UTC every monday & thursday
jobs:
test-axolotl-multigpu:
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
strategy:
fail-fast: false
matrix:
include:
- cuda: 121
cuda_version: 12.1.1
python_version: "3.11"
pytorch: 2.3.1
axolotl_extras:
num_gpus: 2
runs-on: [self-hosted, modal]
timeout-minutes: 120
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
- name: Install Modal
run: |
python -m pip install --upgrade pip
pip install modal==0.63.64 jinja2
- name: Update env vars
run: |
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
- name: Run tests job on Modal
run: |
modal run cicd.multigpu

View File

@@ -12,22 +12,28 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: 121
cuda_version: 12.1.1
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.3.1
pytorch: 2.1.2
axolotl_extras:
axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
- cuda: 121
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.1
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.3.1
axolotl_extras:
is_latest: true
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
@@ -69,22 +75,27 @@ jobs:
strategy:
matrix:
include:
- cuda: 121
cuda_version: 12.1.1
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.3.1
pytorch: 2.1.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.1
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.3.1
axolotl_extras:
is_latest: true
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout

View File

@@ -26,8 +26,6 @@ jobs:
python-version: "3.10"
cache: 'pip' # caching pip dependencies
- uses: pre-commit/action@v3.0.0
env:
SKIP: no-commit-to-branch
pytest:
name: PyTest
@@ -59,10 +57,6 @@ jobs:
run: |
pytest --ignore=tests/e2e/ tests/
- name: cleanup pip cache
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
docker-e2e-tests:
if: github.repository_owner == 'axolotl-ai-cloud'
# this job needs to be run on self-hosted GPU runners...
@@ -74,24 +68,27 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: 121
cuda_version: 12.1.1
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.3.1
pytorch: 2.1.2
axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
num_gpus: 1
axolotl_extras: mamba-ssm
- cuda: 121
cuda_version: 12.1.1
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.2
num_gpus: 1
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.2
num_gpus: 1
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.3.1
num_gpus: 1
axolotl_extras: mamba-ssm
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.0
num_gpus: 1
axolotl_extras:
steps:
- name: Checkout
uses: actions/checkout@v4
@@ -102,13 +99,12 @@ jobs:
- name: Install Modal
run: |
python -m pip install --upgrade pip
pip install modal==0.63.64 jinja2
pip install modal jinja2
- name: Update env vars
run: |
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
- name: Run tests job on Modal

View File

@@ -8,8 +8,6 @@ repos:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
- id: no-commit-to-branch
args: ['--branch', 'main']
- repo: https://github.com/psf/black
rev: 23.3.0
hooks:

View File

@@ -46,7 +46,6 @@ Features:
- [Multipack](./docs/multipack.qmd)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
- [RLHF & DPO](./docs/rlhf.qmd)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
- [Dataset Pre-Processing](./docs/dataset_preprocessing.qmd)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
- [Unsloth](./docs/unsloth.qmd)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
- [Common Errors](#common-errors-)
- [Tokenization Mismatch b/w Training & Inference](#tokenization-mismatch-bw-inference--training)
- [Debugging Axolotl](#debugging-axolotl)
@@ -334,7 +333,7 @@ For further and fine-grained use cases, please refer to the official [dstack doc
Axolotl supports a variety of dataset formats. It is recommended to use a JSONL. The schema of the JSONL depends upon the task and the prompt template you wish to use. Instead of a JSONL, you can also use a HuggingFace dataset with columns for each JSONL field.
See [the documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/) for more information on how to use different dataset formats.
See [these docs](https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/) for more information on how to use different dataset formats.
### Config

View File

@@ -36,7 +36,6 @@ website:
- docs/nccl.qmd
- docs/mac.qmd
- docs/multi-node.qmd
- docs/unsloth.qmd
- section: "Dataset Formats"
contents: docs/dataset-formats/*
- section: "Reference"

View File

@@ -24,9 +24,9 @@ RUN git fetch origin +$GITHUB_REF && \
# If AXOLOTL_EXTRAS is set, append it in brackets
RUN pip install causal_conv1d
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
pip install -e .[deepspeed,flash-attn,optimizers,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
pip install -e .[deepspeed,flash-attn,mamba-ssm,optimizers,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
else \
pip install -e .[deepspeed,flash-attn,optimizers] $AXOLOTL_ARGS; \
pip install -e .[deepspeed,flash-attn,mamba-ssm,optimizers] $AXOLOTL_ARGS; \
fi
# So we can test the Docker image

View File

@@ -2,5 +2,5 @@
set -e
pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
pytest -n1 --dist loadfile -v /workspace/axolotl/tests/e2e/patched/
pytest --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ /workspace/axolotl/tests/e2e/
pytest /workspace/axolotl/tests/e2e/patched/
pytest --ignore=tests/e2e/patched/ /workspace/axolotl/tests/e2e/

View File

@@ -1,77 +0,0 @@
"""
modal application to run axolotl gpu tests in Modal
"""
# pylint: disable=duplicate-code
import os
import pathlib
import tempfile
import jinja2
import modal
from jinja2 import select_autoescape
from modal import Image, Stub
cicd_path = pathlib.Path(__file__).parent.resolve()
template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
template_env = jinja2.Environment(
loader=template_loader, autoescape=select_autoescape()
)
df_template = template_env.get_template("Dockerfile.jinja")
df_args = {
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.3.1"),
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.3.1"),
"CUDA": os.environ.get("CUDA", "121"),
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
}
dockerfile_contents = df_template.render(**df_args)
temp_dir = tempfile.mkdtemp()
with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
f.write(dockerfile_contents)
cicd_image = (
Image.from_dockerfile(
pathlib.Path(temp_dir) / "Dockerfile",
force_build=True,
gpu="A10G",
)
.env(df_args)
.pip_install("fastapi==0.110.0", "pydantic==2.6.3")
)
stub = Stub("Axolotl CI/CD", secrets=[])
N_GPUS = int(os.environ.get("N_GPUS", 2))
GPU_CONFIG = modal.gpu.H100(count=N_GPUS)
def run_cmd(cmd: str, run_folder: str):
import subprocess # nosec
# Propagate errors from subprocess.
if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec
exit(exit_code) # pylint: disable=consider-using-sys-exit
@stub.function(
image=cicd_image,
gpu=GPU_CONFIG,
timeout=45 * 60,
cpu=8.0,
memory=131072 * N_GPUS,
)
def cicd_pytest():
run_cmd("./cicd/multigpu.sh", "/workspace/axolotl")
@stub.local_entrypoint()
def main():
cicd_pytest.remote()

View File

@@ -1,5 +0,0 @@
#!/bin/bash
set -e
# only run one test at a time so as not to OOM the GPU
pytest -n1 /workspace/axolotl/tests/e2e/multigpu/

View File

@@ -1,8 +1,6 @@
"""
modal application to run axolotl gpu tests in Modal
"""
# pylint: disable=duplicate-code
import os
import pathlib
import tempfile
@@ -23,9 +21,9 @@ df_template = template_env.get_template("Dockerfile.jinja")
df_args = {
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.3.1"),
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.3.1"),
"CUDA": os.environ.get("CUDA", "121"),
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.0.1"),
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.10-cu118-2.0.1"),
"CUDA": os.environ.get("CUDA", "118"),
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
}

View File

@@ -22,9 +22,9 @@ WORKDIR /workspace/axolotl
# If AXOLOTL_EXTRAS is set, append it in brackets
RUN pip install causal_conv1d
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
pip install -e .[deepspeed,flash-attn,optimizers,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
pip install -e .[deepspeed,flash-attn,mamba-ssm,optimizers,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
else \
pip install -e .[deepspeed,flash-attn,optimizers] $AXOLOTL_ARGS; \
pip install -e .[deepspeed,flash-attn,mamba-ssm,optimizers] $AXOLOTL_ARGS; \
fi
# So we can test the Docker image

View File

@@ -3,7 +3,7 @@ ARG CUDNN_VERSION="8"
ARG UBUNTU_VERSION="22.04"
ARG MAX_JOBS=4
FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION as base-builder
ENV PATH="/root/miniconda3/bin:${PATH}"

View File

@@ -3,6 +3,7 @@ FROM winglian/axolotl:$BASE_TAG
ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
ENV HF_HUB_ENABLE_HF_TRANSFER="1"

View File

@@ -3,6 +3,7 @@ FROM winglian/axolotl:$BASE_TAG
ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
ENV HF_HUB_ENABLE_HF_TRANSFER="1"

View File

@@ -54,14 +54,6 @@ conversations where `from` is `prompter` `assistant` instead of default sharegpt
{"conversations": [{"from": "...", "value": "..."}]}
```
## sharegpt.load_ultrachat
conversations where the turns field is 'messages', human is 'user' and gpt is 'assistant'.
```{.json filename="data.jsonl"}
{"messages": [{"user": "...", "assistant": "..."}]}
```
## sharegpt_jokes
creates a chat where bot is asked to tell a joke, then explain why the joke is funny

View File

@@ -205,7 +205,7 @@ ds = load_from_disk(f'last_run_prepared/{directory[0]}/')
hi there!. goodbye farewell</s>
```
We can check that the right tokens are ignored by comparing the labels
We can check that the right tokens are ingored by comparing the labels
to each token:
```python

View File

@@ -1,19 +0,0 @@
---
title: "PyTorch ao"
description: "Custom data types and layouts for training and inference"
---
### Installation
Stable Release from the PyTorch index
```bash
pip install torchao --extra-index-url https://download.pytorch.org/whl/cu121 # full options are cpu/cu118/cu121/cu124
```
Nightly release
```bash
pip install --pre torchao-nightly --index-url https://download.pytorch.org/whl/nightly/cu121 # full options are cpu/cu118/cu121/cu124
```

View File

@@ -1,49 +0,0 @@
---
title: "Unsloth"
description: "Hyper-optimized QLoRA finetuning for single GPUs"
---
### Overview
Unsloth provides hand-written optimized kernels for LLM finetuning that slightly improve speed and VRAM over
standard industry baselines.
### Installation
The following will install unsloth from source and downgrade xformers as unsloth is incompatible with the most up
to date libraries.
```bash
pip install --no-deps "unsloth @ git+https://github.com/unslothai/unsloth.git"
pip install --no-deps --force-reinstall xformers==0.0.26.post1
```
### Using unsloth w Axolotl
Axolotl exposes a few configuration options to try out unsloth and get most of the performance gains.
Our unsloth integration is currently limited to the following model architectures:
- llama
These options are specific to LoRA finetuning and cannot be used for multi-GPU finetuning
```yaml
unsloth_lora_mlp: true
unsloth_lora_qkv: true
unsloth_lora_o: true
```
These options are composable and can be used with multi-gpu finetuning
```
unsloth_cross_entropy_loss: true
unsloth_rms_norm: true
unsloth_rope: true
```
### Limitations
- Single GPU only; e.g. no multi-gpu support
- No deepspeed or FSDP support (requires multi-gpu)
- LoRA + QLoRA support only. No full fine tunes or fp8 support.
- Limited model architecture support. Llama, Phi, Gemma, Mistral only
- No MoE support.

View File

@@ -43,6 +43,7 @@
},
"outputs": [],
"source": [
"!pip install torch==\"2.1.2\"\n",
"!pip install -e git+https://github.com/axolotl-ai-cloud/axolotl#egg=axolotl\n",
"!pip install flash-attn==\"2.5.0\"\n",
"!pip install deepspeed==\"0.13.1\"!pip install mlflow==\"2.13.0\""

View File

@@ -1,81 +0,0 @@
base_model: meta-llama/Meta-Llama-3-8B-Instruct
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: true
load_in_4bit: false
strict: false
chat_template: llama3
rl: dpo
datasets:
- path: fozziethebeat/alpaca_messages_2k_dpo_test
type: chat_template.default
chat_template: llama3
field_messages: conversation
field_chosen: chosen
field_rejected: rejected
message_field_role: role
message_field_content: content
roles:
system:
- system
user:
- user
assistant:
- assistant
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:

View File

@@ -1,4 +1,4 @@
base_model: NousResearch/Meta-Llama-3-8B-Instruct
base_model: meta-llama/Meta-Llama-3-8B-Instruct
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
@@ -74,5 +74,3 @@ deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
pad_token: <|end_of_text|>

View File

@@ -1,63 +0,0 @@
base_model: hugging-quants/Meta-Llama-3.1-405B-BNB-NF4-BF16
tokenizer_type: AutoTokenizer
load_in_4bit: true
strict: false
datasets:
- path: tatsu-lab/alpaca
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out/qlora-llama3_1-405b
save_safetensors: true
adapter: qlora
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
lora_r: 16
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 2
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 0.00001
train_on_inputs: false
group_by_length: false
bf16: true
tf32: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: true
logging_steps: 1
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
- full_shard
- auto_wrap
fsdp_config:
fsdp_limit_all_gathers: true
fsdp_sync_module_states: true
fsdp_offload_params: true
fsdp_use_orig_params: false
fsdp_cpu_ram_efficient_loading: true
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
fsdp_state_dict_type: FULL_STATE_DICT
fsdp_sharding_strategy: FULL_SHARD
special_tokens:
pad_token: <|finetune_right_pad_id|>

View File

@@ -1,4 +1,4 @@
base_model: casperhansen/llama-3-70b-fp16
base_model: NousResearch/Meta-Llama-3-70B
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer # PreTrainedTokenizerFast

View File

@@ -7,7 +7,7 @@ load_in_4bit: true
strict: false
datasets:
- path: aaditya/alpaca_subset_1
- path: tatsu-lab/alpaca
type: alpaca
dataset_prepared_path:
val_set_size: 0

View File

@@ -1,4 +1,4 @@
base_model: TinyLlama/TinyLlama_v1.1
base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer

View File

@@ -1,5 +1,6 @@
base_model: TinyLlama/TinyLlama_v1.1
tokenizer_type: AutoTokenizer
base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: true
load_in_4bit: false

View File

@@ -1,4 +1,4 @@
base_model: TinyLlama/TinyLlama_v1.1
base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer

View File

@@ -1,18 +1,18 @@
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
packaging==23.2
peft==0.12.0
transformers==4.44.0
tokenizers>=0.19.1
bitsandbytes==0.43.3
accelerate==0.33.0
datasets==2.20.0
deepspeed==0.14.4
peft==0.11.1
transformers==4.42.3
tokenizers==0.19.1
bitsandbytes==0.43.1
accelerate==0.32.0
deepspeed @ git+https://github.com/microsoft/DeepSpeed.git@bc48371c5e1fb8fd70fc79285e66201dbb65679b
pydantic==2.6.3
addict
fire
PyYAML>=6.0
requests
flash-attn==2.6.3
datasets==2.19.1
flash-attn==2.6.1
sentencepiece
wandb
einops
@@ -32,13 +32,12 @@ fschat @ git+https://github.com/lm-sys/FastChat.git@27a05b04a35510afb1d767ae7e59
gradio==3.50.2
tensorboard
python-dotenv==1.0.1
autoawq>=0.2.5
mamba-ssm==1.2.0.post1
# remote filesystems
s3fs>=2024.5.0
gcsfs>=2024.5.0
s3fs
gcsfs
# adlfs
trl==0.9.6

View File

@@ -80,13 +80,13 @@ setup(
dependency_links=dependency_links,
extras_require={
"flash-attn": [
"flash-attn==2.6.2",
"flash-attn==2.6.1",
],
"fused-dense-lib": [
"fused-dense-lib @ git+https://github.com/Dao-AILab/flash-attention@v2.6.2#subdirectory=csrc/fused_dense_lib",
"fused-dense-lib @ git+https://github.com/Dao-AILab/flash-attention@v2.6.1#subdirectory=csrc/fused_dense_lib",
],
"deepspeed": [
"deepspeed==0.14.4",
"deepspeed @ git+https://github.com/microsoft/DeepSpeed.git@bc48371c5e1fb8fd70fc79285e66201dbb65679b",
"deepspeed-kernels",
],
"mamba-ssm": [

View File

@@ -40,7 +40,7 @@ from axolotl.utils.distributed import is_main_process
from axolotl.utils.mlflow_ import setup_mlflow_env_vars
from axolotl.utils.models import load_tokenizer
from axolotl.utils.tokenization import check_dataset_labels
from axolotl.utils.trainer import prepare_opinionated_env, prepare_optim_env
from axolotl.utils.trainer import prepare_optim_env
from axolotl.utils.wandb_ import setup_wandb_env_vars
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
@@ -375,15 +375,13 @@ def load_cfg(config: Union[str, Path] = Path("examples/"), **kwargs):
cfg,
capabilities={
"bf16": is_torch_bf16_gpu_available(),
"n_gpu": int(os.environ.get("WORLD_SIZE", 1)),
"n_gpu": os.environ.get("WORLD_SIZE", 1),
"compute_capability": gpu_version,
},
)
prepare_optim_env(cfg)
prepare_opinionated_env(cfg)
normalize_config(cfg)
normalize_cfg_datasets(cfg)

View File

@@ -2,7 +2,6 @@
CLI to run training on a model
"""
import logging
import warnings
from pathlib import Path
from typing import Union
@@ -77,12 +76,8 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
if parsed_cli_args.download:
model_name = parsed_cfg.base_model
with warnings.catch_warnings():
# there are a bunch of useless UserWarnings about
# "copying from a non-meta parameter in the checkpoint to a meta parameter in the current model"
warnings.simplefilter("ignore")
with init_empty_weights(include_buffers=True):
AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
with init_empty_weights():
AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
LOG.info(
Fore.GREEN

View File

@@ -1,15 +0,0 @@
"""
Common architecture specific constants
"""
MOE_ARCH_BLOCK = {
"dbrx": "DbrxFFN",
"jamba": "JambaSparseMoeBlock",
"jetmoe": [
"JetMoeMoA",
"JetMoeMoE",
],
"mixtral": "MixtralSparseMoeBlock",
"qwen2_moe": "Qwen2MoeSparseMoeBlock",
"deepseek_v2": "DeepseekV2MoE",
}

View File

@@ -1,150 +0,0 @@
"""
helper functions for fixing the embeddings/tokenizer
"""
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import itertools
import numpy as np
import torch
@torch.inference_mode
def fix_untrained_tokens(model, tokenizer, train_dataset, eps=1e-16):
"""
Many of the newer models have reserved tokens that are not trained.
"""
embedding_matrix = model.get_input_embeddings().weight
lm_head_matrix = model.get_output_embeddings().weight
# Get untrained tokens
indicator_untrained = torch.amax(embedding_matrix, axis=1) <= eps
where_untrained = torch.where(indicator_untrained)[0]
n_untrained = where_untrained.shape[0]
n_trained = embedding_matrix.shape[0] - n_untrained
# Get set and actual tokens
where_untrained = where_untrained.tolist()
if len(where_untrained) == 0:
return False
# Remove untrained indices where it's longer
where_untrained_set = frozenset(where_untrained)
actual_bad_tokens = tokenizer.convert_ids_to_tokens(where_untrained)
# Remove None items in actual_bad_tokens
actual_bad_tokens = [x for x in actual_bad_tokens if x is not None]
# Check if tokenizer and training datasets have bad tokens
if_bad_first = False
if_bad_second = False
# Check tokenizer's chat template for any untrained tokens
chat_template = getattr(tokenizer, "chat_template", None)
if chat_template is not None:
if_bad_first = any(x in chat_template for x in actual_bad_tokens)
# Check the first 250, last 250 input_ids
size_dataset = len(train_dataset)
size = min(size_dataset, 250)
for j in range(size):
input_ids = train_dataset[j]
if "input_ids" in input_ids:
input_ids = input_ids["input_ids"]
if_bad = any(item in where_untrained_set for item in input_ids)
if if_bad:
if_bad_second = True
break
# Check last 250
if not if_bad_second:
left = max(size_dataset - 250, 0)
for j in range(left, size_dataset):
input_ids = train_dataset[j]
if "input_ids" in input_ids:
input_ids = input_ids["input_ids"]
if_bad = any(item in where_untrained_set for item in input_ids)
if if_bad:
if_bad_second = True
break
# Check if bad tokens exists!
if not if_bad_first and not if_bad_second:
return False
# Count all the possible bad tokens
final_counts = np.zeros(
max(len(tokenizer), embedding_matrix.shape[0]), dtype=np.int64
)
def mapping(examples):
input_ids = examples["input_ids"]
counter = np.fromiter(itertools.chain.from_iterable(input_ids), dtype=np.int32)
np.add.at(final_counts, counter, 1)
train_dataset.map(mapping, batched=True, desc="Counting untrained tokens")
# Get sum of all items
sum_embedding = torch.sum(embedding_matrix, dtype=torch.float32, axis=0)
sum_lm_head = torch.sum(lm_head_matrix, dtype=torch.float32, axis=0)
# Remove bad tokens
sum_embedding -= torch.sum(
embedding_matrix[where_untrained], dtype=torch.float32, axis=0
)
sum_lm_head -= torch.sum(
lm_head_matrix[where_untrained], dtype=torch.float32, axis=0
)
# Find correct average by dividing by sum of trained tokens
mean_embedding = sum_embedding / n_trained
mean_lm_head = sum_lm_head / n_trained
# Scale each to be equal to 1/max_frequency. Also set some to 0 if none seen
scaling = final_counts[where_untrained] / max(final_counts.max(), 1)
scaling = torch.tensor(scaling, device=mean_embedding.device).unsqueeze(1)
mean_embedding = (
mean_embedding.repeat(
(
n_untrained,
1,
)
)
* scaling
)
mean_lm_head = (
mean_lm_head.repeat(
(
n_untrained,
1,
)
)
* scaling
)
where_null = scaling.ravel() == 0
mean_embedding[where_null] = 0
mean_lm_head[where_null] = 0
# Set them to the mean
embedding_matrix[where_untrained] = mean_embedding.to(embedding_matrix.dtype)
lm_head_matrix[where_untrained] = mean_lm_head.to(lm_head_matrix.dtype)
# Clean up
for _ in range(3):
gc.collect()
torch.cuda.empty_cache()
return True

View File

@@ -8,7 +8,6 @@ import importlib
import importlib.util
import logging
import math
import os
import sys
from abc import abstractmethod
from collections import defaultdict
@@ -29,18 +28,9 @@ from transformers import (
TrainerCallback,
TrainingArguments,
)
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, seed_worker
from transformers.trainer_utils import seed_worker
from transformers.utils import is_sagemaker_mp_enabled
from trl import (
CPOConfig,
CPOTrainer,
DPOConfig,
DPOTrainer,
KTOConfig,
KTOTrainer,
ORPOConfig,
ORPOTrainer,
)
from trl import DPOConfig, DPOTrainer, KTOConfig, KTOTrainer, ORPOConfig, ORPOTrainer
from trl.trainer.utils import pad_to_length
from axolotl.loraplus import create_loraplus_optimizer
@@ -242,12 +232,6 @@ class AxolotlTrainingMixins:
"help": "workaround to pass an alternate optimizer to the HF trainer"
},
)
alternate_lr_scheduler_type: Optional[str] = field(
default=None,
metadata={
"help": "workaround to pass an alternate lr scheduler to the HF trainer"
},
)
@dataclass
@@ -281,105 +265,7 @@ class AxolotlKTOConfig(AxolotlTrainingMixins, KTOConfig):
"""
@dataclass
class AxolotlCPOConfig(AxolotlTrainingMixins, CPOConfig):
"""
CPO config for CPO training
"""
simpo_gamma: Optional[float] = field(
default=None,
metadata={"help": "simpo gamma parameter"},
)
class SchedulerMixin(Trainer):
"""
Mixin class for scheduler setup in CausalTrainer.
"""
args = None # type: AxolotlTrainingArguments
def create_scheduler(
self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
):
"""
Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
passed as an argument.
Args:
num_training_steps (int): The number of training steps to do.
optimizer (torch.optim.Optimizer): The training optimizer
"""
use_cosine_quadratic = (
self.args.lr_scheduler_type == "cosine"
and self.args.lr_quadratic_warmup is True
)
use_cosine_min_lr = (
self.args.lr_scheduler_type == "cosine"
and self.args.cosine_min_lr_ratio is not None
)
# fmt: off
if self.lr_scheduler is None: # type: ignore # pylint: disable=access-member-before-definition
# fmt: on
if self.args.alternate_lr_scheduler_type == "one_cycle":
num_warmup_steps = self.args.get_warmup_steps(num_training_steps)
pct_start = num_warmup_steps / num_training_steps
extra_lr_kwargs = {}
if "pct_start" not in self.args.lr_scheduler_kwargs:
extra_lr_kwargs["pct_start"] = pct_start
if "anneal_strategy" not in self.args.lr_scheduler_kwargs:
extra_lr_kwargs["anneal_strategy"] = "cos"
self.lr_scheduler = OneCycleLR(
optimizer,
max_lr=self.args.learning_rate,
total_steps=num_training_steps,
**extra_lr_kwargs,
**self.args.lr_scheduler_kwargs,
)
elif use_cosine_quadratic:
if use_cosine_min_lr:
LOG.warning("Both cosine quadratic warmup and min lr detected. Using quadratic warmup.")
self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup( # pylint: disable=attribute-defined-outside-init
optimizer,
num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
num_training_steps=num_training_steps,
)
elif self.args.cosine_min_lr_ratio and self.args.cosine_constant_lr_ratio and use_cosine_min_lr:
assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
assert 0 <= self.args.cosine_constant_lr_ratio <= 1.0, "cosine_constant_lr_ratio must be between 0.0 and 1.0"
self.lr_scheduler = get_cosine_schedule_with_warmup_decay_constant( # pylint: disable=attribute-defined-outside-init
optimizer,
num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
num_training_steps=num_training_steps,
min_lr_ratio=self.args.cosine_min_lr_ratio,
constant_lr_ratio=self.args.cosine_constant_lr_ratio,
)
elif self.args.cosine_min_lr_ratio and use_cosine_min_lr:
assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
self.lr_scheduler = get_cosine_schedule_with_min_lr( # pylint: disable=attribute-defined-outside-init
optimizer,
num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
num_training_steps=num_training_steps,
min_lr_ratio=self.args.cosine_min_lr_ratio,
)
else:
return super().create_scheduler(num_training_steps, optimizer)
else:
if use_cosine_quadratic:
LOG.warning("axolotl's cosine scheduler with quadratic warmup not used (e.g., because of deepspeed).")
if use_cosine_min_lr:
LOG.warning("axolotl's cosine scheduler with min lr not used (e.g., because of deepspeed).")
return self.lr_scheduler
class AxolotlTrainer(SchedulerMixin, Trainer):
class AxolotlTrainer(Trainer):
"""
Extend the base Trainer for axolotl helpers
"""
@@ -404,23 +290,10 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
if self.args.orpo_alpha:
self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
def _wrap_model(self, model, training=True, dataloader=None):
if self.args.torch_compile:
torch._dynamo.config.accumulated_cache_size_limit = ( # pylint: disable=protected-access
256
)
model = torch.compile(
model,
backend=self.args.torch_compile_backend,
mode=self.args.torch_compile_mode,
)
return super()._wrap_model(model, training=training, dataloader=dataloader)
def create_optimizer(self):
if (
self.args.loraplus_lr_ratio is None
and self.args.alternate_optimizer
not in ["optimi_adamw", "ao_adamw_8bit", "ao_adamw_4bit", "ao_adamw_fp8"]
and self.args.alternate_optimizer != "optimi_adamw"
):
return super().create_optimizer()
@@ -471,24 +344,6 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
optimizer_grouped_parameters, foreach=False, **optimizer_kwargs
)
)
elif self.args.alternate_optimizer == "ao_adamw_4bit":
from torchao.prototype.low_bit_optim import AdamW4bit
self.optimizer = ( # pylint: disable=attribute-defined-outside-init
AdamW4bit(optimizer_grouped_parameters, **optimizer_kwargs)
)
elif self.args.alternate_optimizer == "ao_adamw_8bit":
from torchao.prototype.low_bit_optim import AdamW8bit
self.optimizer = ( # pylint: disable=attribute-defined-outside-init
AdamW8bit(optimizer_grouped_parameters, **optimizer_kwargs)
)
elif self.args.alternate_optimizer == "ao_adamw_fp8":
from torchao.prototype.low_bit_optim import AdamWFp8
self.optimizer = ( # pylint: disable=attribute-defined-outside-init
AdamWFp8(optimizer_grouped_parameters, **optimizer_kwargs)
)
if is_sagemaker_mp_enabled():
self.optimizer = smp.DistributedOptimizer( # pylint: disable=attribute-defined-outside-init
@@ -497,6 +352,68 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
return self.optimizer
def create_scheduler(
self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
):
"""
Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
passed as an argument.
Args:
num_training_steps (int): The number of training steps to do.
optimizer (torch.optim.Optimizer): The training optimizer
"""
use_cosine_quadratic = (
self.args.lr_scheduler_type == "cosine"
and self.args.lr_quadratic_warmup is True
)
use_cosine_min_lr = (
self.args.lr_scheduler_type == "cosine"
and self.args.cosine_min_lr_ratio is not None
)
# fmt: off
if self.lr_scheduler is None: # type: ignore # pylint: disable=access-member-before-definition
# fmt: on
if use_cosine_quadratic:
if use_cosine_min_lr:
LOG.warning("Both cosine quadratic warmup and min lr detected. Using quadratic warmup.")
self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup( # pylint: disable=attribute-defined-outside-init
optimizer,
num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
num_training_steps=num_training_steps,
)
elif self.args.cosine_min_lr_ratio and self.args.cosine_constant_lr_ratio and use_cosine_min_lr:
assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
assert 0 <= self.args.cosine_constant_lr_ratio <= 1.0, "cosine_constant_lr_ratio must be between 0.0 and 1.0"
self.lr_scheduler = get_cosine_schedule_with_warmup_decay_constant( # pylint: disable=attribute-defined-outside-init
optimizer,
num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
num_training_steps=num_training_steps,
min_lr_ratio=self.args.cosine_min_lr_ratio,
constant_lr_ratio=self.args.cosine_constant_lr_ratio,
)
elif self.args.cosine_min_lr_ratio and use_cosine_min_lr:
assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
self.lr_scheduler = get_cosine_schedule_with_min_lr( # pylint: disable=attribute-defined-outside-init
optimizer,
num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
num_training_steps=num_training_steps,
min_lr_ratio=self.args.cosine_min_lr_ratio,
)
else:
return super().create_scheduler(num_training_steps, optimizer)
else:
if use_cosine_quadratic:
LOG.warning("axolotl's cosine scheduler with quadratic warmup not used (e.g., because of deepspeed).")
if use_cosine_min_lr:
LOG.warning("axolotl's cosine scheduler with min lr not used (e.g., because of deepspeed).")
return self.lr_scheduler
def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
if self.args.sample_packing and not self.args.pretraining:
if self.args.multipack_real_batches:
@@ -861,14 +778,6 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
for key, value in metrics.items():
self._stored_metrics[train_eval][key].append(value)
def _save_checkpoint(self, model, trial, metrics=None):
# make sure the checkpoint dir exists, since trainer is flakey
checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
run_dir = self._get_output_dir(trial=trial)
output_dir = os.path.join(run_dir, checkpoint_folder)
os.makedirs(output_dir, exist_ok=True)
return super()._save_checkpoint(model, trial, metrics=metrics)
class AxolotlMambaTrainer(AxolotlTrainer):
"""
@@ -898,6 +807,37 @@ class AxolotlMambaTrainer(AxolotlTrainer):
return lm_loss
class OneCycleLRSchedulerTrainer(AxolotlTrainer):
"""
Trainer subclass that uses the OneCycleLR scheduler
"""
tag_names = ["axolotl", "onecycle"]
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.lr_scheduler = None
def create_scheduler(
self,
num_training_steps: int,
optimizer: Optional[torch.optim.Optimizer] = None,
):
optimizer = self.optimizer if optimizer is None else optimizer
num_warmup_steps = self.args.get_warmup_steps(num_training_steps)
pct_start = num_warmup_steps / num_training_steps
self.lr_scheduler = OneCycleLR(
optimizer,
max_lr=self.args.learning_rate,
total_steps=num_training_steps,
pct_start=pct_start,
div_factor=6,
)
return self.lr_scheduler
class ReLoRATrainer(AxolotlTrainer):
"""
Trainer subclass that uses the OneCycleLR scheduler
@@ -937,7 +877,7 @@ class ReLoRATrainer(AxolotlTrainer):
return self.lr_scheduler
class AxolotlDPOTrainer(SchedulerMixin, DPOTrainer):
class AxolotlDPOTrainer(DPOTrainer):
"""
Extend the base DPOTrainer for axolotl helpers
"""
@@ -998,7 +938,7 @@ class AxolotlDPOTrainer(SchedulerMixin, DPOTrainer):
return res
class AxolotlORPOTrainer(SchedulerMixin, ORPOTrainer):
class AxolotlORPOTrainer(ORPOTrainer):
"""
Extend the base ORPOTrainer for axolotl helpers
"""
@@ -1006,7 +946,7 @@ class AxolotlORPOTrainer(SchedulerMixin, ORPOTrainer):
tag_names = ["axolotl", "orpo"]
class AxolotlKTOTrainer(SchedulerMixin, KTOTrainer):
class AxolotlKTOTrainer(KTOTrainer):
"""
Extend the base KTOTrainer for axolotl helpers
"""
@@ -1014,14 +954,6 @@ class AxolotlKTOTrainer(SchedulerMixin, KTOTrainer):
tag_names = ["axolotl", "kto"]
class AxolotlCPOTrainer(SchedulerMixin, CPOTrainer):
"""
Extend the base CPOTrainer for axolotl helpers
"""
tag_names = ["axolotl", "cpo"]
class TrainerBuilderBase(abc.ABC):
"""
Base class for trainer builder
@@ -1181,6 +1113,10 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
return callbacks
def _get_trainer_cls(self):
if self.cfg.lr_scheduler == "one_cycle" and (
self.cfg.fsdp or self.cfg.adapter == "qlora"
):
return OneCycleLRSchedulerTrainer
if self.cfg.relora_steps:
return ReLoRATrainer
if self.cfg.model_config_type == "mamba":
@@ -1230,9 +1166,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
if self.cfg.fsdp:
training_arguments_kwargs["fsdp"] = self.cfg.fsdp
if self.cfg.fsdp_config:
training_arguments_kwargs["fsdp_config"] = {
k.lstrip("fsdp_"): v for k, v in dict(self.cfg.fsdp_config).items()
}
training_arguments_kwargs["fsdp_config"] = dict(self.cfg.fsdp_config)
if self.cfg.adapter == "qlora":
training_arguments_kwargs["qlora"] = True
@@ -1341,10 +1275,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
training_arguments_kwargs[
"torch_compile_backend"
] = self.cfg.torch_compile_backend
if self.cfg.torch_compile_mode:
training_arguments_kwargs[
"torch_compile_mode"
] = self.cfg.torch_compile_mode
# DDP Config
if self.cfg.ddp_timeout:
@@ -1430,15 +1360,12 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
training_arguments_kwargs[
"loraplus_lr_embedding"
] = self.cfg.loraplus_lr_embedding
if self.cfg.lr_scheduler in ["one_cycle", "log_sweep"]:
training_arguments_kwargs["lr_scheduler_type"] = "cosine"
training_arguments_kwargs[
"alternate_lr_scheduler_type"
] = self.cfg.lr_scheduler
else:
training_arguments_kwargs["lr_scheduler_type"] = (
self.cfg.lr_scheduler if self.cfg.lr_scheduler else "cosine"
)
training_arguments_kwargs["lr_scheduler_type"] = (
self.cfg.lr_scheduler
if self.cfg.lr_scheduler
and self.cfg.lr_scheduler not in ("one_cycle", "log_sweep")
else "cosine"
)
training_arguments_kwargs["lr_scheduler_kwargs"] = (
self.cfg.lr_scheduler_kwargs if self.cfg.lr_scheduler_kwargs else {}
)
@@ -1509,12 +1436,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
trainer_kwargs = {}
if self.cfg.optimizer in [
"optimi_adamw",
"ao_adamw_4bit",
"ao_adamw_8bit",
"ao_adamw_fp8",
]:
if self.cfg.optimizer == "optimi_adamw":
# Set default so transformers doesn't throw
training_arguments_kwargs["optim"] = "adamw_hf"
training_arguments_kwargs["alternate_optimizer"] = self.cfg.optimizer
@@ -1547,11 +1469,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
sys.path.append(self.cfg.torchdistx_path)
importlib.import_module("torchdistx")
if self.cfg.accelerator_config:
training_arguments_kwargs[
"accelerator_config"
] = self.cfg.accelerator_config
training_args = (
AxolotlTrainingArguments( # pylint: disable=unexpected-keyword-arg
**training_arguments_kwargs,
@@ -1745,27 +1662,16 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
# default to saving each epoch if not defined
training_args_kwargs["save_strategy"] = "epoch"
if self.cfg.rl_beta:
training_args_kwargs["beta"] = self.cfg.rl_beta
if self.cfg.orpo_alpha:
# trl does some odd mapping of alpha to beta to reuse the beta parameter ???
training_args_kwargs["beta"] = self.cfg.orpo_alpha
training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
training_args_cls = AxolotlDPOConfig
if self.cfg.rpo_alpha is not None:
training_args_kwargs["rpo_alpha"] = self.cfg.rpo_alpha
if self.cfg.rl == "simpo":
training_args_cls = AxolotlCPOConfig
training_args_kwargs["loss_type"] = "simpo"
training_args_kwargs["max_length"] = self.cfg.sequence_len
training_args_kwargs["simpo_gamma"] = self.cfg.simpo_gamma
if self.cfg.cpo_alpha is not None:
training_args_kwargs["cpo_alpha"] = self.cfg.cpo_alpha
if self.cfg.rl == "orpo":
training_args_cls = AxolotlORPOConfig
training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
training_args_kwargs["max_length"] = self.cfg.sequence_len
if self.cfg.max_prompt_len:
training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len
@@ -1773,6 +1679,7 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
if self.cfg.rl == "kto":
training_args_cls = AxolotlKTOConfig
training_args_kwargs["beta"] = self.cfg.rl_beta or 0.1
training_args_kwargs["desirable_weight"] = (
self.cfg.kto_desirable_weight or 1.0
)
@@ -1818,6 +1725,7 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
] = self.cfg.precompute_ref_log_probs
if self.cfg.rl in ["dpo", "ipo"]:
trainer_cls = AxolotlDPOTrainer
dpo_trainer_kwargs["beta"] = self.cfg.rl_beta or 0.1
trainer_cls_args = [self.model, self.model_ref]
# these aren't used for the ORPO trainer
@@ -1825,15 +1733,14 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
dpo_trainer_kwargs["max_target_length"] = None
dpo_trainer_kwargs["max_prompt_length"] = self.cfg.sequence_len
dpo_trainer_kwargs["generate_during_eval"] = True
if self.cfg.rl == "dpo":
dpo_trainer_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
elif self.cfg.rl == "orpo":
trainer_cls = AxolotlORPOTrainer
trainer_cls_args = [self.model]
elif self.cfg.rl in ["kto"]:
trainer_cls = AxolotlKTOTrainer
trainer_cls_args = [self.model]
elif self.cfg.rl in ["simpo"]:
trainer_cls = AxolotlCPOTrainer
trainer_cls_args = [self.model]
else:
raise ValueError(f"Unsupported RL: {self.cfg.rl}")
dpo_trainer = trainer_cls(

View File

@@ -78,33 +78,6 @@ def replace_llama_qkv_with_fused(model):
set_module_name(model, name, qkv)
def patch_llama_cross_entropy():
from flash_attn.losses.cross_entropy import CrossEntropyLoss
LOG.info("patching with flash_attn.losses.cross_entropy")
transformers.models.llama.modeling_llama.CrossEntropyLoss = partial(
CrossEntropyLoss, inplace_backward=True
)
def patch_llama_rms_norm():
try:
from flash_attn.ops.rms_norm import RMSNorm
class LlamaRMSNorm(RMSNorm):
"""Patched LLamaRMSNorm"""
def __init__(self, hidden_size, eps=1e-6):
super().__init__(hidden_size, eps=eps)
LOG.info("patching with flash_attn.ops.rms_norm")
transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm
except ImportError:
LOG.warning(
"optimized flash-attention RMSNorm not found (run `pip install 'git+https://github.com/Dao-AILab/flash-attention.git#egg=dropout_layer_norm&subdirectory=csrc/layer_norm'`)"
)
def replace_llama_attn_with_flash_attn(
packed: Optional[bool] = False,
cross_entropy: Optional[bool] = False,
@@ -131,11 +104,30 @@ def replace_llama_attn_with_flash_attn(
# skip only if explicitly disabled
if cross_entropy:
patch_llama_cross_entropy()
from flash_attn.losses.cross_entropy import CrossEntropyLoss
LOG.info("patching with flash_attn.losses.cross_entropy")
transformers.models.llama.modeling_llama.CrossEntropyLoss = partial(
CrossEntropyLoss, inplace_backward=True
)
# skip only if explicitly disabled
if rms_norm:
patch_llama_rms_norm()
try:
from flash_attn.ops.rms_norm import RMSNorm
class LlamaRMSNorm(RMSNorm):
"""Patched LLamaRMSNorm"""
def __init__(self, hidden_size, eps=1e-6):
super().__init__(hidden_size, eps=eps)
LOG.info("patching with flash_attn.ops.rms_norm")
transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm
except ImportError:
LOG.warning(
"optimized flash-attention RMSNorm not found (run `pip install 'git+https://github.com/Dao-AILab/flash-attention.git#egg=dropout_layer_norm&subdirectory=csrc/layer_norm'`)"
)
class FusedAttention(LlamaAttention):

View File

@@ -2,7 +2,6 @@
# pylint: disable=duplicate-code
import logging
from functools import partial
from typing import List, Optional, Tuple, Union
import torch
@@ -46,15 +45,6 @@ def replace_mistral_attn_with_flash_attn(
)
def patch_mistral_cross_entropy():
from flash_attn.losses.cross_entropy import CrossEntropyLoss
LOG.info("patching with flash_attn.losses.cross_entropy")
transformers.models.mistral.modeling_mistral.CrossEntropyLoss = partial(
CrossEntropyLoss, inplace_backward=True
)
@torch.jit.script
def _make_sliding_window_causal_mask(
bsz: int,

View File

@@ -10,8 +10,6 @@ from axolotl.monkeypatch.mixtral import patch_mixtral_moe_forward_zero3
from axolotl.monkeypatch.utils import get_unpad_data
SUPPORTED_MULTIPACK_MODEL_TYPES = [
"llama",
"mistral",
"mixtral",
"qwen2",
"qwen2_moe",
@@ -25,36 +23,13 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
]
def patch_for_multipack(model_type, model_name=None, is_remote_code=False):
if model_type == "gemmoe":
patch_remote(model_name, ".configuration_gemmoe", ".modeling_gemmoe")
elif model_type == "deepseek_v2":
patch_remote(model_name, ".configuration_deepseek", ".modeling_deepseek")
elif hasattr(transformers, "modeling_flash_attention_utils") and not is_remote_code:
transformers.modeling_flash_attention_utils._get_unpad_data = ( # pylint: disable=protected-access
get_unpad_data
)
if model_type == "mixtral" and is_deepspeed_zero3_enabled():
patch_mixtral_moe_forward_zero3()
return
# retain for legacy
def patch_for_multipack(model_type, model_name=None):
if model_type == "mixtral":
transformers.models.mixtral.modeling_mixtral._get_unpad_data = ( # pylint: disable=protected-access
get_unpad_data
)
if is_deepspeed_zero3_enabled():
patch_mixtral_moe_forward_zero3()
elif model_type == "llama":
if hasattr(transformers.models.llama.modeling_llama, "_get_unpad_data"):
transformers.models.llama.modeling_llama._get_unpad_data = ( # pylint: disable=protected-access
get_unpad_data
)
elif model_type == "mistral":
if hasattr(transformers.models.mistral.modeling_mistral, "_get_unpad_data"):
transformers.models.llama.modeling_llama._get_unpad_data = ( # pylint: disable=protected-access
get_unpad_data
)
elif model_type == "qwen2":
transformers.models.qwen2.modeling_qwen2._get_unpad_data = ( # pylint: disable=protected-access
get_unpad_data
@@ -83,6 +58,12 @@ def patch_for_multipack(model_type, model_name=None, is_remote_code=False):
transformers.models.starcoder2.modeling_starcoder2._get_unpad_data = ( # pylint: disable=protected-access
get_unpad_data
)
elif model_type == "gemmoe":
patch_remote(model_name, ".configuration_gemmoe", ".modeling_gemmoe")
elif model_type == "jamba":
patch_remote(model_name, ".configuration_jamba", ".modeling_jamba")
elif model_type == "deepseek_v2":
patch_remote(model_name, ".configuration_deepseek", ".modeling_deepseek")
def patch_remote(model_name, config_name, modeling_name):

View File

@@ -1,20 +1,18 @@
"""module for patching with unsloth optimizations"""
import inspect
import logging
import re
import types
from typing import Tuple
import torch
from accelerate.logging import get_logger
from peft import PeftModelForCausalLM
from torch import nn
from transformers.models.llama.modeling_llama import (
LlamaFlashAttention2,
LlamaForCausalLM,
)
LOG = get_logger("axolotl.monkeypatch.unsloth")
LOG = logging.getLogger("axolotl.monkeypatch.unsloth")
ORIGINAL_CEL_CODE = """ if labels is not None:
# Shift so that tokens < n predict n
@@ -99,51 +97,48 @@ def check_self_attn_is_patchable() -> bool:
return ORIGINAL_QKV_CODE in qkv and ORIGINAL_O_CODE in qkv
def integrate_cross_entropy_loss_patch(model_type: str = "llama") -> None:
if model_type == "llama":
forward = get_forward_code()
LlamaForCausalLM._original_forward = forward # pylint: disable=protected-access
forward, _ = detab_code(forward)
assert ORIGINAL_CEL_CODE in forward, "Original forward code not found"
def integrate_cross_entropy_loss_patch():
forward = get_forward_code()
LlamaForCausalLM._original_forward = forward # pylint: disable=protected-access
forward, _ = detab_code(forward)
assert ORIGINAL_CEL_CODE in forward, "Original forward code not found"
forward = forward.replace(
"@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)", ""
)
forward = forward.replace(
"@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)",
"",
)
forward = forward.replace(ORIGINAL_CEL_CODE, PATCHED_CEL_CODE)
forward = forward.replace(
"def forward(",
"def fast_cross_entropy_loss_forward(",
1,
)
forward = forward.replace(
"@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)", ""
)
forward = forward.replace(
"@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)",
"",
)
forward = forward.replace(ORIGINAL_CEL_CODE, PATCHED_CEL_CODE)
forward = forward.replace(
"def forward(",
"def fast_cross_entropy_loss_forward(",
1,
)
# load imports necessary
import transformers.models.llama.modeling_llama
# load imports necessary
import transformers.models.llama.modeling_llama
items_to_import = []
for item in dir(transformers.models.llama.modeling_llama):
if item in forward:
items_to_import.append(item)
items_to_import = []
for item in dir(transformers.models.llama.modeling_llama):
if item in forward:
items_to_import.append(item)
exec( # pylint: disable=exec-used # nosec B102
"from unsloth.kernels.cross_entropy_loss import fast_cross_entropy_loss",
globals(),
)
exec( # pylint: disable=exec-used # nosec B102
"from unsloth.kernels.cross_entropy_loss import fast_cross_entropy_loss",
globals(),
)
exec( # pylint: disable=exec-used # nosec B102
"from transformers.models.llama.modeling_llama import ("
+ ", ".join(x for x in items_to_import)
+ ")",
globals(),
)
exec(forward, globals()) # pylint: disable=exec-used # nosec B102
LOG.info("patching unsloth fast_cross_entropy_loss", main_process_only=True)
LlamaForCausalLM.forward = fast_cross_entropy_loss_forward # pylint: disable=undefined-variable # noqa: F821
else:
raise ValueError("Unsupported model type")
exec( # pylint: disable=exec-used # nosec B102
"from transformers.models.llama.modeling_llama import ("
+ ", ".join(x for x in items_to_import)
+ ")",
globals(),
)
exec(forward, globals()) # pylint: disable=exec-used # nosec B102
print("patching unsloth fast_cross_entropy_loss")
LlamaForCausalLM.forward = fast_cross_entropy_loss_forward # pylint: disable=undefined-variable # noqa: F821
def detab_code(code: str) -> Tuple[str, str]:
@@ -184,30 +179,12 @@ def patch_self_attn_lora():
globals(),
)
exec(self_attn_forward, globals()) # pylint: disable=exec-used # nosec B102
LOG.info("patching unsloth attn lora", main_process_only=True)
print("patching unsloth attn lora")
LlamaFlashAttention2.forward = (
unsloth_attn_forward # pylint: disable=undefined-variable # noqa: F821
)
def integrate_rope_embeddings():
import transformers.models.llama.modeling_llama
from unsloth.kernels.rope_embedding import fast_rope_embedding
def apply_rotary_pos_emb( # pylint: disable=unused-argument
q, # pylint: disable=invalid-name
k, # pylint: disable=invalid-name
cos,
sin,
position_ids=None,
unsqueeze_dim=1,
):
return fast_rope_embedding(q, k, cos, sin)
LOG.info("patching unsloth RoPE embeddings", main_process_only=True)
transformers.models.llama.modeling_llama.apply_rotary_pos_emb = apply_rotary_pos_emb
def integrate_lora_mlp_patch(peft_model: PeftModelForCausalLM):
if peft_model.base_model.config.model_type in ["llama", "mistral"]:
from unsloth.kernels import apply_lora_mlp_swiglu
@@ -240,7 +217,7 @@ def integrate_lora_mlp_patch(peft_model: PeftModelForCausalLM):
if is_mlp_lora and mlp_no_bias and mlp_not_dora:
layer.mlp.forward = types.MethodType(apply_lora_mlp, layer.mlp)
else:
LOG.warning("unable to apply unsloth lora mlp patch to layer %d", idx)
logging.warning("unable to apply unsloth lora mlp patch to layer %d", idx)
def integrate_lora_patch(peft_model: PeftModelForCausalLM, cfg):
@@ -266,7 +243,9 @@ def integrate_lora_patch(peft_model: PeftModelForCausalLM, cfg):
layer.self_attn.apply_qkv = apply_lora_qkv
else:
layer.self_attn.apply_qkv = original_apply_qkv
LOG.warning("unable to apply unsloth lora qkv patch to layer %d", idx)
logging.warning(
"unable to apply unsloth lora qkv patch to layer %d", idx
)
if cfg.unsloth_lora_o:
layer_modules = [
getattr(layer.self_attn, linear_proj) for linear_proj in ["o_proj"]
@@ -285,33 +264,6 @@ def integrate_lora_patch(peft_model: PeftModelForCausalLM, cfg):
layer.self_attn.apply_o = apply_lora_o
else:
layer.self_attn.apply_o = original_apply_o
LOG.warning(
logging.warning(
"unable to apply unsloth lora o_proj patch to layer %d", idx
)
def patch_unsloth_layernorm():
try:
import transformers.models.llama.modeling_llama
from unsloth.kernels.rms_layernorm import Fast_RMS_Layernorm
class LlamaRMSNorm(nn.Module):
"""LlamaRMSNorm"""
def __init__(self, hidden_size, eps=1e-6):
"""
LlamaRMSNorm is equivalent to T5LayerNorm
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
return Fast_RMS_Layernorm.apply(
hidden_states, self.weight, self.variance_epsilon, False
)
LOG.info("patching with unsloth.kernels.rms_layernorm")
transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm
except ImportError:
LOG.warning("missing unsloth library")
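The patch helpers above all follow the same source-rewrite pattern: fetch the original forward's source text, string-replace the decorators and the loss computation, exec the rewritten source, then rebind the method. A minimal, self-contained sketch of that pattern, with toy function names rather than axolotl's or unsloth's API:
import inspect

def slow_double(x):
    return x * 2  # stand-in for the original, unpatched computation

def fast_double(x):
    return x + x  # stand-in for an optimized kernel

# 1. fetch the source, 2. rewrite it as text, 3. exec it, 4. rebind the name
source = inspect.getsource(slow_double)
source = source.replace("def slow_double(", "def patched_double(", 1)
source = source.replace("x * 2", "fast_double(x)")

namespace = {"fast_double": fast_double}
exec(source, namespace)  # nosec - illustration of the pattern only
patched_double = namespace["patched_double"]
assert patched_double(3) == 6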

View File

@@ -6,16 +6,14 @@ import logging
from typing import Any, Dict, List, Optional
from axolotl.prompt_tokenizers import PromptTokenizingStrategy
from axolotl.prompters import IGNORE_TOKEN_ID, Prompter
from axolotl.prompters import Prompter
from axolotl.utils.chat_templates import chat_templates
# Configure the logger
LOG = logging.getLogger("axolotl")
LOG.setLevel(logging.INFO)
class ChatTemplatePrompter(Prompter):
"""Prompter for HF chat templates"""
"""prompter for HF chat templates"""
def __init__(
self,
@@ -24,8 +22,6 @@ class ChatTemplatePrompter(Prompter):
max_length=2048,
message_field_role: str = "from",
message_field_content: str = "value",
message_field_training: str = "train",
message_field_training_detail: str = "train_detail",
roles: Optional[Dict[str, List[str]]] = None,
drop_system_message: bool = False,
):
@@ -41,8 +37,6 @@ class ChatTemplatePrompter(Prompter):
}
self.message_field_role = message_field_role
self.message_field_content = message_field_content
self.message_field_training = message_field_training
self.message_field_training_detail = message_field_training_detail
self.tokenizer = tokenizer
self.chat_template = chat_template
self.max_length = max_length
@@ -53,7 +47,6 @@ class ChatTemplatePrompter(Prompter):
{
"role": self.roles[t[self.message_field_role]],
"content": t[self.message_field_content],
"training": t.get(self.message_field_training, None),
}
for t in conversation
]
@@ -69,108 +62,6 @@ class ChatTemplatePrompter(Prompter):
chat_template=self.chat_template,
)
def get_offsets_for_train_detail(
self, text: str, train_details: List[Dict], mask_untrainable: bool = True
) -> List[int]:
tokenized_output = self.tokenizer(
text, return_offsets_mapping=True, add_special_tokens=False
)
tokens = tokenized_output.tokens()
token_offsets = tokenized_output["offset_mapping"]
LOG.debug(f"Tokenizing text: {text}")
LOG.debug(f"Tokens: {tokens}")
# Adjust the end offsets. For some reason by default they are set to the same value as the start offsets.
for i in range(len(token_offsets) - 1):
token_offsets[i] = (token_offsets[i][0], token_offsets[i + 1][0] - 1)
# Ensure the last token's end offset is set correctly
token_offsets[-1] = (token_offsets[-1][0], len(text) - 1)
LOG.debug(f"Token offsets: {token_offsets}")
# Initialize all offsets as IGNORE_TOKEN_ID (not trained)
result = [IGNORE_TOKEN_ID] * len(token_offsets)
# Adjust train_details to align with token boundaries
adjusted_train_details = self.adjust_train_details(train_details, token_offsets)
for idx, (start, end) in enumerate(token_offsets):
for detail in adjusted_train_details:
# Check if the token is completely within the detail's range
if start >= detail["begin_offset"] and end <= detail["end_offset"]:
if detail["train"] or not mask_untrainable:
result[idx] = start
LOG.debug(f"Token {idx} ({tokens[idx]}) marked for training")
else:
LOG.debug(
f"Token {idx} ({tokens[idx]}) marked as non-trainable"
)
elif start < detail["end_offset"] and end > detail["begin_offset"]:
# Token partially overlaps with detail, always mark as non-trainable
LOG.debug(
f"Token {idx} ({tokens[idx]}) partially overlaps detail, marked as non-trainable"
)
LOG.debug(f"Final result: {result}")
return result
def adjust_train_details(
self, train_details: List[Dict], token_offsets: List[tuple]
) -> List[Dict]:
adjusted_details = []
for detail in train_details:
begin_offset = detail["begin_offset"]
end_offset = detail["end_offset"]
# Find the first token that starts after or at the begin_offset
begin_token = next(
(
i
for i, (t_start, t_end) in enumerate(token_offsets)
if t_start >= begin_offset
),
len(token_offsets),
)
if begin_token > 0 and token_offsets[begin_token - 1][1] > begin_offset:
begin_token -= 1
# Find the last token that ends before or at the end_offset
end_token = next(
(
i
for i in range(len(token_offsets) - 1, -1, -1)
if token_offsets[i][1] <= end_offset
),
-1,
)
if (
end_token < len(token_offsets) - 1
and token_offsets[end_token + 1][0] < end_offset
):
end_token += 1
if begin_token <= end_token:
adjusted_begin = token_offsets[begin_token][0]
adjusted_end = token_offsets[end_token][1]
if adjusted_begin != begin_offset or adjusted_end != end_offset:
LOG.warning(
f"Adjusting detail offsets: ({begin_offset}, {end_offset}) -> ({adjusted_begin}, {adjusted_end})"
)
adjusted_details.append(
{
"begin_offset": adjusted_begin,
"end_offset": adjusted_end,
"train": detail["train"],
}
)
else:
LOG.warning(
f"Could not adjust detail offsets: ({begin_offset}, {end_offset}). Skipping this detail."
)
return adjusted_details
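get_offsets_for_train_detail and adjust_train_details both lean on the tokenizer's offset mapping to convert character-level train spans into token indices. A hedged, standalone sketch of that mapping; the checkpoint name is only an example, any HF fast tokenizer works:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # example checkpoint only
text = "The answer is forty two."
char_span = (14, 24)  # characters covering "forty two."

encoded = tokenizer(text, return_offsets_mapping=True, add_special_tokens=False)
trainable_token_idxs = [
    i
    for i, (start, end) in enumerate(encoded["offset_mapping"])
    if start >= char_span[0] and end <= char_span[1]
]
print(trainable_token_idxs)  # only tokens fully inside the span are marked for training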
class ChatTemplateStrategy(PromptTokenizingStrategy):
"""
@@ -179,19 +70,6 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
_messages = "conversations"
def __init__(
self,
prompter,
tokenizer,
train_on_inputs,
sequence_len,
roles_to_train=None,
train_on_eos="last",
):
super().__init__(prompter, tokenizer, train_on_inputs, sequence_len)
self.roles_to_train = roles_to_train if roles_to_train is not None else []
self.train_on_eos = train_on_eos
@property
def messages(self):
return self._messages
@@ -201,170 +79,62 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
self._messages = messages
def tokenize_prompt(self, prompt):
turns = prompt[self.messages]
turns = self.get_conversation_thread(prompt)
prompt_ids = self.prompter.build_prompt(turns[:-1], add_generation_prompt=True)
input_ids = self.prompter.build_prompt(turns)
labels = [IGNORE_TOKEN_ID] * len(input_ids)
last_eos_idx = -1
for index, turn in enumerate(turns):
role = turn.get(self.prompter.message_field_role)
content = turn.get(self.prompter.message_field_content)
train_turn = turn.get(self.prompter.message_field_training)
train_detail = turn.get(self.prompter.message_field_training_detail)
if not self.train_on_inputs:
user_prompt_len = len(prompt_ids)
labels = [-100] * user_prompt_len + input_ids[user_prompt_len:]
else:
labels = input_ids
LOG.debug(
f"Processing turn {index}: role={role}, content={content}, train_turn={train_turn}, train_detail={train_detail}"
)
should_train = (
train_turn
if train_turn is not None
else bool(train_detail is not None)
if train_detail is not None
else self.train_on_inputs or role in self.roles_to_train
)
LOG.debug(f"Should train: {should_train}")
turn_start_idx, turn_end_idx = self.find_turn(
conversation_ids=input_ids, turn=index, turn_content=turn
)
LOG.debug(f"Turn indices: start={turn_start_idx}, end={turn_end_idx}")
if should_train and turn_start_idx != -1 and turn_end_idx != -1:
if train_detail:
token_offsets = self.prompter.get_offsets_for_train_detail(
content, train_detail
)
LOG.debug(f"Token offsets: {token_offsets}")
for i, offset in enumerate(token_offsets):
if offset != IGNORE_TOKEN_ID and turn_start_idx + i < len(
input_ids
):
labels[turn_start_idx + i] = input_ids[turn_start_idx + i]
LOG.debug(
f"Label set at index {turn_start_idx + i}: {input_ids[turn_start_idx + i]}"
)
else:
labels[turn_start_idx:turn_end_idx] = input_ids[
turn_start_idx:turn_end_idx
]
LOG.debug(f"Labels set for range {turn_start_idx}:{turn_end_idx}")
LOG.debug(f"Labels after processing turn {index}: {labels}")
# Handle EOS token
eos_idx = self.find_eos_token(input_ids, turn_end_idx)
if eos_idx == turn_end_idx:
last_eos_idx = eos_idx
if self.train_on_eos == "all" or (
self.train_on_eos == "turn" and should_train
):
labels[eos_idx] = input_ids[eos_idx]
LOG.debug(f"EOS token set for training at index {eos_idx}")
else:
LOG.debug(
f"EOS token missing after turn {turn}. eos_idx: {eos_idx}, turn_end_idx: {turn_end_idx}"
)
# Handle 'last' option for train_on_eos
if self.train_on_eos == "last" and last_eos_idx != -1:
labels[last_eos_idx] = input_ids[last_eos_idx]
LOG.debug(f"Last EOS token set for training at index {last_eos_idx}")
LOG.debug(f"Final labels: {labels}")
return {
tokenized_prompt = {
"input_ids": input_ids,
"labels": labels,
"attention_mask": [1] * len(input_ids),
}
def find_eos_token(self, input_ids, start_idx):
eos_token_id = self.tokenizer.eos_token_id
for i in range(start_idx, len(input_ids)):
if input_ids[i] == eos_token_id:
return i
return -1
def find_turn(self, conversation_ids, turn, turn_content):
"""
Locate the starting and ending indices of the specified turn in a conversation.
Args:
conversation_ids (list[int]): Token IDs representing the conversation.
turn (int): The turn number to locate (based on EOS tokens).
turn_content (str): String containing the content of the turn.
Returns:
tuple: (start_idx, end_idx) indices of the start and end of the turn content.
Returns (-1, -1) if the turn content is not found.
"""
content = turn_content.get(self.prompter.message_field_content, "")
content_ids = self.tokenizer.encode(content, add_special_tokens=False)
eos_token_id = self.tokenizer.eos_token_id
eos_count = 0
start_search_idx = 0
# Locate the starting index after the specified number of EOS tokens
for i, token_id in enumerate(conversation_ids):
if token_id == eos_token_id:
eos_count += 1
if eos_count == turn:
start_search_idx = (
i + 1
) # Start searching after the specified turn's EOS token
break
# Find the start index of the content within the conversation
start_idx = -1
for i in range(start_search_idx, len(conversation_ids) - len(content_ids) + 1):
if conversation_ids[i : i + len(content_ids)] == content_ids:
start_idx = i
break
if start_idx != -1:
end_idx = start_idx + len(content_ids)
else:
end_idx = -1
return start_idx, end_idx
return tokenized_prompt
def get_conversation_thread(self, prompt):
return prompt[self.messages]
def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
ds_cfg = ds_cfg or {}
prompter_params = {
"tokenizer": tokenizer,
"chat_template": chat_templates(ds_cfg.get("chat_template", "chatml")),
"message_field_role": ds_cfg.get("message_field_role", "from"),
"message_field_content": ds_cfg.get("message_field_content", "value"),
"message_field_training": ds_cfg.get("message_field_training", "training"),
"message_field_training_detail": ds_cfg.get(
"message_field_training_detail", "train_detail"
),
"roles": ds_cfg.get("roles"),
"drop_system_message": ds_cfg.get("drop_system_message", False),
"max_length": cfg.sequence_len,
}
strategy_params = {
"train_on_inputs": cfg.train_on_inputs,
"sequence_len": cfg.sequence_len,
"roles_to_train": ds_cfg.get("roles_to_train", ["gpt", "assistant"]),
"train_on_eos": ds_cfg.get("train_on_eos", "last"),
}
strategy = ChatTemplateStrategy(
ChatTemplatePrompter(**prompter_params), tokenizer=tokenizer, **strategy_params
chat_template = (
ds_cfg["chat_template"] if ds_cfg and "chat_template" in ds_cfg else "chatml"
)
message_field_role = (
ds_cfg["message_field_role"]
if ds_cfg and "message_field_role" in ds_cfg
else "from"
)
message_field_content = (
ds_cfg["message_field_content"]
if ds_cfg and "message_field_content" in ds_cfg
else "value"
)
roles = ds_cfg["roles"] if ds_cfg and "roles" in ds_cfg else None
drop_system_message = (
ds_cfg["drop_system_message"]
if ds_cfg and "drop_system_message" in ds_cfg
else False
)
if "field_messages" in ds_cfg and hasattr(strategy, "messages"):
strategy = ChatTemplateStrategy(
ChatTemplatePrompter(
tokenizer,
chat_templates(chat_template),
message_field_role=message_field_role,
message_field_content=message_field_content,
roles=roles,
drop_system_message=drop_system_message,
),
tokenizer,
cfg.train_on_inputs,
cfg.sequence_len,
)
if ds_cfg and "field_messages" in ds_cfg and hasattr(strategy, "messages"):
strategy.messages = ds_cfg["field_messages"]
return strategy

View File

@@ -1,78 +0,0 @@
"""
DPO prompt strategies for using tokenizer chat templates.
"""
from axolotl.utils.chat_templates import chat_templates
def default(
cfg, dataset_idx=0, **kwargs
): # pylint: disable=possibly-unused-variable,unused-argument
ds_cfg = cfg["datasets"][dataset_idx]
chat_template_str = chat_templates(cfg.chat_template)
field_messages = ds_cfg.get("field_messages", "messages")
field_chosen = ds_cfg.get("field_chosen", "chosen")
field_rejected = ds_cfg.get("field_rejected", "rejected")
field_message_role = ds_cfg.get("message_field_role", "role")
field_message_content = ds_cfg.get("message_field_content", "content")
role_map_inv = ds_cfg.get(
"roles",
{
"user": ["user"],
"assistant": ["assistant"],
"system": ["system"],
},
)
role_map = {}
for target, sources in role_map_inv.items():
for source in sources:
role_map[source] = target
def transform_fn(sample, tokenizer=None):
messages = sample[field_messages]
messages = [
{
"role": role_map[m[field_message_role]],
"content": m[field_message_content],
}
for m in messages
]
chosen = {
"role": role_map[sample[field_chosen][field_message_role]],
"content": sample[field_chosen][field_message_content],
}
rejected = {
"role": role_map[sample[field_rejected][field_message_role]],
"content": sample[field_rejected][field_message_content],
}
result = {}
result["prompt"] = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
chat_template=chat_template_str,
tokenize=False,
)
result["chosen"] = tokenizer.apply_chat_template(
[chosen],
add_generation_prompt=False,
chat_template=chat_template_str,
tokenize=False,
)
chosen_strip_index = result["chosen"].find(chosen["content"])
result["chosen"] = result["chosen"][chosen_strip_index:].rstrip()
result["rejected"] = tokenizer.apply_chat_template(
[rejected],
add_generation_prompt=False,
chat_template=chat_template_str,
tokenize=False,
)
rejected_strip_index = result["rejected"].find(rejected["content"])
result["rejected"] = result["rejected"][rejected_strip_index:].rstrip()
return result
return transform_fn
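transform_fn above renders the shared prompt with add_generation_prompt=True and renders chosen/rejected separately, trimming everything before the assistant content. A hedged sketch of that trimming; the checkpoint below is illustrative only and any tokenizer with a chat template would do:
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")  # example only
messages = [{"role": "user", "content": "Name a prime number."}]
chosen = {"role": "assistant", "content": "7 is prime."}

prompt = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
chosen_text = tok.apply_chat_template([chosen], add_generation_prompt=False, tokenize=False)
# keep only the completion, mirroring the find()/rstrip() trimming above
chosen_text = chosen_text[chosen_text.find(chosen["content"]):].rstrip()
print({"prompt": prompt, "chosen": chosen_text})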

View File

@@ -19,7 +19,6 @@ from transformers import PreTrainedModel, PreTrainedTokenizer
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
from axolotl.common.cli import TrainerCliArgs
from axolotl.core.tokenizer_utils import fix_untrained_tokens
from axolotl.logging_config import configure_logging
from axolotl.utils.dict import DictDefault
from axolotl.utils.freeze import freeze_layers_except
@@ -53,15 +52,6 @@ class TrainDatasetMeta:
def train(
*, cfg: DictDefault, cli_args: TrainerCliArgs, dataset_meta: TrainDatasetMeta
) -> Tuple[Union[PeftModel, PreTrainedModel], PreTrainedTokenizer]:
# enable expandable segments for cuda allocation to improve VRAM usage
torch_version = torch.__version__.split(".")
torch_major, torch_minor = int(torch_version[0]), int(torch_version[1])
if torch_major == 2 and torch_minor >= 2:
if os.getenv("PYTORCH_CUDA_ALLOC_CONF") is None:
os.environ[
"PYTORCH_CUDA_ALLOC_CONF"
] = "expandable_segments:True,roundup_power2_divisions:16"
# load the tokenizer first
LOG.debug(
f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
@@ -124,13 +114,6 @@ def train(
total_num_steps,
)
if cfg.fix_untrained_tokens:
fix_untrained_tokens(model, tokenizer, train_dataset)
if cfg.local_rank == 0:
model.save_pretrained(
str(Path(cfg.output_dir)), safe_serialization=safe_serialization
)
# go ahead and presave, so we have the adapter config available to inspect
if peft_config:
LOG.info(f"Pre-saving adapter config to {cfg.output_dir}")
@@ -212,23 +195,26 @@ def train(
elif cfg.deepspeed and is_deepspeed_zero3_enabled():
# Copied over from: https://github.com/huggingface/accelerate/blob/5ae611118057232f441055f7ef9ba0b0f2b8d533/docs/source/usage_guides/deepspeed.md#saving-and-loading
trainer.accelerator.wait_for_everyone()
trainer.save_model(cfg.output_dir)
unwrapped_model = trainer.accelerator.unwrap_model(trainer.model_wrapped)
# the trainer saved a model.safetensors file in the output directory,
# but it is most likely a proxy model and if so, should be deleted
maybe_proxy = os.path.exists(os.path.join(cfg.output_dir, "model.safetensors"))
maybe_sharded = os.path.exists(
os.path.join(cfg.output_dir, "model.safetensors.index.json")
)
if maybe_proxy and maybe_sharded:
# but it is a proxy model and should be deleted
if os.path.exists(os.path.join(cfg.output_dir, "model.safetensors")):
LOG.info(f"Deleting {os.path.join(cfg.output_dir, 'model.safetensors')}")
LOG.info("This is a proxy model and should be deleted")
try:
os.remove(os.path.join(cfg.output_dir, "model.safetensors"))
except FileNotFoundError:
pass
os.remove(os.path.join(cfg.output_dir, "model.safetensors"))
# Saves the whole/unpartitioned fp16 model when in ZeRO Stage-3 to the output directory if
# `stage3_gather_16bit_weights_on_model_save` is True in DeepSpeed Config file or
# `zero3_save_16bit_model` is True in DeepSpeed Plugin.
# For Zero Stages 1 and 2, models are saved as usual in the output directory.
# The model name saved is `pytorch_model.bin`
unwrapped_model.save_pretrained(
cfg.output_dir,
is_main_process=trainer.accelerator.is_main_process,
save_function=trainer.accelerator.save,
state_dict=trainer.accelerator.get_state_dict(trainer.model_wrapped),
)
elif cfg.local_rank == 0:
if cfg.flash_optimum and BetterTransformer:
model = BetterTransformer.reverse(model)

View File

@@ -26,7 +26,6 @@ def chat_templates(user_choice: str):
"cohere": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
"llama3": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
"phi_3": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
"deepseek_v2": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '<User>' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '<Assistant>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<Assistant>' }}{% endif %}",
}
if user_choice in templates:

View File

@@ -7,7 +7,6 @@ Module for pydantic models for configuration
import logging
import os
from enum import Enum
from importlib.metadata import version
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
from pydantic import BaseModel, Field, conlist, field_validator, model_validator
@@ -78,7 +77,6 @@ class PretrainingDataset(BaseModel):
split: Optional[str] = "train"
text_column: Optional[str] = "text"
type: Optional[str] = "pretrain"
trust_remote_code: Optional[bool] = False
class UserDefinedPrompterType(BaseModel):
@@ -116,16 +114,10 @@ class SFTDataset(BaseModel):
field_messages: Optional[str] = None
message_field_role: Optional[str] = None
message_field_content: Optional[str] = None
message_field_training: Optional[str] = None
message_field_training_detail: Optional[str] = None
roles_to_train: Optional[List[str]] = None
train_on_eos: Optional[str] = None
roles: Optional[Dict[str, List[str]]] = None
drop_system_message: Optional[bool] = None
trust_remote_code: Optional[bool] = False
class UserDefinedDPOType(BaseModel):
"""User defined typing for DPO"""
@@ -166,7 +158,6 @@ class KTODataset(BaseModel):
split: Optional[str] = None
type: Optional[Union[UserDefinedKTOType, str]] = None
data_files: Optional[List[str]] = None
trust_remote_code: Optional[bool] = False
class RLType(str, Enum):
@@ -176,7 +167,6 @@ class RLType(str, Enum):
ipo = "ipo" # pylint: disable=invalid-name
orpo = "orpo" # pylint: disable=invalid-name
kto = "kto" # pylint: disable=invalid-name
simpo = "simpo" # pylint: disable=invalid-name
class ChatTemplate(str, Enum):
@@ -189,7 +179,6 @@ class ChatTemplate(str, Enum):
cohere = "cohere" # pylint: disable=invalid-name
llama3 = "llama3" # pylint: disable=invalid-name
phi_3 = "phi_3" # pylint: disable=invalid-name
deepseek_v2 = "deepseek_v2" # pylint: disable=invalid-name
class LoftQConfig(BaseModel):
@@ -236,12 +225,6 @@ class LoraConfig(BaseModel):
peft_use_rslora: Optional[bool] = None
peft_layer_replication: Optional[List[Tuple[int, int]]] = None
qlora_sharded_model_loading: Optional[bool] = Field(
default=False,
metadata={
"help": "load qlora model in sharded format for FSDP using answer.ai technique."
},
)
lora_on_cpu: Optional[bool] = None
gptq: Optional[bool] = None
bnb_config_kwargs: Optional[Dict[str, Any]] = None
@@ -321,8 +304,6 @@ class ModelInputConfig(BaseModel):
)
trust_remote_code: Optional[bool] = None
model_kwargs: Optional[Dict[str, Any]] = None
@field_validator("trust_remote_code")
@classmethod
def hint_trust_remote_code(cls, trust_remote_code):
@@ -360,16 +341,7 @@ class HyperparametersConfig(BaseModel):
learning_rate: Union[str, float]
weight_decay: Optional[float] = 0.0
optimizer: Optional[
Union[
OptimizerNames,
Literal[
"lion_pytorch",
"optimi_adamw",
"ao_adamw_4bit",
"ao_adamw_8bit",
"ao_adamw_fp8",
],
]
Union[OptimizerNames, Literal["lion_pytorch", "optimi_adamw"]]
] = OptimizerNames.ADAMW_HF.value
optim_args: Optional[Union[str, Dict[str, Any]]] = Field(
default=None, metadata={"help": "Optional arguments to supply to optimizer."}
@@ -381,7 +353,7 @@ class HyperparametersConfig(BaseModel):
},
)
torchdistx_path: Optional[str] = None
lr_scheduler: Optional[Union[SchedulerType, Literal["one_cycle"]]] = "cosine"
lr_scheduler: Optional[SchedulerType] = "cosine"
lr_scheduler_kwargs: Optional[Dict[str, Any]] = None
lr_quadratic_warmup: Optional[bool] = None
cosine_min_lr_ratio: Optional[float] = None
@@ -532,8 +504,6 @@ class AxolotlInputConfig(
dataloader_prefetch_factor: Optional[int] = None
dataloader_drop_last: Optional[bool] = None
accelerator_config: Optional[Dict[str, Any]] = None
remove_unused_columns: Optional[bool] = None
push_dataset_to_hub: Optional[str] = None
@@ -616,14 +586,10 @@ class AxolotlInputConfig(
flash_attn_fuse_mlp: Optional[bool] = None
flash_optimum: Optional[bool] = None
eager_attention: Optional[bool] = None
unsloth_cross_entropy_loss: Optional[bool] = None
unsloth_lora_mlp: Optional[bool] = None
unsloth_lora_qkv: Optional[bool] = None
unsloth_lora_o: Optional[bool] = None
unsloth_rms_norm: Optional[bool] = None
unsloth_rope: Optional[bool] = None
deepspeed: Optional[Union[str, Dict[str, Any]]] = None
fsdp: Optional[List[str]] = None
@@ -636,9 +602,6 @@ class AxolotlInputConfig(
torch_compile: Optional[bool] = None
torch_compile_backend: Optional[str] = None
torch_compile_mode: Optional[
Literal["default", "reduce-overhead", "max-autotune"]
] = None
max_steps: Optional[int] = None
warmup_steps: Optional[int] = None
@@ -660,8 +623,6 @@ class AxolotlInputConfig(
orpo_alpha: Optional[float] = None
rpo_alpha: Optional[float] = None
simpo_gamma: Optional[float] = None
cpo_alpha: Optional[float] = None
kto_desirable_weight: Optional[float] = None
kto_undesirable_weight: Optional[float] = None
@@ -676,8 +637,6 @@ class AxolotlInputConfig(
chat_template: Optional[ChatTemplate] = None
default_system_message: Optional[str] = None
fix_untrained_tokens: Optional[bool] = None
# INTERNALS - document for now, generally not set externally
is_preprocess: Optional[bool] = None
@@ -743,24 +702,6 @@ class AxolotlInputConfig(
)
return data
@model_validator(mode="before")
@classmethod
def check_pretraining_split_batches_accelerate(cls, data):
# alternatively set ACCELERATE_SPLIT_BATCHES=False
if data.get("pretraining_dataset"):
accelerator_config = data.get("accelerator_config", {})
if not accelerator_config:
data["accelerator_config"] = {
"split_batches": False,
"dispatch_batches": False,
}
else:
if accelerator_config.get("split_batches") is None:
data["accelerator_config"]["split_batches"] = False
if accelerator_config.get("dispatch_batches") is None:
data["accelerator_config"]["dispatch_batches"] = False
return data
@model_validator(mode="before")
@classmethod
def check_gptq_w_revision(cls, data):
@@ -879,7 +820,7 @@ class AxolotlInputConfig(
@model_validator(mode="after")
def check_adamw_optimizer_params(self):
if any([self.adam_beta1, self.adam_beta2, self.adam_epsilon]) and (
not self.optimizer or "adamw" not in str(self.optimizer).lower()
not self.optimizer or "adamw" not in self.optimizer.value
):
LOG.warning("adamw hyperparameters found, but no adamw optimizer set")
return self
@@ -950,8 +891,6 @@ class AxolotlInputConfig(
@model_validator(mode="before")
@classmethod
def check_eval_packing(cls, data):
# TODO also should check test_datasets and val_set_size as we can skip
# if there are no eval datasets/splits
if (
data.get("sample_packing")
and data.get("eval_table_size")
@@ -1173,55 +1112,6 @@ class AxolotlInputConfig(
raise ValueError("either datasets or pretraining_dataset is required")
return data
@model_validator(mode="before")
@classmethod
def check_xentropy_patch_conflicts(cls, data):
if data.get("flash_attn_cross_entropy") and data.get(
"unsloth_cross_entropy_loss"
):
raise ValueError(
"flash_attn_cross_entropy and unsloth_cross_entropy_loss cannot be both enabled"
)
return data
@model_validator(mode="before")
@classmethod
def check_qlora_unsloth(cls, data):
if (
data.get("unsloth_lora_mlp")
or data.get("unsloth_lora_qkv")
or data.get("unsloth_lora_o")
):
if data.get("adapter") == "lora" or data.get("load_in_8bit"):
raise ValueError(
"unsloth_lora_mlp, unsloth_lora_qkv, and unsloth_lora_o are not compatible with 8-bit LoRA"
)
return data
@model_validator(mode="before")
@classmethod
def check_unsloth_xformers_version(cls, data):
if (
data.get("unsloth_lora_mlp")
or data.get("unsloth_lora_qkv")
or data.get("unsloth_lora_o")
):
xformers_version = version("xformers")
if xformers_version == "0.0.27":
raise ValueError(
"xformers version 0.0.27 is not supported with unsloth. Please downgrade to 0.0.26.post1"
)
return data
@model_validator(mode="before")
@classmethod
def check_torch_compile_deepspeed(cls, data):
if data.get("deepspeed") and data.get("torch_compile"):
raise ValueError(
"torch_compile should be set within your deepspeed config file"
)
return data
class AxolotlConfigWCapabilities(AxolotlInputConfig):
"""wrapper to validate gpu capabilities with the configured options"""
@@ -1273,18 +1163,3 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
if data.get("deepspeed") and data.get("fsdp"):
raise ValueError("deepspeed and fsdp cannot be used together.")
return data
@model_validator(mode="before")
@classmethod
def check_multigpu_unsloth(cls, data):
if (
data.get("unsloth_lora_mlp")
or data.get("unsloth_lora_qkv")
or data.get("unsloth_lora_o")
):
capabilities = data.get("capabilities")
if capabilities and capabilities.get("n_gpu", 0) > 1:
raise ValueError(
"unsloth_lora_mlp, unsloth_lora_qkv, and unsloth_lora_o are not compatible with multi-GPU training."
)
return data
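The config classes above stack many model_validator(mode="before") checks that inspect the raw dict and fail fast on incompatible options. A minimal sketch of that pattern, assuming pydantic v2; the field names are illustrative:
from typing import Optional
from pydantic import BaseModel, model_validator

class ExampleConfig(BaseModel):
    deepspeed: Optional[str] = None
    torch_compile: Optional[bool] = None

    @model_validator(mode="before")
    @classmethod
    def check_torch_compile_deepspeed(cls, data):
        # reject the combination before the model is ever constructed
        if data.get("deepspeed") and data.get("torch_compile"):
            raise ValueError("torch_compile should be set within your deepspeed config file")
        return data

ExampleConfig(deepspeed="zero3.json")  # ok
# ExampleConfig(deepspeed="zero3.json", torch_compile=True)  # raises ValueError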

View File

@@ -1,5 +1,4 @@
"""data handling specific to DPO"""
import inspect
import logging
from functools import partial

View File

@@ -42,7 +42,7 @@ from axolotl.prompters import (
from axolotl.utils.data.pretraining import wrap_pretraining_dataset
from axolotl.utils.data.utils import md5
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import is_local_main_process, zero_first
from axolotl.utils.distributed import is_main_process, zero_first
from axolotl.utils.trainer import (
calculate_total_num_steps,
process_datasets_for_packing,
@@ -54,7 +54,7 @@ LOG = logging.getLogger("axolotl")
def prepare_dataset(cfg, tokenizer):
prompters = []
if not cfg.pretraining_dataset:
with zero_first(is_local_main_process()):
with zero_first(is_main_process()):
if cfg.test_datasets:
train_dataset, _, prompters = load_prepare_datasets(
tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH, split="train"
@@ -160,12 +160,8 @@ def load_tokenized_prepared_datasets(
use_auth_token = cfg.hf_use_auth_token
try:
if cfg.push_dataset_to_hub:
LOG.info(
f"Attempting to load prepared dataset from Huggingface hub at {cfg.push_dataset_to_hub} (version {ds_hash})..."
)
dataset = load_dataset(
cfg.push_dataset_to_hub,
ds_hash,
f"{cfg.push_dataset_to_hub}/{ds_hash}",
token=use_auth_token,
)
dataset = dataset[split]
@@ -174,7 +170,6 @@ def load_tokenized_prepared_datasets(
# pylint: disable=duplicate-code
if dataset:
# This is for the case where we already loaded a pretokenized dataset from the hub
...
elif (
cfg.dataset_prepared_path
@@ -185,14 +180,7 @@ def load_tokenized_prepared_datasets(
dataset = load_from_disk(str(prepared_ds_path))
LOG.info("Prepared dataset loaded from disk...")
else:
if cfg.push_dataset_to_hub:
LOG.info("Unable to find prepared dataset in Huggingface hub")
if cfg.is_preprocess:
LOG.info(
f"Skipping prepared dataset in {prepared_ds_path} for pre-processing..."
)
else:
LOG.info(f"Unable to find prepared dataset in {prepared_ds_path}")
LOG.info(f"Unable to find prepared dataset in {prepared_ds_path}")
LOG.info("Loading raw datasets...")
if not cfg.is_preprocess:
LOG.warning(
@@ -210,8 +198,6 @@ def load_tokenized_prepared_datasets(
def for_d_in_datasets(dataset_configs):
for dataset in dataset_configs:
if dataset.name and isinstance(dataset.name, list):
# load_dataset doesn't properly handle multiple named configurations
# at the same time for a given dataset
for name in dataset.name:
yield DictDefault({**dataset, "name": name})
else:
@@ -222,8 +208,6 @@ def load_tokenized_prepared_datasets(
ds: Optional[Union[Dataset, DatasetDict]] = None
ds_from_hub = False
try:
# this is just a basic check to see if the path is a
# valid HF dataset that's loadable
load_dataset(
config_dataset.path,
name=config_dataset.name,
@@ -444,12 +428,10 @@ def load_tokenized_prepared_datasets(
dataset.save_to_disk(str(prepared_ds_path))
if cfg.push_dataset_to_hub:
LOG.info(
f"Pushing merged prepared dataset to Huggingface hub at {cfg.push_dataset_to_hub} (version {ds_hash})..."
f"Saving merged prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
)
dataset.push_to_hub(
cfg.push_dataset_to_hub,
ds_hash,
private=True,
f"{cfg.push_dataset_to_hub}/{ds_hash}", private=True
)
return dataset, prompters

View File

@@ -44,10 +44,6 @@ def is_main_process():
return dist.get_rank() == 0
def is_local_main_process():
return PartialState().is_main_process
def get_world_size():
return int(os.getenv("WORLD_SIZE", "1"))
@@ -153,11 +149,11 @@ def compute_and_broadcast(fn): # pylint: disable=invalid-name
if is_main_process():
value_scalar = fn()
value_tensor = torch.tensor(
value_scalar, device=torch.cuda.current_device(), dtype=torch.float32
)
value_scalar, device=torch.cuda.current_device()
).float()
else:
value_tensor = torch.tensor(
0.0, device=torch.cuda.current_device(), dtype=torch.float32
0.0, device=torch.cuda.current_device()
) # Placeholder tensor
# Broadcast the tensor to all processes.
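compute_and_broadcast above evaluates a scalar on the main process only and then broadcasts it so every rank ends up with the same value. A condensed sketch of that pattern, assuming torch.distributed is already initialized:
import torch
import torch.distributed as dist

def compute_and_broadcast(fn):
    device = torch.cuda.current_device()
    if dist.get_rank() == 0:
        value = torch.tensor(float(fn()), device=device, dtype=torch.float32)
    else:
        value = torch.tensor(0.0, device=device, dtype=torch.float32)  # placeholder
    dist.broadcast(value, src=0)  # every rank now holds rank 0's result
    return value.item()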

View File

@@ -13,7 +13,6 @@ from fastcore.parallel import parallel
from torch import Tensor, nn
from tqdm import tqdm
from transformers import AutoModelForCausalLM
from transformers.quantizers import AutoHfQuantizer
from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, hub
@@ -174,7 +173,6 @@ def load_sharded_model_quant(
low_memory=True,
verbose=False,
loading_workers=2,
quantization_config=None,
):
with init_empty_weights():
model = AutoModelForCausalLM.from_config(
@@ -188,26 +186,15 @@ def load_sharded_model_quant(
compute_dtype=compute_dtype,
quant_type="nf4",
quant_storage=quant_storage,
compress_statistics=True, # bnb_4bit_use_double_quant
skip_modules=[
"lm_head",
"embed_out",
],
)
else:
# this is the more common case with HF transformers
# TODO can we detect the model arch and dynamically set skip_modules
model.model = _replace_linear(
model.model,
Linear4bit,
compute_dtype=compute_dtype,
quant_type="nf4",
quant_storage=quant_storage,
compress_statistics=True, # bnb_4bit_use_double_quant
skip_modules=[
"lm_head",
"embed_out",
],
)
model.is_loaded_in_4bit = True
@@ -264,11 +251,6 @@ def load_sharded_model_quant(
quant_method=quant_method,
)
# these attributes are needed to inform transformers/peft of the quantization
model.is_quantized = True
model.quantization_method = "bitsandbytes"
model.hf_quantizer = AutoHfQuantizer.from_config(quantization_config)
if cfg.local_rank == 0 and verbose:
print(f"Loaded model weights in {time.time()-start:.3f} seconds")
# cleanup any extra memory usage from parallel loading

View File

@@ -1,7 +1,7 @@
"""Module for models and model loading"""
# pylint: disable=too-many-lines
import gc
import logging
import math
import os
@@ -29,7 +29,6 @@ from transformers import ( # noqa: F401
AutoConfig,
AutoModelForCausalLM,
AutoTokenizer,
AwqConfig,
BitsAndBytesConfig,
GPTQConfig,
PreTrainedModel,
@@ -37,7 +36,6 @@ from transformers import ( # noqa: F401
)
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
from axolotl.common.architectures import MOE_ARCH_BLOCK
from axolotl.models.mamba import fix_mamba_attn_for_loss
from axolotl.monkeypatch.multipack import (
SUPPORTED_MULTIPACK_MODEL_TYPES,
@@ -96,7 +94,7 @@ def check_model_config(cfg: DictDefault, model_config: Union[AutoConfig, DictDef
"Please make sure to point to a GPTQ model."
)
if not cfg.gptq and quant_config_exists and not cfg.load_in_4bit:
if not cfg.gptq and quant_config_exists:
raise ValueError(
"model_config.quantization_config is set but `gptq` flag is not. "
"Please use the `gptq` flag to train quantized model or point to a non-quantized model."
@@ -348,36 +346,7 @@ def load_model(
and cfg.flash_attention
and cfg.sample_packing
):
patch_for_multipack(
cfg.model_config_type,
model_name=cfg.base_model,
is_remote_code=cfg.trust_remote_code,
)
if cfg.is_llama_derived_model:
from axolotl.monkeypatch.llama_attn_hijack_flash import (
patch_llama_cross_entropy,
patch_llama_rms_norm,
)
if cfg.flash_attn_cross_entropy:
patch_llama_cross_entropy()
if cfg.flash_attn_rms_norm:
patch_llama_rms_norm()
elif cfg.unsloth_rms_norm:
from axolotl.monkeypatch.unsloth_ import patch_unsloth_layernorm
patch_unsloth_layernorm()
if cfg.unsloth_cross_entropy_loss:
from axolotl.monkeypatch.unsloth_ import (
integrate_cross_entropy_loss_patch,
)
integrate_cross_entropy_loss_patch(model_type="llama")
if cfg.unsloth_lora_qkv or cfg.unsloth_lora_o:
from axolotl.monkeypatch.unsloth_ import patch_self_attn_lora
patch_self_attn_lora()
patch_for_multipack(cfg.model_config_type, model_name=cfg.base_model)
elif cfg.is_llama_derived_model:
# Modify all llama derived models in one block
@@ -430,7 +399,7 @@ def load_model(
if cfg.unsloth_cross_entropy_loss:
from axolotl.monkeypatch.unsloth_ import integrate_cross_entropy_loss_patch
integrate_cross_entropy_loss_patch(model_type="llama")
integrate_cross_entropy_loss_patch()
if cfg.unsloth_lora_qkv or cfg.unsloth_lora_o:
from axolotl.monkeypatch.unsloth_ import patch_self_attn_lora
@@ -438,12 +407,23 @@ def load_model(
patch_self_attn_lora()
# Modify mistral derived models
if cfg.model_config_type == "mistral" and cfg.flash_attn_cross_entropy_loss:
if (
cfg.model_config_type == "mistral"
and cfg.flash_attention
and cfg.sample_packing
):
from axolotl.monkeypatch.mistral_attn_hijack_flash import (
patch_mistral_cross_entropy,
replace_mistral_attn_with_flash_attn,
)
patch_mistral_cross_entropy()
LOG.info("patching mistral with flash attention")
replace_mistral_attn_with_flash_attn(packed=cfg.sample_packing)
if cfg.is_llama_derived_model and cfg.sample_packing and not inference:
from axolotl.monkeypatch.llama_expand_mask import hijack_expand_mask
LOG.info("patching _expand_mask")
hijack_expand_mask()
model_kwargs: Dict[str, Any] = {}
@@ -516,25 +496,7 @@ def load_model(
model_kwargs["quantization_config"] = GPTQConfig(
**model_config.quantization_config
)
if (
cfg.adapter in ["qlora", "lora"]
and hasattr(model_config, "quantization_config")
and model_config.quantization_config["quant_method"]
in ["gptq", "awq", "bitsandbytes"]
):
if model_config.quantization_config["quant_method"] == "gptq":
model_kwargs["quantization_config"] = GPTQConfig(
**model_config.quantization_config
)
elif model_config.quantization_config["quant_method"] == "awq":
model_kwargs["quantization_config"] = AwqConfig(
**model_config.quantization_config
)
elif model_config.quantization_config["quant_method"] == "bitsandbytes":
model_kwargs["quantization_config"] = BitsAndBytesConfig(
**model_config.quantization_config
)
elif cfg.adapter == "qlora" and cfg.load_in_4bit:
if cfg.adapter == "qlora" and cfg.load_in_4bit:
bnb_config = {
"load_in_4bit": True,
"llm_int8_threshold": 6.0,
@@ -628,21 +590,14 @@ def load_model(
elif (
qlora_fsdp
and cfg.fsdp_config.fsdp_cpu_ram_efficient_loading
and (cfg.model_config_type == "dbrx" or cfg.qlora_sharded_model_loading)
and cfg.model_config_type == "dbrx"
):
quant_storage = cfg.torch_dtype
quantization_config = hasattr(
model_config, "quantization_config"
) and getattr(model_config, "quantization_config")
quantization_config = (
quantization_config or model_kwargs["quantization_config"]
)
model = load_sharded_model_quant(
base_model,
model_config,
cfg,
quant_storage=quant_storage,
quantization_config=quantization_config,
)
skip_move_to_device = True
elif (
@@ -650,7 +605,7 @@ def load_model(
and not cfg.trust_remote_code
and not cfg.gptq
):
if cfg.fsdp and cfg.fsdp_config.fsdp_cpu_ram_efficient_loading:
if qlora_fsdp and cfg.fsdp_config.fsdp_cpu_ram_efficient_loading:
skip_move_to_device = True
if "device_map" in model_kwargs:
del model_kwargs["device_map"]
@@ -732,7 +687,7 @@ def load_model(
**model_kwargs,
)
else:
if cfg.fsdp and cfg.fsdp_config.fsdp_cpu_ram_efficient_loading:
if qlora_fsdp and cfg.fsdp_config.fsdp_cpu_ram_efficient_loading:
# disabling either of these two still leads to VRAM spike before setting back down
skip_move_to_device = True
if "device_map" in model_kwargs:
@@ -816,16 +771,12 @@ def load_model(
set_z3_leaf_modules,
)
if cfg.model_config_type in MOE_ARCH_BLOCK:
moe_blocks = MOE_ARCH_BLOCK[cfg.model_config_type]
moe_blocks = [moe_blocks] if isinstance(moe_blocks, str) else moe_blocks
set_z3_leaf_modules(
model,
[
get_module_class_from_name(model, module_name)
for module_name in moe_blocks
],
)
if cfg.model_config_type == "mixtral":
moe_block = get_module_class_from_name(model, "MixtralSparseMoeBlock")
set_z3_leaf_modules(model, [moe_block])
elif cfg.model_config_type == "dbrx":
moe_block = get_module_class_from_name(model, "DbrxFFN")
set_z3_leaf_modules(model, [moe_block])
if cfg.model_config_type == "qwen" and cfg.adapter == "lora":
# Qwen doesn't play nicely with LoRA if this is enabled
@@ -839,9 +790,6 @@ def load_model(
# make sure everything is in the same dtype
skip_prepare_model_for_kbit_training = True
if is_deepspeed_zero3_enabled():
skip_prepare_model_for_kbit_training = True
if cfg.adapter in ["lora", "qlora"]:
if cfg.gradient_checkpointing:
model.gradient_checkpointing_enable(
@@ -876,9 +824,6 @@ def load_model(
else:
model, lora_config = load_adapter(model, cfg, cfg.adapter)
if is_deepspeed_zero3_enabled():
skip_move_to_device = True
if (
cfg.ddp
and not load_in_8bit
@@ -918,15 +863,6 @@ def load_model(
integrate_lora_patch(model, cfg)
if cfg.unsloth_rope:
from axolotl.monkeypatch.unsloth_ import integrate_rope_embeddings
integrate_rope_embeddings()
for _ in range(3):
gc.collect()
torch.cuda.empty_cache()
# TODO resume_from_checkpoint handling
return model, lora_config
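The quantization handling in load_model above picks a transformers quantization config class based on quant_method. A hedged sketch of that dispatch, covering only GPTQ, AWQ and bitsandbytes as in the block above:
from transformers import AwqConfig, BitsAndBytesConfig, GPTQConfig

def build_quantization_config(quantization_config: dict):
    quant_method = quantization_config.get("quant_method")
    if quant_method == "gptq":
        return GPTQConfig(**quantization_config)
    if quant_method == "awq":
        return AwqConfig(**quantization_config)
    if quant_method == "bitsandbytes":
        return BitsAndBytesConfig(**quantization_config)
    return None  # anything else falls through untouched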
@@ -1024,7 +960,7 @@ def load_lora(model, cfg, inference=False, config_only=False):
if cfg.lora_target_linear:
linear_names = find_all_linear_names(model)
LOG.info(f"found linear modules: {repr(sorted(linear_names))}")
LOG.info(f"found linear modules: {repr(linear_names)}")
lora_target_modules = list(set(lora_target_modules + linear_names))
lora_config_kwargs = {}

View File

@@ -62,7 +62,7 @@ def process_tokens_for_rl_debug(tokens, color, tokenizer, text_only):
"""Helper function to process and color tokens."""
colored_tokens = [
color_token_for_rl_debug(tokenizer.decode(token), token, color, text_only)
for token in tokenizer.encode(tokens, add_special_tokens=False)
for token in tokenizer.encode(tokens)
]
return colored_tokens

View File

@@ -1,5 +1,4 @@
"""Module containing the Trainer class and related functions"""
import json
import math
import os
import random
@@ -16,7 +15,7 @@ from torch.utils.data import DataLoader, RandomSampler
from transformers.utils import is_torch_bf16_gpu_available
from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder
from axolotl.utils.distributed import reduce_and_broadcast
from axolotl.utils.distributed import is_main_process, reduce_and_broadcast, zero_first
from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
LOG = get_logger("axolotl")
@@ -183,88 +182,90 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
sequence_len=cfg.sequence_len,
min_sequence_len=cfg.min_sample_len or 2,
)
with zero_first(is_main_process()):
if cfg.is_preprocess:
min_input_len = np.min(get_dataset_lengths(train_dataset))
LOG.debug(f"min_input_len: {min_input_len}", main_process_only=True)
max_input_len = np.max(get_dataset_lengths(train_dataset))
LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
if cfg.is_preprocess:
min_input_len = np.min(get_dataset_lengths(train_dataset))
LOG.debug(f"min_input_len: {min_input_len}", main_process_only=True)
max_input_len = np.max(get_dataset_lengths(train_dataset))
LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
if (
cfg.is_mistral_derived_model and cfg.flash_attention
) or cfg.model_config_type == "mamba":
LOG.info("dropping attention_mask column")
train_dataset = train_dataset.remove_columns("attention_mask")
if eval_dataset:
eval_dataset = eval_dataset.remove_columns("attention_mask")
if cfg.model_config_type == "mamba":
LOG.info("dropping attention_mask column")
train_dataset = train_dataset.remove_columns("attention_mask")
if eval_dataset:
eval_dataset = eval_dataset.remove_columns("attention_mask")
if cfg.model_config_type == "falcon":
LOG.info("dropping token_type_ids column if it exists")
if "token_type_ids" in train_dataset.column_names:
train_dataset = train_dataset.remove_columns("token_type_ids")
if eval_dataset and "token_type_ids" in eval_dataset.column_names:
eval_dataset = eval_dataset.remove_columns("token_type_ids")
if cfg.model_config_type == "falcon":
LOG.info("dropping token_type_ids column if it exists")
if "token_type_ids" in train_dataset.column_names:
train_dataset = train_dataset.remove_columns("token_type_ids")
if eval_dataset and "token_type_ids" in eval_dataset.column_names:
eval_dataset = eval_dataset.remove_columns("token_type_ids")
train_dataset = train_dataset.filter(
drop_long,
num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess,
desc="Dropping Long Sequences",
)
if eval_dataset:
eval_dataset = eval_dataset.filter(
train_dataset = train_dataset.filter(
drop_long,
num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess,
desc="Dropping Long Sequences",
)
if eval_dataset:
eval_dataset = eval_dataset.filter(
drop_long,
num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess,
desc="Dropping Long Sequences",
)
if cfg.group_by_length:
train_dataset = train_dataset.map(
add_length,
num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess,
desc="Group By Length",
)
if cfg.group_by_length:
train_dataset = train_dataset.map(
add_length,
num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess,
desc="Group By Length",
)
if cfg.use_pose:
pose_kwargs = {}
if cfg.pose_num_chunks is not None:
pose_kwargs["chunks"] = cfg.pose_num_chunks
pose_fn = partial(
add_pose_position_ids,
max_context_len=cfg.pose_max_context_len,
split_on_token_ids=cfg.pose_split_on_token_ids,
**pose_kwargs,
)
train_dataset = train_dataset.map(
pose_fn,
num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess,
desc="Add position_id column (PoSE)",
)
train_dataset = train_dataset.sort("sequence_len")
if cfg.eval_sample_packing is not False:
if eval_dataset:
eval_dataset = eval_dataset.map(
pose_fn,
num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess,
desc="Add position_id column (PoSE)",
)
elif cfg.sample_packing:
train_dataset = train_dataset.map(
add_position_ids,
num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess,
desc="Add position_id column (Sample Packing)",
)
if cfg.eval_sample_packing is not False:
if eval_dataset:
eval_dataset = eval_dataset.map(
add_position_ids,
num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess,
desc="Add position_id column (Sample Packing)",
)
if cfg.use_pose:
pose_kwargs = {}
if cfg.pose_num_chunks is not None:
pose_kwargs["chunks"] = cfg.pose_num_chunks
pose_fn = partial(
add_pose_position_ids,
max_context_len=cfg.pose_max_context_len,
split_on_token_ids=cfg.pose_split_on_token_ids,
**pose_kwargs,
)
train_dataset = train_dataset.map(
pose_fn,
num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess,
desc="Add position_id column (PoSE)",
)
train_dataset = train_dataset.sort("sequence_len")
if cfg.eval_sample_packing is not False:
if eval_dataset:
eval_dataset = eval_dataset.map(
pose_fn,
num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess,
desc="Add position_id column (PoSE)",
)
elif cfg.sample_packing:
train_dataset = train_dataset.map(
add_position_ids,
num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess,
desc="Add position_id column (Sample Packing)",
)
if cfg.eval_sample_packing is not False:
if eval_dataset:
eval_dataset = eval_dataset.map(
add_position_ids,
num_proc=cfg.dataset_processes,
load_from_cache_file=not cfg.is_preprocess,
desc="Add position_id column (Sample Packing)",
)
return train_dataset, eval_dataset
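process_datasets_for_packing above filters out over-length samples with datasets.filter before any packing columns are added. A toy, in-memory sketch of that filter using the same 2-to-sequence_len bounds:
from datasets import Dataset

sequence_len = 2048
ds = Dataset.from_dict({"input_ids": [[1, 2, 3], list(range(5000)), [4, 5]]})

ds = ds.filter(
    lambda sample: 2 <= len(sample["input_ids"]) <= sequence_len,
    desc="Dropping Long Sequences",
)
print(len(ds))  # 2 - the 5000-token sample was dropped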
@@ -390,15 +391,6 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
return total_num_steps
def setup_deepspeed_env(cfg, stage=None):
os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
os.environ["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = cfg.deepspeed
if stage:
os.environ["ACCELERATE_DEEPSPEED_ZERO_STAGE"] = str(stage)
if stage == 3:
os.environ["ACCELERATE_DEEPSPEED_ZERO3_INIT"] = "true"
def setup_fsdp_envs(cfg):
os.environ["ACCELERATE_USE_FSDP"] = "true"
if cfg.fsdp_config.fsdp_activation_checkpointing:
@@ -425,14 +417,8 @@ def prepare_optim_env(cfg):
if cfg.fsdp:
setup_fsdp_envs(cfg)
elif cfg.deepspeed:
stage = None
# check if the cfg.deepspeed is a file
if os.path.isfile(cfg.deepspeed):
# parse with json
with open(cfg.deepspeed, "r", encoding="utf-8") as fin:
deepspeed_config = json.load(fin)
stage = deepspeed_config.get("zero_optimization", {}).get("stage", None)
setup_deepspeed_env(cfg, stage=stage)
os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
os.environ["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = cfg.deepspeed
if (cfg.bf16 == "auto" and is_torch_bf16_gpu_available()) or cfg.bf16 is True:
os.environ["ACCELERATE_MIXED_PRECISION"] = "bf16"
@@ -440,14 +426,8 @@ def prepare_optim_env(cfg):
os.environ["ACCELERATE_MIXED_PRECISION"] = "fp16"
def prepare_opinionated_env(cfg):
if cfg.qlora_sharded_model_loading:
# model loading is forked after the tokenizer
os.environ["TOKENIZERS_PARALLELISM"] = "false"
def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
if cfg.rl in ["dpo", "ipo", "orpo", "kto", "simpo"]:
if cfg.rl in ["dpo", "ipo", "orpo", "kto"]:
trainer_builder = HFRLTrainerBuilder(cfg, model[0], tokenizer)
trainer_builder.model_ref = model[1]
trainer_builder.peft_config = model[2]
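setup_deepspeed_env and prepare_optim_env above parse the DeepSpeed JSON to discover the ZeRO stage before exporting the accelerate environment variables. A hedged sketch of that parsing; the config path is illustrative:
import json
import os

def deepspeed_zero_stage(path: str):
    with open(path, "r", encoding="utf-8") as fin:
        return json.load(fin).get("zero_optimization", {}).get("stage")

stage = deepspeed_zero_stage("deepspeed_configs/zero3.json")  # example path
os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
os.environ["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = "deepspeed_configs/zero3.json"
if stage is not None:
    os.environ["ACCELERATE_DEEPSPEED_ZERO_STAGE"] = str(stage)
    if stage == 3:
        os.environ["ACCELERATE_DEEPSPEED_ZERO3_INIT"] = "true"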

View File

@@ -1,341 +0,0 @@
"""
E2E tests for multigpu lora tinyllama
"""
import logging
import os
import unittest
from pathlib import Path
import pytest
import yaml
from accelerate.test_utils import execute_subprocess_async
from axolotl.utils.dict import DictDefault
from ..utils import with_temp_dir
LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
os.environ["WANDB_DISABLED"] = "true"
class TestMultiGPULlama(unittest.TestCase):
"""
Test case for Llama models using LoRA
"""
@with_temp_dir
def test_lora_ddp(self, temp_dir):
# pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "TinyLlama/TinyLlama_v1.1",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 2048,
"adapter": "lora",
"lora_r": 8,
"lora_alpha": 16,
"lora_dropout": 0.05,
"lora_target_linear": True,
"val_set_size": 0.05,
"special_tokens": {
"unk_token": "<unk>",
"bos_token": "<s>",
"eos_token": "</s>",
},
"datasets": [
{
"path": "tatsu-lab/alpaca",
"type": "alpaca",
},
],
"num_epochs": 1,
"max_steps": 100,
"micro_batch_size": 4,
"gradient_accumulation_steps": 4,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_8bit",
"lr_scheduler": "cosine",
"flash_attention": True,
}
)
# write cfg to yaml file
Path(temp_dir).mkdir(parents=True, exist_ok=True)
with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
execute_subprocess_async(
[
"accelerate",
"launch",
"--num-processes",
"2",
"-m",
"axolotl.cli.train",
str(Path(temp_dir) / "config.yaml"),
]
)
@with_temp_dir
def test_lora_ddp_packed(self, temp_dir):
# pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "TinyLlama/TinyLlama_v1.1",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 2048,
"sample_packing": True,
"eval_sample_packing": False,
"pad_to_sequence_len": True,
"adapter": "lora",
"lora_r": 8,
"lora_alpha": 16,
"lora_dropout": 0.05,
"lora_target_linear": True,
"val_set_size": 0.05,
"special_tokens": {
"unk_token": "<unk>",
"bos_token": "<s>",
"eos_token": "</s>",
},
"datasets": [
{
"path": "tatsu-lab/alpaca",
"type": "alpaca",
},
],
"num_epochs": 1,
"max_steps": 50,
"micro_batch_size": 4,
"gradient_accumulation_steps": 4,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_8bit",
"lr_scheduler": "cosine",
"flash_attention": True,
}
)
# write cfg to yaml file
Path(temp_dir).mkdir(parents=True, exist_ok=True)
with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
execute_subprocess_async(
[
"accelerate",
"launch",
"--num-processes",
"2",
"-m",
"axolotl.cli.train",
str(Path(temp_dir) / "config.yaml"),
]
)
@with_temp_dir
def test_fsdp(self, temp_dir):
# pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "TinyLlama/TinyLlama_v1.1",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 2048,
"val_set_size": 0.05,
"special_tokens": {
"unk_token": "<unk>",
"bos_token": "<s>",
"eos_token": "</s>",
},
"datasets": [
{
"path": "tatsu-lab/alpaca",
"type": "alpaca",
},
],
"num_epochs": 1,
"max_steps": 100,
"micro_batch_size": 4,
"gradient_accumulation_steps": 4,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_torch",
"lr_scheduler": "cosine",
"flash_attention": True,
"fsdp": [
"full_shard",
"auto_wrap",
],
"fsdp_config": {
"fsdp_limit_all_gathers": True,
"fsdp_offload_params": False,
"fsdp_sync_module_states": True,
"fsdp_use_orig_params": False,
"fsdp_cpu_ram_efficient_loading": False,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
},
}
)
# write cfg to yaml file
Path(temp_dir).mkdir(parents=True, exist_ok=True)
with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
execute_subprocess_async(
[
"accelerate",
"launch",
"--num-processes",
"2",
"-m",
"axolotl.cli.train",
str(Path(temp_dir) / "config.yaml"),
]
)
@with_temp_dir
def test_fsdp_packed(self, temp_dir):
# pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "TinyLlama/TinyLlama_v1.1",
"tokenizer_type": "LlamaTokenizer",
"sample_packing": True,
"eval_sample_packing": False,
"pad_to_sequence_len": True,
"sequence_len": 2048,
"val_set_size": 0.05,
"special_tokens": {
"unk_token": "<unk>",
"bos_token": "<s>",
"eos_token": "</s>",
},
"datasets": [
{
"path": "tatsu-lab/alpaca",
"type": "alpaca",
},
],
"num_epochs": 1,
"max_steps": 100,
"micro_batch_size": 4,
"gradient_accumulation_steps": 4,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_torch",
"lr_scheduler": "cosine",
"flash_attention": True,
"fsdp": [
"full_shard",
"auto_wrap",
],
"fsdp_config": {
"fsdp_limit_all_gathers": True,
"fsdp_offload_params": False,
"fsdp_sync_module_states": True,
"fsdp_use_orig_params": False,
"fsdp_cpu_ram_efficient_loading": False,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
},
}
)
# write cfg to yaml file
Path(temp_dir).mkdir(parents=True, exist_ok=True)
with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
execute_subprocess_async(
[
"accelerate",
"launch",
"--num-processes",
"2",
"-m",
"axolotl.cli.train",
str(Path(temp_dir) / "config.yaml"),
]
)
@pytest.mark.skip("disabled due to upstream issue")
@with_temp_dir
def test_fsdp_qlora_prequant_packed(self, temp_dir):
# pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "axolotl-ai-co/TinyLlama_v1.1-bnb-nf4-bf16",
"tokenizer_type": "AutoTokenizer",
"adapter": "qlora",
"load_in_4bit": True,
"lora_r": 8,
"lora_alpha": 16,
"lora_dropout": 0.05,
"lora_target_linear": True,
"lora_modules_to_save": [
"embed_tokens",
"lm_head",
],
"sample_packing": True,
"eval_sample_packing": False,
"pad_to_sequence_len": True,
"sequence_len": 2048,
"val_set_size": 0.05,
"special_tokens": {
"pad_token": "<|end_of_text|>",
},
"datasets": [
{
"path": "tatsu-lab/alpaca",
"type": "alpaca",
"split": "train[:25%]",
},
],
"num_epochs": 1,
"max_steps": 100,
"micro_batch_size": 4,
"gradient_accumulation_steps": 4,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_torch",
"lr_scheduler": "cosine",
"flash_attention": True,
"fsdp": [
"full_shard",
"auto_wrap",
],
"fsdp_config": {
"fsdp_limit_all_gathers": True,
"fsdp_offload_params": False,
"fsdp_sync_module_states": True,
"fsdp_use_orig_params": False,
"fsdp_cpu_ram_efficient_loading": True,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
},
}
)
# write cfg to yaml file
Path(temp_dir).mkdir(parents=True, exist_ok=True)
with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
execute_subprocess_async(
[
"accelerate",
"launch",
"--num-processes",
"2",
"-m",
"axolotl.cli.train",
str(Path(temp_dir) / "config.yaml"),
]
)

View File

@@ -4,8 +4,6 @@ E2E smoke tests to check that the monkeypatches are in place for certain configu
import unittest
import transformers
from axolotl.common.cli import TrainerCliArgs
from axolotl.utils.config import normalize_config
from axolotl.utils.dict import DictDefault
@@ -89,9 +87,9 @@ class TestModelPatches(unittest.TestCase):
normalize_config(cfg)
cli_args = TrainerCliArgs()
tokenizer = load_tokenizer(cfg)
load_model(cfg, tokenizer, inference=cli_args.inference)
model, _ = load_model(cfg, tokenizer, inference=cli_args.inference)
assert (
"torch.jit"
in transformers.modeling_flash_attention_utils._get_unpad_data.__module__ # pylint: disable=protected-access
"axolotl.monkeypatch.mistral_attn_hijack_flash"
in model.model.layers[0].self_attn.forward.__module__
)

View File

@@ -1,20 +0,0 @@
"""
test module to import various submodules that have historically broken due to dependency issues
"""
import unittest
class TestImports(unittest.TestCase):
"""
Test class to import various submodules that have historically broken due to dependency issues
"""
def test_import_causal_trainer(self):
from axolotl.core.trainer_builder import ( # pylint: disable=unused-import # noqa: F401
HFCausalTrainerBuilder,
)
def test_import_rl_trainer(self):
from axolotl.core.trainer_builder import ( # pylint: disable=unused-import # noqa: F401
HFRLTrainerBuilder,
)

View File

@@ -1,67 +0,0 @@
"""
E2E tests for llama pretrain
"""
import logging
import os
import unittest
from pathlib import Path
from axolotl.cli import load_datasets
from axolotl.common.cli import TrainerCliArgs
from axolotl.train import train
from axolotl.utils.config import normalize_config
from axolotl.utils.dict import DictDefault
from .utils import with_temp_dir
LOG = logging.getLogger("axolotl.tests.e2e")
os.environ["WANDB_DISABLED"] = "true"
class TestPretrainLlama(unittest.TestCase):
"""
Test case for Llama models with pretraining
"""
@with_temp_dir
def test_pretrain_w_sample_packing(self, temp_dir):
# pylint: disable=duplicate-code
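# short 5-step pretraining run over C4 with sample packing; asserts a safetensors checkpoint is written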
cfg = DictDefault(
{
"base_model": "JackFram/llama-68m",
"tokenizer_type": "LlamaTokenizer",
"flash_attention": True,
"sequence_len": 1024,
"sample_packing": True,
"special_tokens": {
"unk_token": "<unk>",
"bos_token": "<s>",
"eos_token": "</s>",
},
"pretraining_dataset": [
{
"path": "allenai/c4",
"name": "en",
"type": "pretrain",
}
],
"max_steps": 5,
"num_epochs": 1,
"micro_batch_size": 1,
"gradient_accumulation_steps": 1,
"val_set_size": 0.0,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_torch",
"lr_scheduler": "cosine",
"save_safetensors": True,
"bf16": "auto",
}
)
normalize_config(cfg)
cli_args = TrainerCliArgs()
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
assert (Path(temp_dir) / "model.safetensors").exists()

File diff suppressed because it is too large

View File

@@ -1,156 +0,0 @@
"""
tests for chat_template prompt strategy
"""
import unittest
import pytest
from datasets import Dataset
from transformers import AutoTokenizer
from axolotl.prompt_strategies.dpo.chat_template import default
from axolotl.utils.dict import DictDefault
@pytest.fixture(name="assistant_dataset")
def fixture_assistant_dataset():
# pylint: disable=duplicate-code
return Dataset.from_list(
[
{
"messages": [
{
"role": "user",
"content": "hello",
},
{
"role": "assistant",
"content": "hello",
},
{
"role": "user",
"content": "goodbye",
},
],
"chosen": {
"role": "assistant",
"content": "goodbye",
},
"rejected": {
"role": "assistant",
"content": "party on",
},
}
]
)
@pytest.fixture(name="custom_assistant_dataset")
def fixture_custom_assistant_dataset():
# pylint: disable=duplicate-code
return Dataset.from_list(
[
{
"conversation": [
{
"speaker": "human",
"text": "hello",
},
{
"speaker": "agent",
"text": "hello",
},
{
"speaker": "human",
"text": "goodbye",
},
],
"better": {
"speaker": "agent",
"text": "goodbye",
},
"worse": {
"speaker": "agent",
"text": "party on",
},
}
]
)
@pytest.fixture(name="llama3_tokenizer")
def fixture_llama3_tokenizer():
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B")
tokenizer.eos_token = "<|eot_id|>"
return tokenizer
class TestAssistantDPOChatTemplateLlama3:
"""
Test class for assistant style datasets with llama-3 prompts using the chat_template strategy.
"""
def test_llama3_defaults(self, llama3_tokenizer, assistant_dataset):
# pylint: disable=duplicate-code
transform_fn = default(
DictDefault(
{
"chat_template": "llama3",
"datasets": [
{
"chat_template": "llama3",
}
],
}
)
)
result = transform_fn(assistant_dataset[0], tokenizer=llama3_tokenizer)
assert result["prompt"] == (
"<|begin_of_text|>"
+ "<|start_header_id|>user<|end_header_id|>\n\nhello<|eot_id|>"
+ "<|start_header_id|>assistant<|end_header_id|>\n\nhello<|eot_id|>"
+ "<|start_header_id|>user<|end_header_id|>\n\ngoodbye<|eot_id|>"
+ "<|start_header_id|>assistant<|end_header_id|>\n\n"
)
assert result["chosen"] == "goodbye<|eot_id|>"
assert result["rejected"] == "party on<|eot_id|>"
def test_llama3_configured(self, llama3_tokenizer, custom_assistant_dataset):
# pylint: disable=duplicate-code
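# maps custom field names (conversation/better/worse, speaker/text, human/agent roles)
# onto the DPO chat_template defaults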
transform_fn = default(
DictDefault(
{
"chat_template": "llama3",
"datasets": [
{
"chat_template": "llama3",
"field_messages": "conversation",
"field_chosen": "better",
"field_rejected": "worse",
"message_field_role": "speaker",
"message_field_content": "text",
"roles": {
"user": ["human"],
"assistant": ["agent"],
"system": ["sys"],
},
}
],
}
)
)
result = transform_fn(custom_assistant_dataset[0], tokenizer=llama3_tokenizer)
assert result["prompt"] == (
"<|begin_of_text|>"
+ "<|start_header_id|>user<|end_header_id|>\n\nhello<|eot_id|>"
+ "<|start_header_id|>assistant<|end_header_id|>\n\nhello<|eot_id|>"
+ "<|start_header_id|>user<|end_header_id|>\n\ngoodbye<|eot_id|>"
+ "<|start_header_id|>assistant<|end_header_id|>\n\n"
)
assert result["chosen"] == "goodbye<|eot_id|>"
assert result["rejected"] == "party on<|eot_id|>"
if __name__ == "__main__":
unittest.main()

View File

@@ -192,7 +192,6 @@ class TestSharegptLlama3:
input_ids = dataset_wrapper[0]["input_ids"]
# fmt: off
# pylint: disable=duplicate-code
assert input_ids == [
128000, # bos
128006, 9125, 128007, # system header
@@ -229,7 +228,6 @@ class TestSharegptLlama3:
input_ids = dataset_wrapper[0]["input_ids"]
# fmt: off
# pylint: disable=duplicate-code
assert input_ids == [
128000, # bos
128006, 9125, 128007, # system header

View File

@@ -24,7 +24,7 @@ class TestPretrainingPacking(unittest.TestCase):
def test_packing_stream_dataset(self):
# pylint: disable=duplicate-code
dataset = load_dataset(
"allenai/c4",
"c4",
"en",
streaming=True,
)["train"]
@@ -33,7 +33,7 @@ class TestPretrainingPacking(unittest.TestCase):
{
"pretraining_dataset": [
{
"path": "allenai/c4",
"path": "c4",
"name": "en",
"type": "pretrain",
}